Before running the code, make sure that this code, training data and test data are all in the same directory and the test data is named as "test.csv". This code takes around 10-15 minutes to run(excluding the time taken to download any library).

In [45]:
import nltk
# nltk.download('all')

In [46]:
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from gensim.models import Word2Vec
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from collections import defaultdict
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer
import joblib
from sklearn import model_selection, naive_bayes, svm
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping



In [47]:
np.random.seed(500)

In [48]:
data = pd.read_csv('train.csv', encoding='latin-1')

In [49]:
data

Unnamed: 0,category,text
0,tech,tv future in the hands of viewers with home th...
1,business,worldcom boss left books alone former worldc...
2,sport,tigers wary of farrell gamble leicester say ...
3,sport,yeading face newcastle in fa cup premiership s...
4,entertainment,ocean s twelve raids box office ocean s twelve...
...,...,...
2220,business,cars pull down us retail figures us retail sal...
2221,politics,kilroy unveils immigration policy ex-chatshow ...
2222,entertainment,rem announce new glasgow concert us band rem h...
2223,politics,how political squabbles snowball it s become c...


First, we will preprocess the data i.e remove those rows which contain empty entries, convert the word in text to lowercase.

In [50]:
data['text'].dropna(inplace=True)

In [51]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2225 entries, 0 to 2224
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   category  2225 non-null   object
 1   text      2225 non-null   object
dtypes: object(2)
memory usage: 34.9+ KB


In [52]:
data['text'] = [entry.lower() for entry in data['text']]

Now for categorical labels, we use label encoding to convert the labels into numbers.

In [53]:
Encoder = LabelEncoder()
data['category'] = Encoder.fit_transform(data['category'])
data1 = data.copy()

In [54]:
category_counts = data['category'].value_counts()
print(category_counts)

#Since each class is in almost same amount, there is no need for balancing.

category
3    511
0    510
2    417
4    401
1    386
Name: count, dtype: int64


Now, we tokenize our dataset into words.

In [55]:
data['text'] = [word_tokenize(entry) for entry in data['text']]

In [56]:
data.head

<bound method NDFrame.head of       category                                               text
0            4  [tv, future, in, the, hands, of, viewers, with...
1            0  [worldcom, boss, left, books, alone, former, w...
2            3  [tigers, wary, of, farrell, gamble, leicester,...
3            3  [yeading, face, newcastle, in, fa, cup, premie...
4            1  [ocean, s, twelve, raids, box, office, ocean, ...
...        ...                                                ...
2220         0  [cars, pull, down, us, retail, figures, us, re...
2221         2  [kilroy, unveils, immigration, policy, ex-chat...
2222         1  [rem, announce, new, glasgow, concert, us, ban...
2223         2  [how, political, squabbles, snowball, it, s, b...
2224         3  [souness, delight, at, euro, progress, boss, g...

[2225 rows x 2 columns]>

Next step is lemmatizing so as to reduce the words into their root form and in the mean process, we also remove stop-words and numbers in the for loop.

In [57]:
tag_map = defaultdict(lambda : wn.NOUN)
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV

In [58]:
print(tag_map)
word_Lemmatized = WordNetLemmatizer()
data

defaultdict(<function <lambda> at 0x000001544F9696C0>, {'J': 'a', 'V': 'v', 'R': 'r'})


Unnamed: 0,category,text
0,4,"[tv, future, in, the, hands, of, viewers, with..."
1,0,"[worldcom, boss, left, books, alone, former, w..."
2,3,"[tigers, wary, of, farrell, gamble, leicester,..."
3,3,"[yeading, face, newcastle, in, fa, cup, premie..."
4,1,"[ocean, s, twelve, raids, box, office, ocean, ..."
...,...,...
2220,0,"[cars, pull, down, us, retail, figures, us, re..."
2221,2,"[kilroy, unveils, immigration, policy, ex-chat..."
2222,1,"[rem, announce, new, glasgow, concert, us, ban..."
2223,2,"[how, political, squabbles, snowball, it, s, b..."


In [59]:
for index, entry in enumerate(data['text']):
    Final_words = []
    for word, tag in pos_tag(entry):
        if word not in stopwords.words('english') and word.isalpha():
            word_Final = word_Lemmatized.lemmatize(word, pos = tag_map[tag[0]])
            Final_words.append(word_Final)
    data.loc[index,'text_final'] = str(Final_words)

In [60]:
data

Unnamed: 0,category,text,text_final
0,4,"[tv, future, in, the, hands, of, viewers, with...","['tv', 'future', 'hand', 'viewer', 'home', 'th..."
1,0,"[worldcom, boss, left, books, alone, former, w...","['worldcom', 'bos', 'leave', 'book', 'alone', ..."
2,3,"[tigers, wary, of, farrell, gamble, leicester,...","['tiger', 'wary', 'farrell', 'gamble', 'leices..."
3,3,"[yeading, face, newcastle, in, fa, cup, premie...","['yeading', 'face', 'newcastle', 'fa', 'cup', ..."
4,1,"[ocean, s, twelve, raids, box, office, ocean, ...","['ocean', 'twelve', 'raid', 'box', 'office', '..."
...,...,...,...
2220,0,"[cars, pull, down, us, retail, figures, us, re...","['car', 'pull', 'u', 'retail', 'figure', 'u', ..."
2221,2,"[kilroy, unveils, immigration, policy, ex-chat...","['kilroy', 'unveils', 'immigration', 'policy',..."
2222,1,"[rem, announce, new, glasgow, concert, us, ban...","['rem', 'announce', 'new', 'glasgow', 'concert..."
2223,2,"[how, political, squabbles, snowball, it, s, b...","['political', 'squabble', 'snowball', 'become'..."


In [61]:
train_x, test_x, train_y, test_y = model_selection.train_test_split(data['text_final'], data['category'], test_size = 0.3, random_state=75)
#Splitting the dataset into training and validation (testing) set

Now to extract the features, I will use 2 methods and compare their accuracies:
1) Tf-Idf - Term Freqeuncy-Inverse Document Frequency
2) word2vec

In [62]:
# Train Word2Vec model
word2vec_model = Word2Vec(sentences=data['text'], vector_size=100, window=5, min_count=1, workers=4)

# Function to average word vectors for each document
def average_word_vectors(words, model, vocabulary, num_features):
    feature_vector = np.zeros((num_features,), dtype="float64")
    n_words = 0
    for word in words:
        if word in vocabulary:
            n_words += 1
            feature_vector = np.add(feature_vector, model.wv[word])
    if n_words:
        feature_vector = np.divide(feature_vector, n_words)
    return feature_vector

# Get Word2Vec vocabulary
w2v_vocab = set(word2vec_model.wv.index_to_key)

# Convert train and test sets to averaged Word2Vec vectors
train_x_w2v = [average_word_vectors(words, word2vec_model, w2v_vocab, 100) for words in train_x]
test_x_w2v = [average_word_vectors(words, word2vec_model, w2v_vocab, 100) for words in test_x]

In [63]:
Tfidf_vect = TfidfVectorizer(max_features=10000)
Tfidf_vect.fit(data['text_final'])

In [64]:
train_x_Tfidf = Tfidf_vect.transform(train_x)
test_x_Tfidf = Tfidf_vect.transform(test_x)

After extracting the useful tokens, now we implement various models for the purpose of training the model for multiclass text classification and note their accuracies.
Various architectures which we will see are:
1) Decision Tree
2) Logistic Regression
3) Naive Bayes
4) Linear SVM
5) Deep Learning techniques with LSTMs

In [65]:
# Create and train the Decision Tree classifier
clf = DecisionTreeClassifier()
clf.fit(train_x_Tfidf, train_y)

# Predict labels on the test set
pred_DT = clf.predict(test_x_Tfidf)

# Calculate accuracy
accuracy_DT = accuracy_score(test_y, pred_DT)
print("Decision Tree Accuracy Score -> ", accuracy_DT * 100)

Decision Tree Accuracy Score ->  83.53293413173652


In [81]:
X = data1['text']
y = data1['category']

# Tokenization and sequence padding
tokenizer = Tokenizer(num_words=10000)  # Assuming a vocabulary size of 10,000
tokenizer.fit_on_texts(X)
sequences = tokenizer.texts_to_sequences(X)
X = pad_sequences(sequences, maxlen=100)  # Assuming a maximum sequence length of 100

# Train-test split
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.3, random_state=75)

# Model architecture
model = Sequential()
model.add(Embedding(10000, 128, input_length=100))
model.add(LSTM(128))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(len(Encoder.classes_), activation='softmax'))

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Define early stopping to prevent overfitting
early_stopping = EarlyStopping(monitor='val_loss', patience=3)

# Train the model
history = model.fit(X_train, y_train, epochs=8, batch_size=32, validation_split=0.2, callbacks=[early_stopping])

# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy with DNN: {accuracy}")

Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8
Test Accuracy with DNN: 0.8772454857826233


In [67]:
NB = naive_bayes.MultinomialNB()
NB.fit(train_x_Tfidf,train_y)

# predict the labels on validation dataset
pred_NB = NB.predict(test_x_Tfidf)

# Use accuracy_score function to get the accuracy
print("Naive Bayes Accuracy Score -> ",accuracy_score(pred_NB, test_y)*100)

Naive Bayes Accuracy Score ->  96.55688622754491


In [68]:
# Logistic Regression Model
logistic_reg = LogisticRegression(max_iter=1000)  # Initialize Logistic Regression
logistic_reg.fit(train_x_Tfidf, train_y)  # Train the model

# Predictions
y_pred_tfidf = logistic_reg.predict(test_x_Tfidf)

# Calculate Accuracy
accuracy = accuracy_score(test_y, y_pred_tfidf)
print(f"Logistic Regression Accuracy (Tf-Idf)-> {accuracy * 100}")

# Initialize and train Logistic Regression using Word2Vec representations
logistic_reg_w2v = LogisticRegression(max_iter=1000)
logistic_reg_w2v.fit(train_x_w2v, train_y)

# Predict on the test set using Word2Vec representations
pred_logistic_w2v = logistic_reg_w2v.predict(test_x_w2v)

# Calculate accuracy using accuracy_score
accuracy_logistic_w2v = accuracy_score(pred_logistic_w2v, test_y)
print("Logistic Regression Accuracy Score (Word2Vec) -> ", accuracy_logistic_w2v * 100)

Logistic Regression Accuracy (Tf-Idf)-> 97.75449101796407
Logistic Regression Accuracy Score (Word2Vec) ->  35.778443113772454


In [69]:
accuracy_tfidf = accuracy_score(test_y, y_pred_tfidf)
precision_tfidf = precision_score(test_y, y_pred_tfidf, average='weighted')
recall_tfidf = recall_score(test_y, y_pred_tfidf, average='weighted')
f1_tfidf = f1_score(test_y, y_pred_tfidf, average='weighted')

print("Logistic Regression Metrics (Tf-Idf):")
print(f"Accuracy: {accuracy_tfidf * 100:}%")
print(f"Precision: {precision_tfidf:}")
print(f"Recall: {recall_tfidf:}")
print(f"F1-score: {f1_tfidf:}")

accuracy_word2vec = accuracy_score(test_y, pred_logistic_w2v)
precision_word2vec = precision_score(test_y, pred_logistic_w2v, average='weighted')
recall_word2vec = recall_score(test_y, pred_logistic_w2v, average='weighted')
f1_word2vec = f1_score(test_y, pred_logistic_w2v, average='weighted')

print("\nLogistic Regression Metrics (Word2Vec):")
print(f"Accuracy: {accuracy_word2vec * 100:}%")
print(f"Precision: {precision_word2vec:}")
print(f"Recall: {recall_word2vec:}")
print(f"F1-score: {f1_word2vec:}")

Logistic Regression Metrics (Tf-Idf):
Accuracy: 97.75449101796407%
Precision: 0.9778055994182709
Recall: 0.9775449101796407
F1-score: 0.9775462856918494

Logistic Regression Metrics (Word2Vec):
Accuracy: 35.778443113772454%
Precision: 0.36702628047354613
Recall: 0.35778443113772457
F1-score: 0.3367197035533312


In [70]:
SVM = svm.SVC(C = 1, kernel = 'linear', degree = 3, gamma = 'auto')
SVM.fit(train_x_Tfidf, train_y)
pred_SVM = SVM.predict(test_x_Tfidf)

# Save the SVM model with (tf-idf feature engineering) to a file
joblib.dump(SVM, 'final_model.pkl')


# Initialize and train SVM using Word2Vec representations
svm_classifier = svm.SVC(C = 1, kernel = 'linear', degree = 3, gamma = 'auto')  # You can choose different kernels like 'rbf' or 'poly'
svm_classifier.fit(train_x_w2v, train_y)

# Predict on the test set using Word2Vec representations
pred_svm = svm_classifier.predict(test_x_w2v)


In [71]:
# Evaluate SVM model trained on Tf-Idf representations
accuracy_tfidf = accuracy_score(pred_SVM, test_y)
precision_tfidf = precision_score(pred_SVM, test_y, average='weighted')
recall_tfidf = recall_score(pred_SVM, test_y, average='weighted')
f1_tfidf = f1_score(pred_SVM, test_y, average='weighted')

print("SVM Metrics (Tf-Idf):")
print(f"Accuracy: {accuracy_tfidf * 100:}%")
print(f"Precision: {precision_tfidf:}")
print(f"Recall: {recall_tfidf:}")
print(f"F1-score: {f1_tfidf:}")

# Evaluate SVM model trained on Word2Vec representations
accuracy_word2vec = accuracy_score(pred_svm, test_y)
precision_word2vec = precision_score(pred_svm, test_y, average='weighted')
recall_word2vec = recall_score(pred_svm, test_y, average='weighted')
f1_word2vec = f1_score(pred_svm, test_y, average='weighted')

print("\nSVM Metrics (Word2Vec):")
print(f"Accuracy: {accuracy_word2vec * 100:}%")
print(f"Precision: {precision_word2vec:}")
print(f"Recall: {recall_word2vec:}")
print(f"F1-score: {f1_word2vec:}")

SVM Metrics (Tf-Idf):
Accuracy: 98.80239520958084%
Precision: 0.9881159949253597
Recall: 0.9880239520958084
F1-score: 0.9880320996705151

SVM Metrics (Word2Vec):
Accuracy: 31.58682634730539%
Precision: 0.6575845930235742
Recall: 0.3158682634730539
F1-score: 0.4073679805725062


  _warn_prf(average, modifier, msg_start, len(result))


So, out of all the architectures tried above, Linear SVM performs the best. Also, we can see that Tf-Idf performs much better than word2vec. Hence, our final model will be Linear SVM with Tf-Idf feature extraction method.

Now, we will evaluate the final model trained on train data on the test data. Just place the test data in the same directory as that of code and train data and make sure that it's named as "test.csv".

In [72]:
test_data = pd.read_csv('test.csv', encoding='latin-1')

Preprocess the test data in the same way as training data.

In [73]:
test_data['text'].dropna(inplace=True)

In [74]:
test_data['text'] = [entry.lower() for entry in test_data['text']]

In [75]:
# Use the same encoder as that of training data
test_data['category'] = Encoder.transform(test_data['category'])

In [76]:
test_data['text'] = [word_tokenize(entry) for entry in test_data['text']]

In [77]:
for index, entry in enumerate(test_data['text']):
    Final_words = []
    for word, tag in pos_tag(entry):
        if word not in stopwords.words('english') and word.isalpha():
            word_Final = word_Lemmatized.lemmatize(word, pos = tag_map[tag[0]])
            Final_words.append(word_Final)
    test_data.loc[index,'text_final'] = str(Final_words)

In [78]:
# Vectorize the preprocessed test data using the trained TF-IDF vectorizer
test_x = Tfidf_vect.transform(test_data['text_final'])
test_y = test_data['category']

In [79]:
# Load the saved SVM model from file
loaded_SVM = joblib.load('final_model.pkl')
predictions = loaded_SVM.predict(test_x)

In [80]:
# Evaluate SVM model on test data
accuracy_tfidf = accuracy_score(predictions, test_y)
precision_tfidf = precision_score(predictions, test_y, average='weighted')
recall_tfidf = recall_score(predictions, test_y, average='weighted')
f1_tfidf = f1_score(predictions, test_y, average='weighted')

print("SVM Metrics on Test Data:")
print(f"Accuracy: {accuracy_tfidf * 100:}%")
print(f"Precision: {precision_tfidf:}")
print(f"Recall: {recall_tfidf:}")
print(f"F1-score: {f1_tfidf:}")

SVM Metrics on Test Data:
Accuracy: 99.59550561797754%
Precision: 0.9959614752970547
Recall: 0.9959550561797753
F1-score: 0.9959555264488154
