# Load dataset

In [1]:
from sklearn.model_selection import train_test_split
import pandas as pd

In [2]:
data = pd.read_csv('Data_processed.csv')

X_train, X_test, y_train, y_test = train_test_split(data['Text_parsed'],                                                  
                                                    data['Category_target'], 
                                                    test_size=0.2, 
                                                    random_state=8)

In [3]:
len(X_train), len(X_test)

(48974, 12244)

In [4]:
categories = set(data['section'].values.tolist())
print(categories)

{'Technology', 'Education', 'Well', 'Theater', 'Your Money', 'Opinion', 'Global Business', 'Television', 'Health', 'Food', 'Travel', 'Real Estate', 'Books', 'Fashion', 'Dance', 'Music', 'Style', 'Art & Design', 'Movies', 'Science', 'Sports', 'Media', 'Economy', 'Automobiles'}


In [5]:
no_of_categories = len(categories)
print(no_of_categories)

24


# Vectorize data

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [7]:
ngram_range = (1,2)
min_df = 10
max_df = 1.
max_features = 2000

In [8]:
tfidf = TfidfVectorizer(encoding='utf-8',
                        ngram_range=ngram_range,
                        stop_words=None,
                        lowercase=False,
                        max_df=max_df,
                        min_df=min_df,
                        max_features=max_features,
                        norm='l2',
                        sublinear_tf=True)
                        
features_train = tfidf.fit_transform(X_train).toarray()
labels_train = y_train
# print(features_train)

features_test = tfidf.transform(X_test).toarray()
labels_test = y_test
# print(features_test.shape)


In [9]:
print(type(features_train))

<class 'numpy.ndarray'>


In [10]:
import numpy as np

np.save('features_train.npy', features_train)
np.save('features_test.npy', features_test)


In [None]:
np.save('labels_train.npy', np.array(labels_train))
np.save('labels_test.npy', np.array(labels_test))

In [11]:
features_test = np.load('features_test.npy')
features_train = np.load('features_train.npy')

# Train individual models

In [12]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import pickle

#### 1. Naive Bayes: 
This is a probabilistic algorithm that is easy to implement and works well with text data. It's fast and requires less training data.

In [17]:
from sklearn.naive_bayes import GaussianNB


nb_model = GaussianNB()
nb_model.fit(features_train, labels_train)
model_predictions = nb_model.predict(features_test)
print('Accuracy: ', accuracy_score(labels_test, model_predictions))
print(classification_report(labels_test, model_predictions))

Accuracy:  0.34580202548186867
              precision    recall  f1-score   support

           0       0.80      0.27      0.40       594
           1       0.21      0.53      0.30       346
           2       0.62      0.15      0.24       636
           3       0.34      0.87      0.49       567
           4       0.22      0.44      0.30       346
           5       0.06      0.37      0.10       155
           6       0.72      0.14      0.24       613
           7       0.46      0.45      0.45       601
           8       0.14      0.73      0.23       214
           9       0.66      0.30      0.42       586
          10       0.71      0.23      0.35       609
          11       0.43      0.28      0.34       601
          12       0.81      0.43      0.56       635
          13       0.55      0.14      0.22       603
          14       0.67      0.35      0.46       621
          15       0.55      0.29      0.38       585
          16       0.74      0.55      0.63       

In [25]:
# save the model to disk
filename = './models/gaussian_nb_model.sav'
pickle.dump(nb_model, open(filename, 'wb'))

# # later, load the model from disk
# loaded_model = pickle.load(open(filename, 'rb'))

#### 2. Logistic Regression: 
This algorithm works well for binary and multi-class classification problems. It's a linear model that learns the relationships between the input features and the output labels.

In [20]:
from sklearn.linear_model import LogisticRegression

lr_model = LogisticRegression()
print(lr_model.get_params())
lr_model.fit(features_train, labels_train)

lr_model_predictions = lr_model.predict(features_test)
print('\nAccuracy: ', accuracy_score(labels_test, lr_model_predictions), "\n")
print(classification_report(labels_test, lr_model_predictions))

{'C': 1.0, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 100, 'multi_class': 'auto', 'n_jobs': None, 'penalty': 'l2', 'random_state': None, 'solver': 'lbfgs', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}

Accuracy:  0.7252531852335838 

              precision    recall  f1-score   support

           0       0.77      0.77      0.77       594
           1       0.80      0.78      0.79       346
           2       0.67      0.75      0.71       636
           3       0.91      0.90      0.90       567
           4       0.71      0.65      0.68       346
           5       0.64      0.56      0.60       155
           6       0.65      0.56      0.60       613
           7       0.73      0.78      0.75       601
           8       0.73      0.67      0.70       214
           9       0.72      0.72      0.72       586
          10       0.65      0.64      0.65       609
          11       0.77      0.81      0.

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [24]:
# save the model to disk
filename = './models/lr_model.sav'
pickle.dump(lr_model, open(filename, 'wb'))

# # later, load the model from disk
# loaded_model = pickle.load(open(filename, 'rb'))

#### 3. Support Vector Machines (SVM): 
This is a powerful algorithm that can be used for both linear and non-linear classification problems. SVMs work by finding the best hyperplane that separates the data into different classes.

In [30]:
from sklearn.svm import SVC

# create SVM classifier
svm_model = SVC(kernel='linear')

# train the classifier on the training data
svm_model.fit(features_train, labels_train)

# make predictions on the testing data
svm_model_predictions = svm_model.predict(features_test)

# evaluate the accuracy of the classifier
print('\nAccuracy: ', accuracy_score(labels_test, svm_model_predictions), "\n")
print(classification_report(labels_test, svm_model_predictions))


Accuracy:  0.7174942829140804 

              precision    recall  f1-score   support

           0       0.73      0.76      0.75       594
           1       0.78      0.79      0.78       346
           2       0.67      0.73      0.70       636
           3       0.90      0.89      0.90       567
           4       0.68      0.66      0.67       346
           5       0.58      0.68      0.63       155
           6       0.61      0.54      0.57       613
           7       0.71      0.79      0.75       601
           8       0.69      0.68      0.69       214
           9       0.70      0.72      0.71       586
          10       0.64      0.63      0.63       609
          11       0.76      0.82      0.79       601
          12       0.83      0.82      0.82       635
          13       0.65      0.72      0.69       603
          14       0.75      0.80      0.77       621
          15       0.70      0.72      0.71       585
          16       0.87      0.87      0.87     

In [31]:
# save the model to disk
filename = './models/svm_model.sav'
pickle.dump(svm_model, open(filename, 'wb'))

# # later, load the model from disk
# loaded_model = pickle.load(open(filename, 'rb'))




#### 4. Random Forest: 
This is an ensemble learning algorithm that combines multiple decision trees to improve the accuracy of the classification. It works well for both text and non-text data.

In [32]:
from sklearn.ensemble import RandomForestClassifier

rf_model  = RandomForestClassifier(random_state=1)
rf_model.fit(features_train, labels_train)
rf_model_predictions = rf_model.predict(features_test)

# evaluate the accuracy of the classifier
print('\nAccuracy: ', accuracy_score(labels_test, rf_model_predictions), "\n")
print(classification_report(labels_test, rf_model_predictions))


Accuracy:  0.6534629206141783 

              precision    recall  f1-score   support

           0       0.68      0.77      0.72       594
           1       0.72      0.73      0.73       346
           2       0.58      0.69      0.63       636
           3       0.88      0.87      0.88       567
           4       0.66      0.57      0.61       346
           5       0.50      0.62      0.55       155
           6       0.59      0.52      0.56       613
           7       0.59      0.77      0.67       601
           8       0.55      0.48      0.51       214
           9       0.66      0.67      0.66       586
          10       0.60      0.50      0.54       609
          11       0.70      0.84      0.77       601
          12       0.79      0.76      0.77       635
          13       0.50      0.60      0.55       603
          14       0.62      0.74      0.67       621
          15       0.63      0.62      0.63       585
          16       0.72      0.84      0.77     

In [33]:
# save the model to disk
filename = './models/rf_model.sav'
pickle.dump(rf_model, open(filename, 'wb'))

# # later, load the model from disk
# loaded_model = pickle.load(open(filename, 'rb'))

## 5. Gradient Boosting: 
This is another ensemble learning algorithm that combines multiple weak learners to make a strong classifier. It's known for its high accuracy and ability to handle complex data.

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# create GradientBoostingClassifier for multi-class classification
gb_model = GradientBoostingClassifier(n_estimators=50, learning_rate=0.01, max_depth=3, random_state=1)

# train the classifier on the training data
gb_model.fit(features_train, labels_train)

# make predictions on the testing data
gb_model_predictions = gb_model.predict(features_test)

In [None]:

# evaluate the accuracy of the classifier
print('\nAccuracy: ', accuracy_score(labels_test, gb_model_predictions), "\n")
print(classification_report(labels_test, gb_model_predictions))

In [15]:

# evaluate the accuracy of the classifier
print('\nAccuracy: ', accuracy_score(labels_test, gb_model_predictions), "\n")
print(classification_report(labels_test, gb_model_predictions))


Accuracy:  0.4150604377654361 

              precision    recall  f1-score   support

           0       0.72      0.56      0.63       594
           1       0.75      0.40      0.52       346
           2       0.65      0.39      0.49       636
           3       0.90      0.74      0.81       567
           4       0.70      0.30      0.42       346
           5       0.61      0.35      0.44       155
           6       0.72      0.34      0.47       613
           7       0.80      0.36      0.49       601
           8       0.43      0.21      0.28       214
           9       0.63      0.37      0.46       586
          10       0.56      0.31      0.40       609
          11       0.64      0.69      0.66       601
          12       0.78      0.44      0.56       635
          13       0.50      0.34      0.40       603
          14       0.64      0.34      0.44       621
          15       0.74      0.27      0.39       585
          16       0.80      0.40      0.54     

In [16]:
# save the model to disk
filename = './models/gb_model.sav'
pickle.dump(gb_model, open(filename, 'wb'))

# # later, load the model from disk
# loaded_model = pickle.load(open(filename, 'rb'))

## 6. Convolutional Neural Networks (CNN): 
This algorithm uses deep learning techniques to learn features from the input text data. It's effective for text classification tasks that involve identifying patterns in the input.


In [23]:
import torch
import torch.nn as nn

# define the convolutional neural network
class TextCNN(nn.Module):
    def __init__(self, input_dim, output_dim):
        super().__init__()
        self.conv1 = nn.Conv1d(in_channels=input_dim, out_channels=100, kernel_size=3, padding=1)
        self.conv2 = nn.Conv1d(in_channels=100, out_channels=100, kernel_size=3, padding=1)
        self.conv3 = nn.Conv1d(in_channels=100, out_channels=100, kernel_size=3, padding=1)
        self.pool = nn.AdaptiveMaxPool1d(1)
        self.fc = nn.Linear(100, output_dim)

    def forward(self, x):
        # x = x.permute(0, 2, 1)
        x = nn.functional.relu(self.conv1(x))
        x = nn.functional.relu(self.conv2(x))
        x = nn.functional.relu(self.conv3(x))
        x = self.pool(x).squeeze(-1)
        x = self.fc(x)
        return x

In [24]:
from tqdm import tqdm

# convert data to PyTorch tensors
X_train = torch.tensor(features_train, dtype=torch.float32)
y_train = torch.tensor(labels_train, dtype=torch.long)
X_test = torch.tensor(features_test, dtype=torch.float32)
y_test = torch.tensor(np.array(labels_test.values.tolist()), dtype=torch.long)

# define hyperparameters and model architecture
input_dim = X_train.shape[1]
output_dim = len(categories)
learning_rate = 3e-4
batch_size = 16
num_epochs = 10

# create an instance of the TextCNN model
cnn_model = TextCNN(input_dim, output_dim)

# define the loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(cnn_model.parameters(), lr=learning_rate)

# train the model
def train(cnn_model):
    for epoch in tqdm(range(num_epochs), total=len(range(num_epochs))):
        train_loss, train_acc = 0.0, 0.0
        cnn_model.train()
        for i in range(0, X_train.size(0), batch_size):
            optimizer.zero_grad()
            batch_x = X_train[i:i+batch_size]
            batch_y = y_train[i:i+batch_size]
            y_pred = cnn_model(batch_x)
            loss = criterion(y_pred, batch_y)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
            train_acc += accuracy_score(batch_y.tolist(), torch.argmax(y_pred, dim=1).tolist())
        train_loss /= (X_train.size(0) // batch_size)
        train_acc /= (X_train.size(0) // batch_size)
        print(f"Train loss: {train_loss}, Train accuracy: {train_acc}")
        
    return cnn_model

# def evaluate():
#     # evaluate the model on the test set
#     with torch.no_grad():
#         model.eval()
#         test_loss, test_acc = 0.0, 0.
        


cnn_model = train(cnn_model)


  0%|          | 0/10 [00:00<?, ?it/s][A
Exception ignored in: <function tqdm.__del__ at 0x000001E148F41AF0>
Traceback (most recent call last):
  File "D:\Anaconda\lib\site-packages\tqdm\std.py", line 1162, in __del__
    self.close()
  File "D:\Anaconda\lib\site-packages\tqdm\std.py", line 1291, in close
    if self.last_print_t < self.start_t + self.delay:
AttributeError: 'tqdm' object has no attribute 'last_print_t'


RuntimeError: Given groups=1, weight of size [100, 2000, 3], expected input[1, 16, 2000] to have 2000 channels, but got 16 channels instead

In [18]:
# labels_test

24074    15
5107      0
12282    12
119      17
50268    10
         ..
5571     10
15788    16
1166     13
37356     6
39060     2
Name: Category_target, Length: 12244, dtype: int64