In [1]:
#To convert notebook to py file: ipynb-py-convert classification.ipynb classification.py

import torch
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.dummy import DummyClassifier
import spacy
from torch.autograd import Variable
from tqdm import tqdm

#Loading the core English Model.
#sp = spacy.load("en_core_web_sm")

In [2]:
'''
def spacy_tokenizer(sentence):
  sent = sp(sentence)
  tokens = []
  for token in sent:
      tokens.append(token.text)
  return tokens
'''

def TFIDF(data):
    vectorizer_x = TfidfVectorizer(max_features=2000)
    Data_TDM = vectorizer_x.fit_transform(data).toarray()
    return Data_TDM

In [3]:
train_df = pd.read_csv('train_custom.csv')
test_df = pd.read_csv('test_custom.csv')

In [4]:
#Majority Classifier Baseline

dummy_clf = DummyClassifier(strategy='most_frequent', random_state=0)
dummy_clf.fit(train_df['conversation'], train_df['category'])
print(f"Majority Classification Baseline = {dummy_clf.score(test_df['conversation'],test_df['category']) * 100} %")

Majority Classification Baseline = 19.459053343350863 %


In [5]:
X_train, X_test = TFIDF(train_df['conversation']), TFIDF(test_df['conversation'])
Y_train, Y_test = train_df['category'], test_df['category']

print(X_train.shape)
print(X_test.shape)

(14, 2000)
(1331, 2000)


In [6]:
class ClassifierNet(torch.nn.Module):
     def __init__(self, input_size, hidden_size, num_classes):
        super().__init__()
        self.layer_1 = torch.nn.Linear(input_size,hidden_size, bias=True)
        self.relu = torch.nn.ReLU()
        self.layer_2 = torch.nn.Linear(hidden_size, hidden_size, bias=True)
        self.output_layer = torch.nn.Linear(hidden_size, num_classes, bias=True)
        #self.softmax = torch.nn.Softmax(dim=1)

     def forward(self, x):
        out = self.layer_1(x)
        out = self.relu(out)
        out = self.layer_2(out)
        out = self.relu(out)
        out = self.output_layer(out)
        return out

input_size =  X_train.shape[1] #THE INPUT TO THE NN SHOULD ALWAYS BE THE DIMENSION OF THE FEATURE VECTORS!!
hidden_size = 100
num_classes = 6
learning_rate = 0.01
num_epochs = 10

device = ("cuda:0" if torch.cuda.is_available() else "cpu")

net = ClassifierNet(input_size, hidden_size, num_classes)
net.to(device)

criterion = torch.nn.CrossEntropyLoss()  
criterion.to(device)

optimizer = torch.optim.Adam(net.parameters(), lr=learning_rate)

print(net)

ClassifierNet(
  (layer_1): Linear(in_features=2000, out_features=100, bias=True)
  (relu): ReLU()
  (layer_2): Linear(in_features=100, out_features=100, bias=True)
  (output_layer): Linear(in_features=100, out_features=6, bias=True)
)


In [7]:
net.train()

for epoch in tqdm(range(num_epochs)):
    optimizer.zero_grad()  # zero the gradient buffer
    
    articles = Variable(torch.from_numpy(np.asarray(X_train)).float())
    
    labels = Variable(torch.from_numpy(np.asarray(Y_train)))
    
    outputs = net(articles.to(device))
    
    loss = criterion(outputs.to(device), labels.to(device))
    
    loss.backward()
    optimizer.step()

    print(f'Epoch: {epoch} | Training Loss: {loss.item()} ')

100%|██████████| 10/10 [00:00<00:00, 27.50it/s]

Epoch: 0 | Training Loss: 1.8412648439407349 
Epoch: 1 | Training Loss: 1.7636734247207642 
Epoch: 2 | Training Loss: 1.632741928100586 
Epoch: 3 | Training Loss: 1.402321457862854 
Epoch: 4 | Training Loss: 1.074174165725708 
Epoch: 5 | Training Loss: 0.7359715700149536 
Epoch: 6 | Training Loss: 0.4885139465332031 
Epoch: 7 | Training Loss: 0.31281206011772156 
Epoch: 8 | Training Loss: 0.19671079516410828 
Epoch: 9 | Training Loss: 0.12075690925121307 





In [10]:
# Test the Model
net.eval()

correct = 0
total = len(test_df)

articles = Variable(torch.from_numpy(np.asarray(X_test)).float())
labels = Variable(torch.from_numpy(np.asarray(Y_test)))

outputs = net(articles.to(device))

_, predicted = torch.max(outputs.data, 1)

correct = (predicted == labels).sum()

print(f'Accuracy of the network on the {total} test articles: {100 * correct / total} %')

tensor([4, 4, 4,  ..., 5, 5, 2])
tensor([5, 4, 2,  ..., 5, 5, 5])
Accuracy of the network on the 1331 test articles: 26 %
