# Small Text Classifier

A machine-learning model that classifies short texts (e.g., tweets, news headlines).

Input: Training data in tabular form (e.g., Excel file) with two columns: Text, Tag 
Output: A DataFrame containing the texts and their predicted classes

Note: Count vectorization (`CountVectorizer`) achieves greater accuracy than TF-IDF.

In [None]:
import datetime
import numpy as np
import pandas as pd
import scipy
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder

import torch
from torch import nn
from torch import optim

%matplotlib inline
%config InlineBackend.figure_format = 'retina'
torch.__version__

# Parameters

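Pick the tags for classification (e.g., economy, lifestyle, war). `tagIds` maps each predicted class index to the name shown in the results, so it needs one entry per tag; the indices follow the order assigned by the `LabelEncoder` (tag values in sorted order).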

In [None]:
# Placeholder list of class labels to fill in; not referenced by any other
# cell visible in this notebook.
tags = ['write your tags here']
# Lookup from predicted class index (int) to the display name written into the
# results table by the prediction cell. A missing index raises KeyError there.
tagIds = {0:'id your tags to beautify printing'}

# Read training data

In [None]:
# Load the labelled examples, keeping only the two columns the classifier uses,
# then discard incomplete rows and exact repeats in a single chained expression.
df = (
    pd.read_excel(r'....xlsx', usecols = ['Text', 'Tag'])
    .dropna()
    .drop_duplicates()
)
df

# Encode tags

In [None]:
# Fit the encoder on the tag column and store the resulting integer ids next to
# the original string labels. `le` is kept so ids can be mapped back later.
le = LabelEncoder()
df['EncodedTag'] = le.fit_transform(df['Tag'])
df

# Training Data Statistics

In [None]:
from collections import Counter

# Number of examples per tag. Counter keeps tags in order of first appearance
# (same ordering as a manually filled dict) and avoids walking the frame row by
# row with iterrows().
counts = Counter(df['Tag'])

print('Data set size:', sum(counts.values()))

# Pie chart of the class balance. Explicit lists are passed because dict views
# are not sequences, so array conversion of them is not dependable.
fig1, ax1 = plt.subplots()
ax1.pie(list(counts.values()), labels=list(counts.keys()), autopct='%1.1f%%')
fig1.set_size_inches(6, 6)
plt.show()

# Split data for training

In [None]:
# Hold out 20% of the rows for evaluation. The features come from the 'Text'
# column: the frame only carries 'Text', 'Tag' and 'EncodedTag', so any other
# column name (such as 'Headline') raises a KeyError.
x_train, x_test, y_train, y_test = train_test_split(df['Text'], df['EncodedTag'], test_size=.2)
print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)

# Vectorize training data

In [None]:
# Learn the vocabulary from the training texts only, then reuse that same
# vocabulary to encode the held-out texts.
vectorizer = CountVectorizer()
x_train = vectorizer.fit_transform(x_train)
x_test = vectorizer.transform(x_test)

# Densify the sparse count matrices and hand them to torch as float tensors.
x_train = torch.tensor(x_train.toarray()).float()
x_test = torch.tensor(x_test.toarray()).float()

# Encoded tags become tensors of class indices.
y_train = torch.tensor(y_train.to_numpy())
y_test = torch.tensor(y_test.to_numpy())

# Define the neural network

In [None]:
neurons = 64  # width of the single hidden layer

# Word-count features -> hidden layer -> one log-probability per tag.
# LogSoftmax is paired with NLLLoss below.
model = nn.Sequential(nn.Linear(x_train.shape[1], neurons),
                      nn.ReLU(),
                      #nn.Dropout(0.1),  # optional regularisation, currently disabled
                      nn.Linear(neurons, df['Tag'].nunique()),
                      nn.LogSoftmax(dim = 1))

criterion = nn.NLLLoss()
# No forward/backward pass is run here: gradients computed before training
# starts would be discarded by the first optimizer.zero_grad() call anyway.
optimizer = optim.Adam(model.parameters(), lr=0.002)

# Training

In [None]:
%%time

train_losses = []
test_losses = []
test_accuracies = []

epochs = 50
for e in range(epochs):
    optimizer.zero_grad()

    output = model.forward(x_train.type(torch.FloatTensor))
    loss = criterion(output, y_train.type(torch.LongTensor))
    loss.backward()
    train_loss = loss.item()
    train_losses.append(train_loss)
    
    optimizer.step()

    with torch.no_grad():
        model.eval()
        log_ps = model(x_test.type(torch.FloatTensor))
        test_loss = criterion(log_ps, y_test.type(torch.LongTensor))
        test_losses.append(test_loss)

        ps = torch.exp(log_ps)
        top_p, top_class = ps.topk(1, dim=1)
        equals = top_class == y_test.view(*top_class.shape)
        test_accuracy = torch.mean(equals.float())
        test_accuracies.append(test_accuracy)

    model.train()
    print(f"Epoch: {e+1}/{epochs}.. ", f"Training Loss: {train_loss:.3f}.. ", f"Test Loss: {test_loss:.3f}.. ", f"Test Accuracy: {test_accuracy:.3f}")

# Two panels side by side: loss curves on the left, accuracy on the right.
plt.figure(figsize=(12, 5))

ax = plt.subplot(121)
ax.plot(train_losses, label='Training loss')
ax.plot(test_losses, label='Validation loss')
ax.set_xlabel('Epochs')
ax.set_ylabel('Negative log likelihood loss')
ax.legend(frameon=False)

plt.subplot(122)
plt.plot(test_accuracies)
plt.xlabel('Epochs')
# The trailing semicolon keeps the notebook from echoing the return value.
plt.ylabel('Test accuracy');

# Test the model with another dataset

In [None]:
df = pd.read_excel(r'....xlsx')

# The texts to classify are taken from the first column of the sheet and
# encoded with the vocabulary learned during training.
texts = df.iloc[:, 0].tolist()
x = vectorizer.transform(texts)
x = torch.tensor(x.toarray()).float()

# Inference only: put the model in eval mode (so layers such as Dropout are
# inactive) and skip gradient tracking.
model.eval()
with torch.no_grad():
    output = model(x)

# Most likely class per row, translated to its display name through tagIds.
outputTags = [tagIds[class_id] for class_id in output.argmax(dim=1).tolist()]
df['Tags'] = outputTags
df