### Import libraries

In [None]:
import string
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import qc, stopwords
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.utils.class_weight import compute_class_weight
from scipy.stats import uniform
from xgboost import XGBClassifier

import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
from scipy.sparse import csr_matrix
import urllib.request
%matplotlib inline

import transformers
from transformers import MobileBertTokenizer, MobileBertModel

### Download question classification dataset and explore

The question classification dataset contains questions labeled into 50 classes, which can be categorized into six main categories. 

- ABBR: Denotes abbreviations
- ENTY: Stands for entities
- DESC: Denotes descriptions and abstract concepts
- HUM: Denotes human beings
- LOC: Denotes locations
- NUM: Stands for numeric values

In [None]:
# Download the Question Classification (QC) corpus through NLTK
nltk.download('qc')
# qc.tuples returns (label, question) pairs for the given corpus file
train_tuples = qc.tuples("train.txt")
test_tuples = qc.tuples("test.txt")

In [None]:
len(train_tuples), len(test_tuples)
# (5452, 500)

In [None]:
train_tuples[0]
# ('DESC:manner', 'How did serfdom develop in and then leave Russia ?')

In [None]:
# build one DataFrame per split; both share the (full_label, text) column layout
train_df, test_df = (
    pd.DataFrame(rows, columns=['full_label', 'text'])
    for rows in (train_tuples, test_tuples)
)
train_df.head()

In [None]:
# split each "MAIN:granular" label at the colon into a main and a granular category
for frame in (train_df, test_df):
    frame[['main_cat', 'gran_cat']] = frame['full_label'].str.split(':', expand=True)
train_df.head()

In [None]:
# How many categories do they have?
# nunique(dropna=False) counts distinct values exactly like len(series.unique())
print("***********train***********")
print(f'no of unique classes: {train_df["full_label"].nunique(dropna=False)}')
print(f'no of  main classes: {train_df["main_cat"].nunique(dropna=False)}')
print(f'no of granular  classes: {train_df["gran_cat"].nunique(dropna=False)}')
print("***********test***********")
print(f'no of unique classes: {test_df["full_label"].nunique(dropna=False)}')
print(f'no of  main classes: {test_df["main_cat"].nunique(dropna=False)}')
print(f'no of granular  classes: {test_df["gran_cat"].nunique(dropna=False)}')

# See the distribution of the examples.
print('***********train***********')
print(train_df["main_cat"].value_counts())
print('***********test***********')
print(test_df["main_cat"].value_counts())

# Are all the training labels present in the test set and vice versa?
# (size of the union of the labels seen in either split)
all_unique_values = set(train_df["full_label"]).union(test_df["full_label"])
len(all_unique_values)

# ***********train***********
# no of unique classes: 50
# no of  main classes: 6
# no of granular  classes: 47
# ***********test***********
# no of unique classes: 42
# no of  main classes: 6
# no of granular  classes: 39
# ***********train***********
# main_cat
# ENTY    1250
# HUM     1223
# DESC    1162
# NUM      896
# LOC      835
# ABBR      86
# Name: count, dtype: int64
# ***********test***********
# main_cat
# DESC    138
# NUM     113
# ENTY     94
# LOC      81
# HUM      65
# ABBR      9
# Name: count, dtype: int64
# 50

### Process text

- make copy
- convert text to lowercase
- remove punctuation
- remove stop words
- label encoder

In [None]:
# make copy of dataframes
# The cleaning cells below overwrite the 'text' column in place; these copies
# keep the raw text, which is used again for the MobileBERT section.
train_orig = train_df.copy()
test_orig = test_df.copy()

In [None]:
# lowercase the question text of both splits
for frame in (train_df, test_df):
    frame['text'] = frame['text'].str.lower()

In [None]:
# remove punctuation
# Series.str.replace treats its pattern as a literal string unless regex=True
# is passed (the default since pandas 2.0), so the previous character-class
# pattern never matched and punctuation was left in place.
# str.translate deletes every character listed in string.punctuation and
# needs no regex escaping.
punctuation_table = str.maketrans('', '', string.punctuation)
train_df['text'] = train_df['text'].str.translate(punctuation_table)
test_df['text'] = test_df['text'].str.translate(punctuation_table)

In [None]:
# load NLTK's English stop-word list (the corpus is downloaded first)
nltk.download('stopwords')
stop_words = stopwords.words('english')
# show the full list and its size
print(stop_words)
print(f'num of stop words: {len(stop_words)}')

# ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"]
# num of stop words: 179

In [None]:
# interrogative words are taken out of the stop-word list so that they are
# kept in the question text
remove_list = ["which", "who", "why", "how", "what", "when", "where", "whom"]
stop_words = list(filter(lambda word: word not in remove_list, stop_words))
print(f'num of stop words: {len(stop_words)}')

# num of stop words: 171

In [None]:
# helper used to strip stop words from a single piece of text
def remove_stop_words(text, stop_words):
    """Return `text` without the tokens that appear in `stop_words`.

    Args:
        text: string to filter; it is tokenized with str.split().
        stop_words: container of tokens to drop (membership is tested with `in`).

    Returns:
        The remaining tokens joined by single spaces.
    """
    kept_tokens = filter(lambda token: token not in stop_words, text.split())
    return ' '.join(kept_tokens)

In [None]:
# strip the stop words from the text column of both splits
for frame in (train_df, test_df):
    frame['text'] = frame['text'].apply(remove_stop_words, args=(stop_words,))

In [None]:
# fit a LabelEncoder on the main categories of the untouched training copy
le = LabelEncoder().fit(train_orig['main_cat'])

# map every class name to the integer code the encoder assigns to it
label_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
print(label_mapping)

# {'ABBR': 0, 'DESC': 1, 'ENTY': 2, 'HUM': 3, 'LOC': 4, 'NUM': 5}

In [None]:
# replace the string categories by their integer codes
# (not idempotent: the column holds integers afterwards)
for frame in (train_df, test_df):
    frame['main_cat'] = le.transform(frame['main_cat'])

In [None]:
train_df.head()

# 	full_label	text	main_cat	gran_cat
# 0	DESC:manner	how serfdom develop leave russia ?	1	manner
# 1	ENTY:cremat	what films featured character popeye doyle ?	2	cremat
# 2	DESC:manner	how find list celebrities ' real names ?	1	manner
# 3	ENTY:animal	what fowl grabs spotlight chinese year monkey ?	2	animal
# 4	ABBR:exp	what full form .com ?	0	exp

### Split the data

In [None]:
# obtain the features and labels
# X / y: cleaned text and encoded main category of the training split
X = train_df['text']
y = train_df['main_cat']
# X_test / y_test: the same columns of the separate test split
X_test = test_df['text']
y_test = test_df['main_cat']

In [None]:
# split train set: 15% of it is held out as a validation set
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size = 0.15,
    random_state = 42,
    stratify = y # keep the class proportions of y in both parts
)

In [None]:
# number of examples in each split
print(f'Shape of X_train: {X_train.shape}')
print(f'Shape of X_valid: {X_valid.shape}')
print(f'Shape of X_test: {X_test.shape}')

# share of each split relative to all examples (train + valid + test)
total_size = len(X_train)+len(X_valid)+len(X_test)
print(f'total dataset size: {total_size}')
print(f'train %: {len(X_train)/total_size*100}')
print(f'valid %: {len(X_valid)/total_size*100}')
print(f'test %: {len(X_test)/total_size*100}')

# Shape of X_train: (4634,)
# Shape of X_valid: (818,)
# Shape of X_test: (500,)
# total dataset size: 5952
# train %: 77.85618279569893
# valid %: 13.743279569892472
# test %: 8.400537634408602

### Extract Features (BoW)

BoW模型是"Bag of Words"（词袋模型）的缩写，是自然语言处理（NLP）中常用的一种表示文本的方法。在这个模型中，文本被看作是一个由单词组成的集合，忽略了单词出现的顺序和语法结构，只关注单词的频率信息。具体来说，BoW模型将文本表示为一个由单词构成的向量，向量的每个维度代表一个单词，在这个维度上的值代表了对应单词在文本中出现的次数或者其他统计信息（比如TF-IDF值）。

BoW模型简单易懂，适用于许多NLP任务，比如文本分类、情感分析、信息检索等。然而，由于忽略了单词的顺序和语义信息，BoW模型在处理含有语义复杂性的文本时可能会失去一些重要的信息。

n-grams是一种用于从文本中提取特征的方法，它将文本分成长度为n的连续单词序列。n-grams可用于语言建模、文本分类、信息检索等自然语言处理任务中。

常见的n-grams包括：

1. **Unigrams (n=1)**：单个单词组成的序列。
2. **Bigrams (n=2)**：由两个相邻单词组成的序列。
3. **Trigrams (n=3)**：由三个相邻单词组成的序列。
4. **4-grams, 5-grams, ...**：依此类推，由n个相邻单词组成的序列。

例如，对于句子："The cat sat on the mat"，该句子的bigrams为：

- "The cat"
- "cat sat"
- "sat on"
- "on the"
- "the mat"

n-grams可以捕捉到文本中更多的局部信息，相较于词袋模型（Bag of Words），它保留了一定的顺序信息。在文本处理任务中，n-grams经常与词袋模型一起使用，作为文本特征的一部分。

In [None]:
# init CountVectorizer with 1- and 2-grams; the vocabulary is capped at 8000 entries
count_vect = CountVectorizer(ngram_range=(1, 2), max_features=8000)

In [None]:
# fit the CountVectorizer on the training split only and vectorize it
X_train_counts = count_vect.fit_transform(X_train)

# transform valid and test data with the vocabulary learned above (no refit)
X_valid_counts = count_vect.transform(X_valid)
X_test_counts = count_vect.transform(X_test)

In [None]:
print(X_train_counts.shape)
# Calculate the frequencies: column sums give the total count of every n-gram
word_frequencies = np.array(X_train_counts.sum(axis=0))[0]
print(word_frequencies.shape)
# Get the indexes that would sort the word frequencies (ascending)
sorted_indexes = np.argsort(word_frequencies)
print(sorted_indexes.shape)
# Get the vocabulary words corresponding to the indexes
vocabulary_words = np.array(count_vect.get_feature_names_out())
print(vocabulary_words.shape)
# Get the top and bottom frequent words
# (the last 10 sorted entries reversed = most frequent first; the first 10 = least frequent)
top_10_tokens = vocabulary_words[sorted_indexes[-10:]][::-1]
bottom_10_tokens = vocabulary_words[sorted_indexes[:10]]

print("Top 10 frequent tokens:")
print(top_10_tokens)

print("\nBottom 10 frequent tokens:")
print(bottom_10_tokens)

# (4634, 8000)
# (8000,)
# (8000,)
# (8000,)
# Top 10 frequent tokens:
# ['what' 'how' 'who' 'many' 'name' 'how many' 'where' 'first' 'when'
#  'world']

# Bottom 10 frequent tokens:
# ['peugeot' 'poing french' 'poing' 'poets society' 'poets' 'poetic meter'
#  'poetic' 'poet penned' 'poet co' 'poems made']

### Extract Features (TF-IDF)

基于词频计数的BoW/n-grams方法有一个局限：一个词出现的频率越高，就被认为越重要。TF-IDF可以缓解这个问题。

* Term: A term refers to an item in your vocabulary. For instance, in the above example, “films” is a term.
* Document: In this context, the entire sentence “Which films featured the character Popeye Doyle?” is considered a document.
* Collection: The collection is a group of documents, such as your training dataset.

TF (term frequency) measures how many times a term appears in a document, typically divided by the total number of terms in that document. IDF (inverse document frequency) is based on the ratio of the total number of documents to the number of documents that contain the term (usually log-scaled). Rare words appearing in only a few documents have higher IDF scores, whereas common words like “what” receive lower scores.

TF是某个词在一篇文档中出现的次数除以该文档的总词数；IDF是文档总数除以包含该词的文档数（通常再取对数）。

In [None]:
# TF-IDF features with the same n-gram range and vocabulary cap as the count vectorizer
tfidf_vect = TfidfVectorizer(ngram_range=(1, 2), max_features=8000)
# fit on the training split only, then reuse the fitted vectorizer for valid/test
X_train_tfidf = tfidf_vect.fit_transform(X_train)
X_valid_tfidf = tfidf_vect.transform(X_valid)
X_test_tfidf = tfidf_vect.transform(X_test)

### Linear Model solution - logistic

In [None]:
# baseline: logistic regression with default hyperparameters on the TF-IDF features
clf = LogisticRegression(random_state=42)
clf.fit(X_train_tfidf, y_train)

def get_accuracy(clf, X_data_split, y_data_split, split_name):
    """Print and return the accuracy of `clf` on one data split.

    Args:
        clf: fitted estimator exposing predict().
        X_data_split: features of the split.
        y_data_split: true labels of the split.
        split_name: label used in the printed message.

    Returns:
        The accuracy computed by accuracy_score.
    """
    score = accuracy_score(y_data_split, clf.predict(X_data_split))
    print(f'Accuracy on {split_name} : {score}')
    return score

# evaluate the model on every split
get_accuracy(clf, X_train_tfidf, y_train, 'train')
get_accuracy(clf, X_valid_tfidf, y_valid, 'valid')
get_accuracy(clf, X_test_tfidf, y_test, 'test')

# Accuracy on train : 0.9596460940871817
# Accuracy on valid : 0.8227383863080685
# Accuracy on test : 0.836

### Tune Hyperparameters

In [None]:
# one way is just change the optimizer and regularizer:
# the saga solver with an elastic-net penalty; l1_ratio is the L1 share of the
# penalty, so 0.0025 is almost pure L2
clf = LogisticRegression(
    random_state = 42, 
    solver = 'saga',
    penalty = 'elasticnet',
    l1_ratio = 0.0025,
    max_iter = 500
)

# refit and report accuracy on all three splits
clf.fit(X_train_tfidf, y_train)
get_accuracy(clf, X_train_tfidf, y_train, 'train')
get_accuracy(clf, X_valid_tfidf, y_valid, 'valid')
get_accuracy(clf, X_test_tfidf, y_test, 'test')

# Accuracy on train : 0.9594302977988779
# Accuracy on valid : 0.8227383863080685
# Accuracy on test : 0.836

In [None]:
# Randomized Search CV
model = LogisticRegression()

# define hyperparameter distributions to sample from
param_dist = {
    'C': uniform(loc=0, scale=4),  # inverse regularization strength, sampled uniformly from [0, 4] (linear scale, not log scale)
    'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'], # choice of optimizing algorithm
    'max_iter': np.arange(100, 500, 100),  # maximum iterations: 100, 200, 300 or 400
}

# create RandomizedSearchCV: 15 sampled configurations, each scored by 5-fold CV accuracy
random_search = RandomizedSearchCV(
    model,
    param_distributions = param_dist,
    n_iter = 15,
    scoring = 'accuracy',
    cv = 5,
    random_state = 42,
    n_jobs = -1
)

# fit the randomized search to data
random_search.fit(X_train_tfidf, y_train)

In [None]:
# Take the best model found by the search
# get the best hyperparameters and the corresponding mean cross-validated score
best_params = random_search.best_params_
best_score = random_search.best_score_

print('Best Hyperparameters:', best_params)
print('Best Accuracy Score', best_score)

# RandomizedSearchCV refits the best configuration on the whole data passed to
# fit() (refit=True is the default), so that estimator is reused here instead
# of training an identical model a second time.
best_model = random_search.best_estimator_
best_model

# Best Hyperparameters: {'C': 3.7542108360630007, 'max_iter': 200, 'solver': 'sag'}
# Best Accuracy Score 0.820242497105086

In [None]:
# evaluate the model on test data
get_accuracy(best_model, X_test_tfidf, y_test, 'test')

# Accuracy on test : 0.848
# get a better score

### Ensemble solution

In [None]:
# gradient-boosted trees (XGBoost, default hyperparameters) on the TF-IDF features
bst = XGBClassifier()
bst.fit(X_train_tfidf, y_train)

# accuracy on all three splits, using the same helper as for logistic regression
get_accuracy(bst, X_train_tfidf, y_train, 'train')
get_accuracy(bst, X_valid_tfidf, y_valid, 'valid')
get_accuracy(bst, X_test_tfidf, y_test, 'test')

# Accuracy on train : 0.9115235217954252
# Accuracy on valid : 0.7701711491442543
# Accuracy on test : 0.808

In [None]:
# evaluate the model with a confusion matrix
y_pred = bst.predict(X_test_tfidf)
cm = confusion_matrix(y_test, y_pred, labels=bst.classes_)

# display
# The tick labels must describe the same classes, in the same order, as the
# `labels` used to build `cm`; they were previously taken from the unrelated
# logistic-regression model `clf`. The encoded classes are decoded back to
# their category names for readability.
disp = ConfusionMatrixDisplay(
    confusion_matrix=cm,
    display_labels=le.inverse_transform(bst.classes_)
)
disp.plot()
plt.show()

# compute precision, recall and f1 score (macro = unweighted mean over classes)
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')
print(precision, recall, f1)

# (0.8611752545511348, 0.7591300972695163, 0.7922182374995442)

### Neural Network (Linear) solution

In [None]:
# implement a neural network class
class ClassificationNet(nn.Module):
    """Fully connected classifier with two hidden ReLU layers.

    The layer sizes used to be hard-coded; they are now parameters whose
    defaults reproduce the original 8000 -> 1024 -> 256 -> 6 architecture, so
    `ClassificationNet()` behaves as before and saved state dicts stay
    compatible (the layer names fc1/fc2/fc3 are unchanged).

    Args:
        input_dim: size of each input feature vector.
        hidden_dim1: width of the first hidden layer.
        hidden_dim2: width of the second hidden layer.
        n_classes: number of output logits.
    """

    def __init__(self, input_dim=8000, hidden_dim1=1024, hidden_dim2=256, n_classes=6):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim1)
        self.fc2 = nn.Linear(hidden_dim1, hidden_dim2)
        self.fc3 = nn.Linear(hidden_dim2, n_classes)

    def forward(self, x):
        """Return raw logits (no softmax) for a batch of feature vectors."""
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

In [None]:
# create a custom dataset class
class TfidfDataset(Dataset):
    """Dataset of dense float feature vectors built from a sparse matrix.

    Args:
        tfidf_vectors: sparse matrix with one row per example; it is densified
            completely in the constructor.
        labels: object with a `.values` attribute holding one label per row.
        transform: optional callable applied to each feature vector.
        target_transform: optional callable applied to each label.
    """

    def __init__(self, tfidf_vectors, labels, transform=None, target_transform=None):
        dense_vectors = tfidf_vectors.todense()
        self.labels = torch.tensor(labels.values)
        self.feature_vectors = torch.tensor(dense_vectors).float()
        self.transform = transform
        self.target_transform = target_transform

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        feature_vector, label = self.feature_vectors[idx], self.labels[idx]
        if self.transform:
            feature_vector = self.transform(feature_vector)
        if self.target_transform:
            label = self.target_transform(label)
        return feature_vector, label

In [None]:
# instantiate TfidfDataset objects, one per split (TF-IDF matrix + matching labels)
train_dataset = TfidfDataset(X_train_tfidf, y_train)
valid_dataset = TfidfDataset(X_valid_tfidf, y_valid)
test_dataset = TfidfDataset(X_test_tfidf, y_test)

In [None]:
# set batch size
batch_size = 64
# instantiate dataloader objects: only the training data is shuffled,
# validation and test keep their original order
tfidf_train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
tfidf_valid_dataloader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)
tfidf_test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [None]:
# instantiate the model, optimizer, loss function, and device
net = ClassificationNet()
optimizer = optim.Adam(net.parameters(), lr=0.002)
criterion = nn.CrossEntropyLoss()
# pick the first available backend: CUDA, then Apple MPS, otherwise CPU
device = (
    "cuda" if torch.cuda.is_available()
    else "mps" if torch.backends.mps.is_available()
    else "cpu"
)
print(device)
# `device` is read as a global by the training and evaluation functions below
net = net.to(device)

In [None]:
# training function
def train_nn(
    net,
    optimizer,
    criterion,
    train_dataloader,
    valid_dataloader,
    n_epochs = 3
):
    """Train `net` and record the mean loss per epoch.

    Every epoch runs one optimisation pass over `train_dataloader` followed by
    a gradient-free pass over `valid_dataloader`. Batches are moved to the
    notebook-level `device`.

    Args:
        net: model to train; each batch is passed as `net(inputs)`.
        optimizer: optimizer stepping the parameters of `net`.
        criterion: loss called as `criterion(outputs, labels)`.
        train_dataloader: iterable of (inputs, labels) training batches.
        valid_dataloader: iterable of (inputs, labels) validation batches.
        n_epochs: number of passes over the training data.

    Returns:
        (train_losses, valid_losses): two lists with one entry per epoch, each
        the summed batch loss divided by the number of batches.
    """
    n_train_batches = len(train_dataloader)
    n_valid_batches = len(valid_dataloader)
    train_losses = []
    valid_losses = []

    for _ in range(n_epochs):
        # optimisation pass
        net.train()
        running_train_loss = 0.0
        for inputs, labels in train_dataloader:
            inputs, labels = inputs.to(device), labels.to(device)

            optimizer.zero_grad()
            loss = criterion(net(inputs), labels)
            loss.backward()
            optimizer.step()

            running_train_loss += loss.item()

        # validation pass: no gradients, evaluation mode
        running_valid_loss = 0.0
        with torch.no_grad():
            net.eval()
            for valid_inputs, valid_labels in valid_dataloader:
                valid_inputs = valid_inputs.to(device)
                valid_labels = valid_labels.to(device)
                batch_loss = criterion(net(valid_inputs), valid_labels)
                running_valid_loss += batch_loss.item()

        train_losses.append(running_train_loss / n_train_batches)
        valid_losses.append(running_valid_loss / n_valid_batches)

    print('Finished Training')
    return train_losses, valid_losses

In [None]:
# plot learning curve function
def plot_learning_curve(train_losses, valid_losses):
    """Plot training and validation loss against the epoch index.

    Args:
        train_losses: sequence of training losses, one per epoch.
        valid_losses: sequence of validation losses, one per epoch.
    """
    curves = ((train_losses, 'Training loss'), (valid_losses, 'Validation loss'))
    for losses, curve_label in curves:
        plt.plot(losses, label=curve_label)

    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()
    plt.show()

In [None]:
# evaluate function
def evaluate_classnet(model, dataloader):
    """Evaluate a classifier on `dataloader` and print the metrics.

    Precision, recall and F1 are computed against the labels yielded by
    `dataloader` itself. They were previously computed against the global
    `y_test`, which gave wrong results (or a length mismatch) for any loader
    other than an unshuffled test loader.

    Batches are moved to the notebook-level `device`.

    Args:
        model: model called as `model(inputs)`, returning one row of class
            scores per example.
        dataloader: iterable of (inputs, labels) batches.

    Returns:
        (predictions, accuracy): the predicted class indices as a list, and the
        accuracy as an integer percentage (floor division, as before).
    """
    correct = 0
    total = 0
    predictions = []
    targets = []
    with torch.no_grad():
        model.eval()
        for inputs, labels in dataloader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            outputs = model(inputs)
            # index of the highest score in each row = predicted class
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
            predictions.extend(predicted.tolist())
            targets.extend(labels.tolist())
    accuracy = 100 * correct // total
    precision = precision_score(targets, predictions, average='macro')
    recall = recall_score(targets, predictions, average='macro')
    f1 = f1_score(targets, predictions, average='macro')
    print(f'Accuracy: {accuracy} %')
    print("Precision:", precision)
    print("Recall:", recall)
    print("F1 Score:", f1)
    return predictions, accuracy

In [None]:
# train the TF-IDF network for 4 epochs and keep the per-epoch losses
n_epochs = 4
train_losses, valid_losses = train_nn(
    net,
    optimizer,
    criterion,
    tfidf_train_dataloader,
    tfidf_valid_dataloader,
    n_epochs = n_epochs
)

In [None]:
plot_learning_curve(train_losses, valid_losses)

In [None]:
predictions, accuracy = evaluate_classnet(net, tfidf_test_dataloader)

### Word Embeddings (GloVe) + LSTM solution

In [None]:
# load embeddings into a dictionary
# glove_embed_dict: word -> vector; word_to_index / index_to_word: word <-> line number
glove_embed_dict = {}
word_to_index = {}
index_to_word = {}
# The encoding is given explicitly: the file contains non-ASCII tokens, and the
# platform default encoding is not UTF-8 everywhere.
with open('/glove6B/glove.6B.200d.txt', 'r', encoding='utf-8') as f:
    for index, line in enumerate(f):
        # each line is "<word> <v1> <v2> ..."
        values = line.split()
        word = values[0]
        vector = np.asarray(values[1:], 'float32')
        glove_embed_dict[word] = vector
        index_to_word[index] = word
        word_to_index[word] = index

In [None]:
# add pad and unk tokens to the dict
glove_embed_dict['<unk>'] = np.random.rand(200)
glove_embed_dict['<pad>'] = np.zeros(200)
# Give each special token the next free index. The index was previously derived
# from len(glove_embed_dict) + 1 after both tokens had been inserted, which
# skipped three indices. A token that already has an index keeps it, so two
# tokens can never share one.
for special_token in ('<unk>', '<pad>'):
    if special_token not in word_to_index:
        index = len(word_to_index)
        word_to_index[special_token] = index
        index_to_word[index] = special_token

In [None]:
# calculate average sequence length
# one token count per training question (tokens are split on single spaces)
seq_lens = [len(line.split(' ')) for line in train_df['text']]
total_seq_len = sum(seq_lens)
max_seq_len = max(seq_lens, default=0)

avg_seq_len = total_seq_len/len(train_df['text'])
print(f'Average seq len: {avg_seq_len}')
print(f'Max seq len: {max_seq_len}')

# Average seq len: 7.090975788701394
# Max seq len: 25

In [None]:
# implement the GloveDataset
class GloveDataset(Dataset):
    """Dataset yielding a fixed-length sequence of word vectors per text.

    Each text is tokenized with str.split(), mapped to indices (unknown words
    map to '<unk>'), truncated to `max_seq_len` and right-padded with '<pad>'.

    Args:
        text: iterable of strings.
        labels: object with a `.values` attribute holding one label per text.
        word_to_index: dict word -> index; must contain '<unk>' and '<pad>'.
        index_to_word: dict index -> word, the inverse of `word_to_index`.
        glove_embed_dict: dict word -> 1-D embedding vector.
        max_seq_len: number of tokens every sequence is cut or padded to.
    """

    def __init__(self, text, labels, word_to_index, index_to_word, glove_embed_dict, max_seq_len=20):
        self.labels = torch.tensor(labels.values)
        self.glove_embed_dict = glove_embed_dict
        self.word_to_index = word_to_index
        self.index_to_word = index_to_word
        self.max_seq_len = max_seq_len
        unk_index = self.word_to_index.get('<unk>')
        pad_index = self.word_to_index.get('<pad>')
        self.index_seq = []
        for line in text:
            indices = [self.word_to_index.get(word, unk_index) for word in line.split()]
            indices = indices[:self.max_seq_len]
            # pad short sequences; nothing is added when the text was truncated
            indices += [pad_index] * (self.max_seq_len - len(indices))
            self.index_seq.append(indices)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        """Return (embeddings, label); embeddings is a float tensor of shape (max_seq_len, embedding_dim)."""
        words = [self.index_to_word[i] for i in self.index_seq[idx]]
        # Stack into one ndarray first: creating a tensor directly from a list
        # of ndarrays is very slow and makes PyTorch emit a warning.
        vectors = np.stack([self.glove_embed_dict[word] for word in words])
        embeddings = torch.tensor(vectors, dtype=torch.float)
        return embeddings, self.labels[idx]

In [None]:
# define GloveClassifier
class GloveClassifier(nn.Module):
    """LSTM encoder followed by a linear classification head.

    Args:
        input_dim: size of each input embedding.
        hidden_dim: LSTM hidden size.
        output_dim: number of output logits.
        num_layers: number of stacked LSTM layers.
    """

    def __init__(self, input_dim, hidden_dim, output_dim, num_layers):
        super().__init__()
        self.lstm = nn.LSTM(
            input_size=input_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
        )
        self.linear_layer = nn.Linear(in_features=hidden_dim, out_features=output_dim)

    def forward(self, x):
        """Classify from the final hidden state of the last LSTM layer.

        `x` is batch-first (batch, seq_len, input_dim), as the LSTM is built
        with batch_first=True.
        """
        final_hidden = self.lstm(x)[1][0]
        return self.linear_layer(final_hidden[-1])

In [None]:
# set model and data config
input_dim = 200 # dimensionality of the GloVe vectors loaded above (200d file)
hidden_dim = 128 # LSTM hidden size
output_dim = 6 # number of main categories
num_layers = 2 
batch_size = 16
learning_rate = 0.002
n_epochs = 10

In [None]:
# instantiate dataset and dataloader
# NOTE: these rebind train_dataset / valid_dataset / test_dataset, which held
# the TfidfDataset objects before
train_dataset = GloveDataset(X_train, y_train, word_to_index, index_to_word, glove_embed_dict)
valid_dataset = GloveDataset(X_valid, y_valid, word_to_index, index_to_word, glove_embed_dict)
test_dataset = GloveDataset(X_test, y_test, word_to_index, index_to_word, glove_embed_dict)

# only the training data is shuffled
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
valid_dataloader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)

In [None]:
# instantiate neural network
# NOTE: rebinds `model`, which previously held the LogisticRegression used for the search
model = GloveClassifier(input_dim, hidden_dim, output_dim, num_layers)

In [None]:
# instantiate loss function and optimizer (Adam over the GloveClassifier parameters)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [None]:
model.to(device)

In [None]:
# train the model
train_losses, valid_losses = train_nn(
    model, 
    optimizer,
    criterion,
    train_dataloader,
    valid_dataloader,
    n_epochs = n_epochs
)

In [None]:
# plot the learning curve and evaluate the model
plot_learning_curve(train_losses, valid_losses) # training vs. validation loss per epoch
predictions, accuracy = evaluate_classnet(model, test_dataloader) 

# Accuracy: 85 %
# Precision: 0.7231365957686823
# Recall: 0.72528989139192
# F1 Score: 0.7212330021413701

### Pretrained Language Models solution 

GloVe and Word2vec可以处理语义但是不擅长处理上下文，这里尝试使用MobileBERT，一个bert的轻量版。

In [None]:
# define the model and config
# NOTE: batch_size, n_epochs and learning_rate rebind the values set for the GloVe model
max_len = 64 # max_length passed to the tokenizer (pad/truncate target)
batch_size = 32
n_epochs = 3
learning_rate = 1e-04

In [None]:
# instantiate tokenizer from the pretrained 'google/mobilebert-uncased' checkpoint
tokenizer = MobileBertTokenizer.from_pretrained('google/mobilebert-uncased')

In [None]:
# implement custom dataset
class MobileBertDataset(Dataset):
    """Dataset that tokenizes one text per item for the MobileBERT model.

    Args:
        texts: pandas Series of strings (accessed positionally through .iloc).
        labels: pandas Series of integer class labels (accessed through .iloc).
        tokenizer: tokenizer exposing `encode_plus`.
        max_len: length every token sequence is padded or truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, index):
        """Return a dict with 'ids', 'mask' and 'targets' long tensors."""
        text = self.texts.iloc[index]
        # collapse runs of whitespace
        text = " ".join(text.split())
        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            # use the length given to this dataset; the previous code read the
            # notebook-level `max_len` and silently ignored the constructor argument
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True,
            truncation=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'targets': torch.tensor(self.labels.iloc[index], dtype=torch.long)
        }

In [None]:
# implement model
class MobileBertClassNet(torch.nn.Module):
    """Pretrained MobileBERT encoder with a dropout + linear classification head.

    Args:
        n_classes: number of output logits.
    """

    def __init__(self, n_classes=6):
        super().__init__()
        self.l1 = MobileBertModel.from_pretrained("google/mobilebert-uncased")
        self.dropout = nn.Dropout(0.3)
        # Size the head from the encoder configuration instead of hard-coding
        # 512, so the head always matches the width of the encoder output.
        self.fc1 = nn.Linear(self.l1.config.hidden_size, n_classes)

    def forward(self, input_ids, attention_mask):
        """Return logits computed from the hidden state of the first token."""
        output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask)
        # first element of the encoder output: per-token hidden states
        hidden_state = output_1[0]
        # hidden state at sequence position 0 serves as the sentence representation
        pooled_output = hidden_state[:, 0]
        x = self.fc1(self.dropout(pooled_output))
        return x

In [None]:
def evaluate_mobilebert(model, dataloader, loss_fn=None):
    """Evaluate a MobileBERT classifier on `dataloader`.

    Batches are moved to the notebook-level `device`.

    Args:
        model: model called as `model(ids, mask)`, returning class scores.
        dataloader: iterable of dict batches with keys 'ids', 'mask', 'targets'.
        loss_fn: loss called as `loss_fn(outputs, labels)`. When omitted, the
            notebook-level `criterion` is used, which was the only option
            before and made the reported loss depend on hidden global state.

    Returns:
        dict with 'predictions' (list of predicted class indices), 'accuracy'
        (integer percentage, floor division) and 'loss' (mean loss per batch).
    """
    if loss_fn is None:
        # backward-compatible default: the loss defined at notebook level
        loss_fn = criterion
    with torch.no_grad():
        model.eval()
        total_loss = 0
        total_size = 0
        correct_pred = 0
        predictions = []
        for data in dataloader:
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            labels = data['targets'].to(device, dtype = torch.long)
            outputs = model(ids, mask)
            total_loss += loss_fn(outputs, labels).item()
            _, predicted = torch.max(outputs.data, 1) # (value, index)
            total_size += labels.size(0)
            correct_pred += (predicted == labels).sum().item()
            predictions.extend(predicted.tolist())
        accuracy = 100 * correct_pred // total_size
        loss = total_loss/len(dataloader)
        print(f'Accuracy : {accuracy} %\tLoss: {loss:.4f}')
        return  {'predictions' :predictions, 'accuracy': accuracy, 'loss': loss}

In [None]:
# implement train function
def train_mobilebert(model, criterion, optimizer, train_dataloader, valid_dataloader, n_epochs):
    """Train a MobileBERT classifier and record the loss per epoch.

    After every epoch the model is evaluated on `valid_dataloader` through
    evaluate_mobilebert, which also prints accuracy and loss. Batches are
    moved to the notebook-level `device`.

    Args:
        model: model called as `model(ids, mask)`.
        criterion: loss used for the training batches.
        optimizer: optimizer stepping the parameters of `model`.
        train_dataloader: iterable of dict batches ('ids', 'mask', 'targets').
        valid_dataloader: validation batches in the same format.
        n_epochs: number of passes over the training data.

    Returns:
        (train_losses, valid_losses): one entry per epoch each.
    """
    n_train_batches = len(train_dataloader)
    len_valid_dataloader = len(valid_dataloader)  # computed as before, not used below
    train_losses, valid_losses = [], []

    for _ in range(n_epochs):  # loop over the dataset
        model.train()
        running_loss = 0.0
        for batch in train_dataloader:
            ids = batch['ids'].to(device, dtype = torch.long)
            mask = batch['mask'].to(device, dtype = torch.long)
            labels = batch['targets'].to(device, dtype = torch.long)

            optimizer.zero_grad()  # zero the parameter gradients
            loss = criterion(model(ids, mask), labels)  # forward
            loss.backward()  # backward
            optimizer.step()  # optimize
            running_loss += loss.item()

        epoch_valid_loss = evaluate_mobilebert(model, valid_dataloader)['loss']
        train_losses.append(running_loss / n_train_batches)
        valid_losses.append(epoch_valid_loss)

    return train_losses, valid_losses 

In [None]:
# split the data
# The raw (uncleaned) text from the *_orig copies is paired with the encoded
# labels from train_df / test_df.
X = train_orig['text']
y = train_df['main_cat']
X_test = test_orig['text']
y_test = test_df['main_cat']
# Stratified subset: 1600 training and 160 validation examples.
# NOTE: this rebinds X_train / X_valid / y_train / y_valid from the earlier 85/15 split.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=1600, test_size=160, random_state=42, stratify=y)

In [None]:
# instantiate dataset and dataloader
train_dataset = MobileBertDataset(X_train, y_train, tokenizer, max_len)
valid_dataset = MobileBertDataset(X_valid, y_valid, tokenizer, max_len)
test_dataset = MobileBertDataset(X_test, y_test, tokenizer, max_len)

# The training loader is reshuffled every epoch, like the training loaders of
# the TF-IDF and GloVe models; it previously yielded the same batches in the
# same order in every epoch. Validation and test keep a fixed order.
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
valid_dataloader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [None]:
# instantiate model (loads the pretrained encoder) and move it to the selected device
model = MobileBertClassNet()
model.to(device)

# MobileBertClassNet(
#   (l1): MobileBertModel(
#     (embeddings): MobileBertEmbeddings(
#       (word_embeddings): Embedding(30522, 128, padding_idx=0)
#       (position_embeddings): Embedding(512, 512)
#       (token_type_embeddings): Embedding(2, 512)
#       (embedding_transformation): Linear(in_features=384, out_features=512, bias=True)
#       (LayerNorm): NoNorm()
#       (dropout): Dropout(p=0.0, inplace=False)
#     )
#     (encoder): MobileBertEncoder(
#       (layer): ModuleList(
#         (0-23): 24 x MobileBertLayer(
#           (attention): MobileBertAttention(
#             (self): MobileBertSelfAttention(
#               (query): Linear(in_features=128, out_features=128, bias=True)
#               (key): Linear(in_features=128, out_features=128, bias=True)
#               (value): Linear(in_features=512, out_features=128, bias=True)
#               (dropout): Dropout(p=0.1, inplace=False)
#             )
#             (output): MobileBertSelfOutput(
#               (dense): Linear(in_features=128, out_features=128, bias=True)
#               (LayerNorm): NoNorm()
#             )
#           )
# ...
#     (pooler): MobileBertPooler()
#   )
#   (dropout): Dropout(p=0.3, inplace=False)
#   (fc1): Linear(in_features=512, out_features=6, bias=True)
# )

In [None]:
# create optimizer and loss function
# The optimizer is given every model parameter; the encoder parameters are
# frozen in the next cell.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params =  model.parameters(), lr=learning_rate)

In [None]:
# freeze the pretrained encoder (model.l1): its parameters receive no gradients
for param in model.l1.parameters():
    param.requires_grad = False

In [None]:
# print the names of the parameters that are still trainable after freezing
for name, param in model.named_parameters():
    if param.requires_grad:
        print(name)

# fc1.weight
# fc1.bias

In [None]:
# train the model
train_losses, valid_losses = train_mobilebert(
    model,
    criterion,
    optimizer,
    train_dataloader,
    valid_dataloader,
    n_epochs
)

In [None]:
# plot the learning curve
plot_learning_curve(train_losses, valid_losses)

In [None]:
# calculate res on test data
results = evaluate_mobilebert(model, test_dataloader)

# Accuracy : 27 %	Loss: 863967.1914

### Handle Imbalanced Data

Data imbalance in classification, where certain classes are underrepresented, can impair model performance. The dataset has this issue. To counteract it, you can follow the methods below:

Generate synthetic data: Use techniques like SMOTE or advanced generative AI for new minority class samples.

Oversample the minority class: Increase minority class representation by replicating its samples.

Undersample the majority class: Reduce majority class samples to balance distribution.

Adjust loss function weights: Increase penalties for misclassifying minority classes to focus model learning.

Apply transfer learning: Start with models pre-trained on diverse datasets for better initial learning.

In [None]:
# print class distribution (relative frequency of every encoded class)
# NOTE: y_train is the most recent binding, i.e. the 1600-example subset
# created for the MobileBERT split
print(y_train.value_counts()/len(y_train))
print(y_test.value_counts()/len(y_test))

# main_cat
# 2    0.229375
# 3    0.224375
# 1    0.213125
# 5    0.164375
# 4    0.153125
# 0    0.015625
# Name: count, dtype: float64
# main_cat
# 1    0.276
# 5    0.226
# 2    0.188
# 4    0.162
# 3    0.130
# 0    0.018
# Name: count, dtype: float64

In [None]:
# implement get class weight
def get_class_weight(y_split):
    """Compute 'balanced' class weights for the labels in `y_split`.

    Args:
        y_split: array-like of class labels.

    Returns:
        (weight_dict, weight_list): the weights keyed by class label, and the
        same weights as a list in the sorted class order given by np.unique.
    """
    class_list = np.unique(y_split)
    class_weight_value = compute_class_weight(
        class_weight='balanced', classes=class_list, y=y_split
    )
    weight_dict = dict(zip(class_list, class_weight_value))
    weight_list = list(class_weight_value)
    return weight_dict, weight_list

In [None]:
# create class weight tensor
# The weights must describe the data the network is retrained on. `y_train`
# was rebound to the 1600-example MobileBERT subset, so the labels are taken
# from the dataset behind the TF-IDF training loader instead.
tfidf_train_labels = tfidf_train_dataloader.dataset.labels.numpy()
weight_dict, weight_list = get_class_weight(tfidf_train_labels)
class_weights = torch.tensor(weight_list, dtype=torch.float32)
class_weights = class_weights.to(device)

In [None]:
# instantiate loss function: cross-entropy with one weight per class
# (rebinds the notebook-level `criterion`)
criterion = nn.CrossEntropyLoss(weight=class_weights)

In [None]:
# retrain the TF-IDF network from scratch with the class-weighted loss
net = ClassificationNet()
optimizer = optim.Adam(net.parameters(), lr=0.002)
net.to(device)
# NOTE(review): n_epochs is whatever was assigned last in the kernel; the most
# recent assignment above is the MobileBERT config (3), while the first TF-IDF
# run used 4 — confirm which value is intended.
train_losses, valid_losses = train_nn(
    net, 
    optimizer, 
    criterion, 
    tfidf_train_dataloader,
    tfidf_valid_dataloader, 
    n_epochs = n_epochs
)

In [None]:
plot_learning_curve(train_losses, valid_losses)

In [None]:
# evaluate on the TF-IDF test loader (the name was misspelled `tfdif_...`, which raised NameError)
predictions, accuracy = evaluate_classnet(net, tfidf_test_dataloader)

# Accuracy: 83 %
# Precision: 0.8287213357331736
# Recall: 0.8267149068345564
# F1 Score: 0.8259239241514092

### Save model

In [None]:
# save only the learned parameters (state dict) of the retrained TF-IDF network
# NOTE: the target is a hard-coded absolute path
path = "/usercode/tfidf_model.pt"
torch.save(net.state_dict(), path)

In [None]:
# rebuild the architecture, then restore the saved parameters
loaded_model = ClassificationNet()
# map_location='cpu': the state dict was saved from a model that may live on
# CUDA/MPS, while `loaded_model` is created on the CPU; without it the load
# fails on a machine that lacks the original device.
loaded_model.load_state_dict(torch.load(path, map_location='cpu'))