### Import Libraries

In [1]:
import pandas as pd
import random
import numpy as np
import time
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras import models, layers, optimizers
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import ParameterGrid
from sklearn.metrics import multilabel_confusion_matrix
import gensim.downloader

from datetime import datetime
from packaging import version
from tensorflow import keras

%load_ext tensorboard

### Data Preprocessing

In [2]:
# Only run this cell if you want to generate a new validation set
df = pd.read_csv("trec/original/train.csv")
# TODO: check w group if we need this
# Remove duplicates from train
df.drop_duplicates(subset='text', keep='first', inplace=True)
# Randomly choose 500 rows to drop 
num_rows_to_drop = 500
np.random.seed(42)
rows_to_drop = np.random.choice(df.index, num_rows_to_drop, replace=False)
# print(rows_to_drop)
# Create a development dataframe from these 500 dropped rows
validation_df = df.loc[rows_to_drop].copy()

# Reset index of development dataframe and export to csv
validation_df.reset_index(drop=True, inplace=True)
validation_df.to_csv("trec/generated/validation.csv",index=None)

df_copy = df.copy(deep=True)
# Drop validation rows from original dataset, export as csv
df_copy.drop(rows_to_drop, inplace=True)
df_copy.reset_index(drop=True, inplace=True)
df_copy.to_csv("trec/generated/train.csv",index=None)

In [3]:
# Run this cell if you're generating a new validation set for sanity checking
def check_unique_texts(train_csv_file, validation_csv_file):
    train_df = pd.read_csv(train_csv_file)
    validation_df = pd.read_csv(validation_csv_file)

    train_texts = train_df['text']
    validation_texts = validation_df['text']

    common_texts = validation_texts[validation_texts.isin(train_texts)]

    if common_texts.empty:
        print("Validation set and train sets are unique")
    else:
        print("Common values found in the 'text' column:")
        print(common_texts)

train_csv_file = "trec/generated/train.csv"
validation_csv_file = "trec/generated/validation.csv"
check_unique_texts(train_csv_file, validation_csv_file)

Validation set and train sets are unique


In [4]:
# check for duplicates from test - remove if there are any
df = pd.read_csv("trec/original/test.csv")
df.drop_duplicates(subset='text', keep='first', inplace=True)
df.to_csv('trec/generated/test.csv',index=None)

In [5]:
train_df=pd.read_csv('trec/generated/train.csv')
val_df=pd.read_csv('trec/generated/validation.csv')
test_df=pd.read_csv('trec/generated/test.csv')

train_df.drop(columns='label-fine', inplace=True)
val_df.drop(columns='label-fine', inplace=True)
test_df.drop(columns='label-fine', inplace=True)

In [6]:
value_counts = train_df['label-coarse'].value_counts()
print(value_counts)

label-coarse
1    1132
3    1091
0    1051
4     781
5     746
2      80
Name: count, dtype: int64


### Assign 2 random classes to OTHERS

In [7]:
classes=train_df['label-coarse'].unique()
random.shuffle(classes)
print('Randomly selected OTHERS classes: ',classes[:2])

for i in classes[:2]:
    train_df['label-coarse']=train_df['label-coarse'].apply(lambda x:'OTHERS' if x==i else x)
    val_df['label-coarse']=val_df['label-coarse'].apply(lambda x:'OTHERS' if x==i else x)
    test_df['label-coarse']=test_df['label-coarse'].apply(lambda x:'OTHERS' if x==i else x)



Randomly selected OTHERS classes:  [5 1]


In [8]:
print(train_df['label-coarse'].unique())

[0 'OTHERS' 2 3 4]


In [9]:
mapping_dict = {item: idx for idx, item in enumerate(set(train_df['label-coarse'].unique()))}
train_df['label-coarse']=train_df['label-coarse'].apply(lambda x:mapping_dict[x])
val_df['label-coarse']=val_df['label-coarse'].apply(lambda x:mapping_dict[x])
test_df['label-coarse']=test_df['label-coarse'].apply(lambda x:mapping_dict[x])

print('Mapping Dictionary: ',mapping_dict)

Mapping Dictionary:  {0: 0, 2: 1, 3: 2, 4: 3, 'OTHERS': 4}


In [10]:
for df in [train_df,val_df,test_df]:
    print(df['label-coarse'].unique())

[0 4 1 2 3]
[2 0 4 3 1]
[3 4 2 0 1]


### word2vec

In [11]:
wv_model = gensim.downloader.load('word2vec-google-news-300')

max_sequence_length = 100  # Define the maximum sequence length

def tokenize_and_vectorize(texts, word_vectors, max_sequence_length):
    sequences = []
    for text in texts:
        words = text.split()
        word_vectors_list = []
        for word in words:
            if word in word_vectors:
                word_vectors_list.append(word_vectors[word])
        sequences.append(word_vectors_list)

    X = pad_sequences(sequences, maxlen=max_sequence_length, dtype='float32')

    return X

### Tokenize and vectorize features

In [12]:
process_features=lambda df:tokenize_and_vectorize(df['text'],wv_model,max_sequence_length)

X_train=process_features(train_df)
X_val=process_features(val_df)
X_test=process_features(test_df)

y_train = np.array(train_df['label-coarse'])
y_val = np.array(val_df['label-coarse'])
y_test = np.array(test_df['label-coarse'])

### Models

In [13]:
def lstm_max_pooling_model(units):
    model = tf.keras.Sequential([
    tf.keras.layers.LSTM(units, return_sequences=True),
    tf.keras.layers.MaxPooling1D(pool_size=2),  # Max pooling layer
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(5, activation='softmax')
    ])
    return model

def lstm_avg_pooling_model(units):
    model = tf.keras.Sequential([
    tf.keras.layers.LSTM(units, return_sequences=True),
    tf.keras.layers.GlobalAveragePooling1D(),  # Max pooling layer
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(5, activation='softmax')
    ])
    return model

def simple_lstm_model(units):
    model = tf.keras.Sequential([
    tf.keras.layers.LSTM(units),
    tf.keras.layers.Dense(5, activation='softmax')
    ])
    return model

def max_pooling_model(hidden_size,output_size):
    model = tf.keras.Sequential([
    tf.keras.layers.GlobalMaxPooling1D(),  # Max pooling over the sequence
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(hidden_size, activation='relu'),
    tf.keras.layers.Dense(output_size, activation='softmax')
    ])
    return model

def avg_pooling_model(hidden_size,output_size):
    model = tf.keras.Sequential([
    tf.keras.layers.GlobalAveragePooling1D(),  
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(hidden_size, activation='relu'),
    tf.keras.layers.Dense(hidden_size, activation='relu'),
    tf.keras.layers.Dense(output_size, activation='softmax')
    ])
    return model


### Find optimal hyperparamters

In [14]:
def find_optimal_hyperparameters(param_grid,model_type):
    # Initialize early stopping
    early_stopping = EarlyStopping(monitor='val_loss', patience=5, verbose=1)

    # Initialize best model and best loss
    best_model = None
    best_loss = np.inf

    # Iterate over the parameter grid
    for params in ParameterGrid(param_grid):
        units = params['hidden_size']

        if model_type=='lstm_max_pooling':
            model=lstm_max_pooling_model(units)
        elif model_type=='lstm_avg_pooling':
            model=lstm_avg_pooling_model(units)
        elif model_type=='simple_lstm':
            model=simple_lstm_model(units)
        elif model_type=='max_pooling':
            model=max_pooling_model(units,output_size=5)
        elif model_type=='avg_pooling':
            model=avg_pooling_model(units,output_size=5)

        model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

        # Train the model with early stopping
        history = model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=100, batch_size=32, callbacks=[early_stopping], verbose=0)

        # Evaluate the model on the validation set
        val_loss = history.history['val_loss'][-1]

        # Check if the current model is the best
        if val_loss < best_loss:
            best_model = model
            best_loss = val_loss

    start_time = time.time()
    
    logdir="logs/fit/" + datetime.now().strftime("%Y%m%d-%H%M%S")
    tensorboard_callback = keras.callbacks.TensorBoard(log_dir=logdir)
    
    # Train the best model on the full training set
    best_model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=100, batch_size=32, callbacks=[early_stopping, tensorboard_callback], verbose=1)
    predictions = best_model.predict(X_test)
    print(model.summary())

    print(f'Training Runtime: {time.time()-start_time:.2f} seconds')
    print(f'Train Accuracy: {max(history.history["accuracy"]) * 100:.2f}%')

    # Evaluate the best model on the test set
    test_loss, test_accuracy = best_model.evaluate(X_test, y_test)
    print(f'Test Accuracy: {test_accuracy * 100:.2f}%')
    
    return predictions

### Model Train

In [15]:
def train(param_grid,model_type):
    predictions=find_optimal_hyperparameters(param_grid,model_type)

    predicted_labels = np.argmax(predictions, axis=1)
    label_counts = np.bincount(predicted_labels)

    # Print the number of values for each label
    for label, count in enumerate(label_counts):
        print(f"Label {label}: {count} instances")

    confusion_matrix = multilabel_confusion_matrix(y_test, predicted_labels)
    confusion_matrix

In [16]:
# Define hyperparameter grid for tuning
param_grid = {
    'hidden_size': [64, 128, 256],  # hidden size
}

### LSTM with max pooling

In [17]:
train(param_grid,'lstm_max_pooling')

Epoch 11: early stopping
Epoch 11: early stopping
Epoch 12: early stopping
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 6: early stopping
Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_2 (LSTM)               (None, 100, 256)          570368    
                                                                 
 max_pooling1d_2 (MaxPoolin  (None, 50, 256)           0         
 g1D)                                                            
                                                                 
 flatten_2 (Flatten)         (None, 12800)             0         
                                                                 
 dense_2 (Dense)             (None, 5)                 64005     
                                                                 
Total params: 634373 (2.42 MB)
Trainable params: 634373 (2.42 MB)
Non-trainable p

In [18]:
%tensorboard --logdir logs

### LSTM with average pooling

In [20]:
train(param_grid,'lstm_avg_pooling')

Epoch 24: early stopping
Epoch 31: early stopping
Epoch 19: early stopping
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 8: early stopping
Model: "sequential_7"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_7 (LSTM)               (None, 100, 256)          570368    
                                                                 
 global_average_pooling1d_4  (None, 256)               0         
  (GlobalAveragePooling1D)                                       
                                                                 
 flatten_7 (Flatten)         (None, 256)               0         
                                                                 
 dense_7 (Dense)             (None, 5)                 1285      
                                                                 
Total params: 571653 (2.18 MB)
Trainable params: 571653 (

In [21]:
%tensorboard --logdir logs

### Simple LSTM

In [22]:
train(param_grid,'simple_lstm')

Epoch 12: early stopping
Epoch 12: early stopping
Epoch 12: early stopping
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 7: early stopping
Model: "sequential_10"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_10 (LSTM)              (None, 256)               570368    
                                                                 
 dense_10 (Dense)            (None, 5)                 1285      
                                                                 
Total params: 571653 (2.18 MB)
Trainable params: 571653 (2.18 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None
Training Runtime: 38.73 seconds
Train Accuracy: 98.14%
Test Accuracy: 93.00%
Label 0: 139 instances
Label 1: 11 instances
Label 2: 65 instances
Label 3: 108 instances
Label 4: 177 instances


In [23]:
%tensorboard --logdir logs

Reusing TensorBoard on port 6006 (pid 72661), started 0:13:27 ago. (Use '!kill 72661' to kill it.)

### Max pooling

In [24]:
train(param_grid,'max_pooling')

Epoch 18: early stopping
Epoch 21: early stopping
Epoch 11: early stopping
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 10: early stopping
Model: "sequential_13"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 global_max_pooling1d_2 (Gl  (None, 300)               0         
 obalMaxPooling1D)                                               
                                                                 
 flatten_10 (Flatten)        (None, 300)               0         
                                                                 
 dense_15 (Dense)            (None, 256)               77056     
                                                                 
 dense_16 (Dense)            (None, 5)                 1285      
                                                                 
Total params: 78341 (306.02 KB

In [25]:
%tensorboard --logdir logs

Reusing TensorBoard on port 6006 (pid 72661), started 0:16:20 ago. (Use '!kill 72661' to kill it.)

### Average pooling

In [26]:
train(param_grid,'avg_pooling')

Epoch 20: early stopping
Epoch 27: early stopping
Epoch 19: early stopping
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 9: early stopping
Model: "sequential_16"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 global_average_pooling1d_7  (None, 300)               0         
  (GlobalAveragePooling1D)                                       
                                                                 
 flatten_13 (Flatten)        (None, 300)               0         
                                                                 
 dense_23 (Dense)            (None, 256)               77056     
                                                                 
 dense_24 (Dense)            (None, 256)               65792     
                                                                 
 dense_25 (Dense)            (None, 5)      

In [27]:
%tensorboard --logdir logs

Reusing TensorBoard on port 6006 (pid 72661), started 0:17:08 ago. (Use '!kill 72661' to kill it.)

###  Averaging over word representations

In [28]:
def sentence_to_vectors(sentence, wv_model):
    words = sentence.split()
    vectors = [wv_model[word] if word in wv_model else np.zeros(300) for word in words]
    return np.mean(vectors, axis=0)

process_mean_features=lambda df:np.array([sentence_to_vectors(sentence,wv_model) for sentence in df['text']])

X_train=process_mean_features(train_df)
X_val=process_mean_features(val_df)
X_test=process_mean_features(test_df)

y_train = np.array(train_df['label-coarse'])
y_val = np.array(val_df['label-coarse'])
y_test = np.array(test_df['label-coarse'])

In [30]:
early_stopping = EarlyStopping(monitor='val_loss', patience=5, verbose=1)

model = Sequential()
model.add(Dense(128, input_dim=300, activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(5, activation='softmax')) 

model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

logdir="logs/fit/" + datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = keras.callbacks.TensorBoard(log_dir=logdir)

history = model.fit(X_train, y_train, validation_data=(X_val, y_val), batch_size=32, callbacks=[early_stopping,tensorboard_callback], epochs=100)

test_loss, test_accuracy = model.evaluate(X_test, y_test)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 13: early stopping


In [32]:
%tensorboard --logdir logs

Reusing TensorBoard on port 6006 (pid 72661), started 0:24:58 ago. (Use '!kill 72661' to kill it.)