In [1]:
import pandas as pd
import numpy as np
import pickle
import pandas as pd
import itertools
from collections import Counter
import numpy as np
from nltk import word_tokenize
from nltk.corpus import stopwords
from gensim.models import word2vec
from sklearn.linear_model import LogisticRegression
import os
import string

[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Load Preprocessed Data from preprocessed.ipynb

In [11]:
# Load the preprocessed DataFrame from a Pickle file
with open('df_train_preprocessed.pkl', 'rb') as file:
    df_train = pickle.load(file)

with open('df_test_preprocessed.pkl', 'rb') as file:
    df_test = pickle.load(file)

# Build Vocabulary

In [12]:
# A function used to build a vocabulary based on descending word frequencies
def build_vocab(sentences):
    # Build vocabulary
    word_counts = Counter(itertools.chain(*sentences))
    # Mapping from index to word
    vocabulary_inv = [x[0] for x in word_counts.most_common()]
    # Mapping from word to index
    vocabulary = {x: i for i, x in enumerate(vocabulary_inv)}
    return word_counts, vocabulary, vocabulary_inv

In [13]:
# tokenization
tagged_data = [word_tokenize(_d) for i, _d in enumerate(df_train["text"])]
# build vocabulary from tokenized data
word_counts, vocabulary, vocabulary_inv = build_vocab(tagged_data)
# use the above mapping to create input data
inp_data = [[vocabulary[word] for word in text] for text in tagged_data]

# Define x_train and x_test with 2 options

In [None]:
# option 1: Using keras default
from tensorflow import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

vocabulary = 10000
tokenizer = Tokenizer(num_words=vocabulary)

# Tokenize the preprocessed texts
texts_train = df_train['text'].tolist()
texts_test = df_test['text'].tolist()

tokenizer.fit_on_texts(texts_train)

# Convert texts to sequences
sequence_train = tokenizer.texts_to_sequences(texts_train)
sequence_test = tokenizer.texts_to_sequences(texts_test)

# Define the maximum number of words in a sequence
word_num = 350

# Pad sequences to ensure uniform length
x_train = pad_sequences(sequence_train, maxlen=word_num)
x_test = pad_sequences(sequence_test, maxlen=word_num)

In [15]:
# Option2: Manually set vacabulary
from tensorflow.keras.preprocessing.sequence import pad_sequences

word_num = 350

x_train = pad_sequences(inp_data, maxlen=word_num)

inp_data_test = [[vocabulary.get(word, 0) for word in word_tokenize(text)] 
                 for text in df_test["text"].tolist()]

x_test = pad_sequences(inp_data_test, maxlen=word_num)


# Change labels into numeric encode

In [17]:
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder

labels_train = df_train['label'].values
label_encoder = LabelEncoder()
labels_train_int = label_encoder.fit_transform(labels_train)
labels_train = to_categorical(labels_train_int, num_classes=10)

labels_train

array([[0., 1., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       ...,
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [20]:
from sklearn.model_selection import train_test_split

# Split the training data into training and validation sets
x_train, x_valid, labels_train, labels_valid = train_test_split(
    x_train, labels_train, test_size=0.20, random_state=42)

In [None]:
"""
for feature
array([[   0,    0,    0, ...,    3,   61,   74],
       [   0,    0,    0, ...,  324,  170,  238],
       [   0,    0,    0, ...,  561, 4440,  229],
       ...,
       [   0,    0,    0, ...,   16,    7,  438],
       [   0,    0,    0, ...,  340,  202,  144],
       [   0,    0,    0, ...,  588,  448, 4507]], dtype=int32)

for labels
array([[0, 0, 0, ..., 0, 1, 0],
       [0, 0, 0, ..., 0, 1, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [1, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 1, 0, 0]])
"""

# Tune the hyperparameter

In [21]:
vocabulary_size = len(vocabulary_inv)

In [24]:
from tensorflow.keras.optimizers import RMSprop, Adam
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from keras_tuner import HyperModel, BayesianOptimization

class LSTMHyperModel(HyperModel):
    def __init__(self, vocabulary, word_num, output_units):
        self.vocabulary = vocabulary
        self.word_num = word_num
        self.output_units = output_units

    def build(self, hp):

        embedding_dim = hp.Int('embedding_dim', min_value=16, max_value=128, step=16)
        
        model = Sequential()
        model.add(Embedding(input_dim=self.vocabulary, output_dim=embedding_dim, input_length=self.word_num))
        model.add(LSTM(units=hp.Int('units', min_value=32, max_value=512, step=32), return_sequences=False))
        model.add(Dense(units=self.output_units, activation='softmax'))

        hp_learning_rate = hp.Choice('learning_rate', values=[1e-3])
        model.compile(optimizer=Adam(learning_rate=hp_learning_rate),
                      loss='categorical_crossentropy',
                      metrics=['accuracy'])
        return model

"""
    def build(self, hp):
        
        embedding_dim = hp.Int('embedding_dim', min_value=16, max_value=128, step=16)
        
        model = Sequential()
        model.add(Embedding(self.vocabulary, embedding_dim, input_length=self.word_num))
        model.add(LSTM(units=hp.Int('units_1', min_value=32, max_value=512, step=32), return_sequences=True))
        #model.add(LSTM(units=hp.Int('units_2', min_value=16, max_value=512, step=32), return_sequences=False))
        model.add(Dense(self.output_units, activation='softmax'))

        hp_learning_rate = hp.Choice('learning_rate', values=[1e-3])

        model.compile(optimizer=RMSprop(learning_rate=hp_learning_rate),
                      loss='categorical_crossentropy',
                      metrics=['accuracy'])
        return model
"""
    

hypermodel = LSTMHyperModel(vocabulary= vocabulary_size, word_num=350, output_units=10)

tuner = BayesianOptimization(
    hypermodel,
    objective='val_accuracy',
    max_trials=3,
    num_initial_points=2,
    directory='my_dir',
    project_name='LSTM+Linear'
)


In [25]:
from tensorflow.keras.callbacks import EarlyStopping

early_stopping = EarlyStopping(
    monitor='val_accuracy',  
    patience=3,              
    restore_best_weights=True
)

tuner.search(
    x_train, labels_train,
    epochs=10,
    validation_data=(x_valid, labels_valid),
    callbacks=[early_stopping]  
)

Trial 2 Complete [00h 34m 03s]
val_accuracy: 0.7485736012458801

Best val_accuracy So Far: 0.7485736012458801
Total elapsed time: 00h 45m 24s

Search: Running Trial #3

Value             |Best Value So Far |Hyperparameter
128               |128               |embedding_dim
416               |384               |units
0.001             |0.001             |learning_rate

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10

KeyboardInterrupt: 

In [26]:
best_model = tuner.get_best_models(num_models=1)[0]
best_hyperparameters = tuner.get_best_hyperparameters()[0]

print('Best model summary:')
best_model.summary()
print('Best hyperparameters:', best_hyperparameters.values)

Best model summary:
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 350, 128)          5927552   
                                                                 
 lstm (LSTM)                 (None, 384)               787968    
                                                                 
 dense (Dense)               (None, 10)                3850      
                                                                 
Total params: 6719370 (25.63 MB)
Trainable params: 6719370 (25.63 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Best hyperparameters: {'embedding_dim': 128, 'units': 384, 'learning_rate': 0.001}


# Train the network using 5 fold CV

In [None]:
"""
from sklearn.model_selection import KFold
import numpy as np

# Define the K-Fold Cross-Validator
num_folds = 5
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=42)

from tensorflow.keras.callbacks import EarlyStopping

early_stopping = EarlyStopping(monitor='val_accuracy', patience=5, restore_best_weights=True)

fold_no = 1
acc_per_fold = []
loss_per_fold = []

for train, test in kfold.split(x_train, labels_train):
    # Clone the best model
    model = best_model  # Assuming `best_model` is already defined

    # Generate a print
    print(f'Training for fold {fold_no} ...')

    # Fit data to model
    history = model.fit(x_train[train], labels_train[train],
                        batch_size=32,
                        epochs=10,
                        verbose=1,
                        validation_split=0.2,  # Consider adjusting this value
                        callbacks=[early_stopping])

    # Generate generalization metrics
    scores = model.evaluate(x_train[test], labels_train[test], verbose=0)
    print(f'Score for fold {fold_no}: {model.metrics_names[0]} of {scores[0]}; {model.metrics_names[1]} of {scores[1]*100}%')
    acc_per_fold.append(scores[1] * 100)
    loss_per_fold.append(scores[0])

    # Increase fold number
    fold_no = fold_no + 1

#== Provide average scores ==
print('------------------------------------------------------------------------')
print('Score per fold')
for i in range(len(acc_per_fold)):
  print(f'> Fold {i+1} - Loss: {loss_per_fold[i]} - Accuracy: {acc_per_fold[i]}%')
print('------------------------------------------------------------------------')
avg_loss = np.mean(loss_per_fold)
avg_acc = np.mean(acc_per_fold)
print(f'Average loss across all folds: {avg_loss}')
print(f'Average accuracy across all folds: {avg_acc}%')
"""

In [None]:
"""
best_model.fit(x_train, labels_train, epochs=10, batch_size=32, 
               validation_data=(x_valid, labels_valid), 
               callbacks=[early_stopping], 
               verbose=1)
"""

# Predict the test set. and create CSV

In [23]:
predictions = best_model.predict(x_test)
predicted_classes = np.argmax(predictions, axis=1)
predicted_labels = label_encoder.inverse_transform(predicted_classes)



In [24]:
data_path = "/home/jovyan/Documents/MSBA/unstrucured_data/mgta-415-winter2024/"
dic = {"Id": [], "Predicted": []}
for i, pred in enumerate(predicted_labels):
    dic["Id"].append(i)
    dic["Predicted"].append(pred)

dic_df = pd.DataFrame.from_dict(dic)
dic_df.to_csv(data_path + "predicted_nn.csv", index=False)