# In this second version of the LSTM I will:

      1° balance the train and dev data
      2° give equal weights to all target classes
      3° increase the number of training epochs
      4° decrease the training batch size

In [None]:
import pandas as pd
import numpy as np
import nltk
import seaborn as sns
import time
import re
import random

# Seed the random number generators so that runs are repeatable.
# random.seed() only covers Python's built-in `random` module. NumPy's global
# generator is seeded as well because scikit-learn's `resample` draws from it
# whenever it is called without an explicit `random_state`.
random.seed(123)
np.random.seed(123)

# NLTK regex tokenizer. The pattern matches, in order of preference: one word
# character followed by an apostrophe, a run of word characters, or a single
# character that is neither a word character nor whitespace.
# NOTE: the name `tokenizer` is rebound to a Keras `Tokenizer` in the
# tokenization cell further down in this file.
tokenizer = nltk.RegexpTokenizer(r'''\w'|\w+|[^\w\s]''')

import tensorflow as tf
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Reshape, SpatialDropout1D
# from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.losses import categorical_crossentropy
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from sklearn.utils import class_weight



In [None]:
import nltk

# Fetch the NLTK resources, in this order:
#   'punkt'     - data for the sentence and word tokenizer
#   'stopwords' - stop-word lists for several languages
for resource_name in ('punkt', 'stopwords'):
    nltk.download(resource_name)

In [None]:
# Load the three pre-cleaned splits; the first CSV column becomes the index.
train_data, dev_data, test_data = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        'cleaned_data/train_cleaned.csv',
        'cleaned_data/dev_cleaned.csv',
        'cleaned_data/test_cleaned.csv',
    )
)

In [None]:
# Report (rows, columns) of each split, one per line.
print(
    f"train_data shape {train_data.shape}",
    f"dev_data shape {dev_data.shape}",
    f"test_data shape {test_data.shape}",
    sep="\n",
)

In [None]:
# GET ONLY 1/4 OF TRAIN AND DEV DATA DUE TO COMPUTATION RESOURCES LIMITATIONS
# train_data = train_data.head(200)
# dev_data = dev_data.head(50)


# Balancing label classes

In [None]:
# Class distribution of the training labels, before the balancing step below.
train_data['note'].value_counts()

In [None]:
# Class distribution of the dev labels, before the balancing step below.
dev_data['note'].value_counts()

In [None]:
# Balance train and dev data: every class is cut down to the size of the
# smallest class of its own split.

from sklearn.utils import resample, class_weight

# Fixed seed so the balanced subsets are the same on every run.
RESAMPLE_SEED = 123


def _undersample_per_class(frame, n_per_class, label_col='note', seed=RESAMPLE_SEED):
    """Return `n_per_class` distinct rows for every class found in `label_col`.

    Rows are drawn WITHOUT replacement: this is an under-sampling step, so
    drawing with replacement would only duplicate some rows and silently
    discard others (the minority class would become a bootstrap of itself).

    Args:
        frame: DataFrame holding the samples and their labels.
        n_per_class: number of rows kept per class; must not exceed the size
            of the smallest class.
        label_col: name of the label column.
        seed: random_state forwarded to `resample` for reproducibility.

    Returns:
        A DataFrame with the per-class samples concatenated in order of first
        appearance of each class.

    Raises:
        ValueError: raised by `resample` when a class has fewer than
            `n_per_class` rows.
    """
    return pd.concat([
        resample(
            frame[frame[label_col] == label],
            replace=False,
            n_samples=n_per_class,
            random_state=seed,
        )
        for label in frame[label_col].unique()
    ])


# Size of the minority class in each split.
train_minority_class_size = train_data['note'].value_counts().min()
dev_minority_class_size = dev_data['note'].value_counts().min()

# Under-sample the majority classes down to the minority class size.
train_data = _undersample_per_class(train_data, train_minority_class_size)
dev_data = _undersample_per_class(dev_data, dev_minority_class_size)


In [None]:
# Class distribution of the training labels after balancing.
train_data['note'].value_counts()

In [None]:
# Class distribution of the dev labels after balancing.
dev_data['note'].value_counts()

In [None]:
# Force the review text to plain strings before tokenization.
# Missing comments are replaced by an empty string first: a bare astype(str)
# turns NaN into the literal word "nan", which would then be tokenized as if
# the reviewer had written it.
train_data['commentaire'] = train_data['commentaire'].fillna('').astype(str)
dev_data['commentaire'] = dev_data['commentaire'].fillna('').astype(str)
test_data['commentaire'] = test_data['commentaire'].fillna('').astype(str)

# Data Split

In [None]:
# Model inputs: the review text column of each split.
X_train, X_dev, X_test = (
    split['commentaire'] for split in (train_data, dev_data, test_data)
)

# Targets: the 'note' column. No target is taken from test_data here.
y_train, y_dev = train_data['note'], dev_data['note']

In [None]:
# Distinct label values present in the training targets.
y_train.unique()

In [None]:
# Display the training target Series.
y_train

# Y one hot encoding

In [None]:
# One-hot encode the training labels into 10 columns and show the first rows.
# NOTE(review): to_categorical expects integer class indices in [0, 9]. The
# commented-out export code at the end of this file maps a predicted index
# back with (index + 1) / 2, so 'note' may be a half-point rating -- confirm
# that the cleaned CSV already stores class indices and not raw ratings.
y_train_one_hotencoding = to_categorical(y_train, num_classes=10)
print(y_train_one_hotencoding[:10])

In [None]:
# One-hot encode the dev labels into 10 columns and show the first rows.
# NOTE(review): to_categorical expects integer class indices in [0, 9] --
# confirm that 'note' in the cleaned dev CSV is stored as a class index.
y_dev_one_hotencoding = to_categorical(y_dev, num_classes=10)
print(y_dev_one_hotencoding[:10])

In [None]:
# Number of distinct review texts in the training inputs.
len(X_train.unique())

# Tokenization

In [None]:
# Fixed sequence length fed to the model and vocabulary size kept by the tokenizer.
max_len = 180
n_most_common_words = 10000

# Build the word index from the training texts only.
tokenizer = Tokenizer(num_words=n_most_common_words)
tokenizer.fit_on_texts(X_train.tolist())

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

# Map every text to its sequence of word indices, then bring all sequences
# to exactly max_len entries.
X_train = pad_sequences(tokenizer.texts_to_sequences(X_train), maxlen=max_len)
X_dev = pad_sequences(tokenizer.texts_to_sequences(X_dev), maxlen=max_len)

# Save tokenizer

In [None]:
import pickle

# Persist the fitted tokenizer to disk so that its word index can be reloaded later.
with open('Lstm_tokenizer.pkl', 'wb') as pickle_out:
    pickle.dump(tokenizer, pickle_out)


# Training

In [None]:
# Training hyper-parameters.
batch_size = 32   # samples per gradient update
# batch_size = 128
emb_dim = 128     # size of each word embedding vector
epochs = 20       # upper bound on the number of training epochs

In [None]:
# Sanity check: shapes of the inputs and one-hot targets for train and dev.
print(tuple(
    arr.shape
    for arr in (X_train, y_train_one_hotencoding, X_dev, y_dev_one_hotencoding)
))


In [None]:
# Model definition: embedding -> three stacked LSTMs -> dense classifier head
# ending in a 10-way softmax.
model = Sequential([
    Embedding(n_most_common_words, emb_dim, input_length=X_train.shape[1]),
    # The first two LSTMs return the full sequence so the next LSTM can consume it;
    # the last one returns only its final output for the dense layers.
    LSTM(128, dropout=0.2, recurrent_dropout=0.2, return_sequences=True),
    LSTM(256, dropout=0.2, recurrent_dropout=0.2, return_sequences=True),
    LSTM(128, dropout=0.2, recurrent_dropout=0.2, return_sequences=False),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(32, activation='relu'),
    Dense(10, activation='softmax'),
])


# IMPROVEMENT
# Give every class a 'balanced' weight (inversely proportional to its frequency
# in y_train; when the classes have equal counts all weights are equal).
class_labels = np.unique(y_train)
class_weights = class_weight.compute_class_weight('balanced', classes=class_labels, y=y_train)

# Convert class weights to a dictionary for use with Keras.
# The keys must be the class indices themselves, not the position of the label
# in `class_labels`: enumerate() would attach weights to the wrong classes as
# soon as the labels are not exactly 0..n-1 (e.g. a class missing from y_train).
# NOTE(review): int() assumes the labels are integral class indices, which is
# also what to_categorical requires -- confirm against the cleaned data.
class_weight_dict = {
    int(label): float(weight)
    for label, weight in zip(class_labels, class_weights)
}


# Configure training (Adam optimizer, categorical cross-entropy loss, accuracy
# metric), then print the layer-by-layer summary.
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
model.summary()

# Callbacks, both driven by validation accuracy.
filepath = 'Lstm_best_model_improved.hdf5'

# Stop once val_accuracy has not improved for 3 consecutive epochs and roll
# the model back to its best weights.
early_stopping = EarlyStopping(
    restore_best_weights=True,
    patience=3,
    monitor='val_accuracy',
)

# Write the model to `filepath` only when val_accuracy reaches a new maximum.
checkpoint = ModelCheckpoint(
    filepath=filepath,
    mode='max',
    save_best_only=True,
    monitor='val_accuracy',
    verbose=1,
)

callbacks = [checkpoint, early_stopping]

# Fit the model, validating on the dev split after each epoch, and time the run.
start_time = time.time()
history = model.fit(
    X_train,
    y_train_one_hotencoding,
    validation_data=(X_dev, y_dev_one_hotencoding),
    epochs=epochs,
    batch_size=batch_size,
    class_weight=class_weight_dict,
    callbacks=callbacks,
    verbose=1,
)
execution_time = time.time() - start_time

# Wall-clock duration formatted as HH:MM:SS.
print('Execution time:', time.strftime("%H:%M:%S", time.gmtime(execution_time)))

# Plot and save train and validation accuracy/loss

In [None]:
import matplotlib.pyplot as plt

# Per-epoch metrics recorded by model.fit.
acc = history.history['accuracy']
val_acc = history.history['val_accuracy']
loss = history.history['loss']
val_loss = history.history['val_loss']

# X axis: one tick per epoch actually run. It is kept under its own name on
# purpose: assigning it to `epochs` would overwrite the epoch-count
# hyper-parameter with a range object, and re-running the training cell would
# then pass that range to model.fit(epochs=...).
epoch_axis = range(1, len(acc) + 1)

# Plot accuracy
plt.plot(epoch_axis, acc, 'bo', label='Training acc')
plt.plot(epoch_axis, val_acc, 'b', label='Validation acc')
plt.title('Training and validation accuracy')
plt.legend()
plt.savefig('accuracy_plot.png')  # Save the plot as an image

# Start a new figure so the loss curves are not drawn over the accuracy plot
plt.figure()

# Plot loss
plt.plot(epoch_axis, loss, 'bo', label='Training loss')
plt.plot(epoch_axis, val_loss, 'b', label='Validation loss')
plt.title('Training and validation loss')
plt.legend()
plt.savefig('loss_plot.png')  # Save the plot as an image


# Test and saving results

In [None]:
# # Charger le tokenizer depuis le fichier
# with open('Lstm_tokenizer.pkl', 'rb') as tokenizer_file:
#     loaded_tokenizer = pickle.load(tokenizer_file)

In [None]:
# from tensorflow.keras.models import load_model

# # model = load_model('MLP_Best_One.hdf5'),this command doesn't work for me, i had to change the saved model path

# # modelpath = "C:/trained_Models/Lstm_best_model_improved.hdf5"


# # Load the model with compile=False
# Lstm_best_model = load_model(modelpath)

# X_test = test_data['commentaire']
# # X_test = tokenizer.texts_to_sequences(X_test)
# X_test = loaded_tokenizer.texts_to_sequences(X_test)  # was X_dev: the test texts must be tokenized here, not the dev set
# X_test = pad_sequences(X_test, maxlen=max_len)

# predictions = Lstm_best_model.predict(X_test)
# argmax_predictions = np.argmax(predictions,axis =1)

# print("argmax_predictions: ",argmax_predictions)
 
# # # generate the plateform test data format        
# with open("LSTM_ID_Prediction_improved.txt", "w") as f:
#     for i in range(len(test_data['review_id'])):
#         prediction = (argmax_predictions[i] + 1) / 2
#         line = f"{test_data['review_id'].iloc[i]} {str(prediction).replace('.', ',')}\n"
#         f.write(line)

      