In [None]:
import pandas as pd
import numpy as np
import nltk
import seaborn as sns
import time
import re
import random
from keras import regularizers


# Seed the random number generators so the sampling steps are repeatable.
# `random.seed` only covers the standard-library generator; NumPy's global
# generator is seeded as well because scikit-learn's `resample` falls back to
# it when it is called without an explicit `random_state`.
random.seed(123)
np.random.seed(123)

# NLTK regexp tokenizer. The pattern matches, in order of preference: a word
# character followed by an apostrophe, a run of word characters, or a single
# character that is neither a word character nor whitespace.
# Note: the name `tokenizer` is rebound to a Keras Tokenizer further down.
tokenizer = nltk.RegexpTokenizer(r'''\w'|\w+|[^\w\s]''')

import tensorflow as tf
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Dense,Dropout,Embedding,LSTM,BatchNormalization
# from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.losses import categorical_crossentropy
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping


In [None]:
import nltk

# Download the data required by NLTK's sentence and word tokenizer
nltk.download('punkt')

# Download the stopword lists (available for several languages)
nltk.download('stopwords')

In [None]:
# Load the three cleaned splits; the first CSV column is used as the index.
train_data, dev_data, test_data = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        'cleaned_data/train_cleaned.csv',
        'cleaned_data/dev_cleaned.csv',
        'cleaned_data/test_cleaned.csv',
    )
)

In [None]:
# GET ONLY 1/4 OF TRAIN AND DEV DATA DUE TO COMPUTATION RESOURCES LIMITATIONS
# train_data = train_data.head(200)
# dev_data = dev_data.head(50)


In [None]:
# Number of training rows per distinct 'note' value
train_data['note'].value_counts()

In [None]:
# Number of dev rows per distinct 'note' value
dev_data['note'].value_counts()

In [None]:
# Balance the train and dev sets: every class is reduced to the size of the
# smallest class of its split.

from sklearn.utils import resample, class_weight


# Size of the least-represented class in each split
train_minority_class_size = min(train_data['note'].value_counts())
dev_minority_class_size = min(dev_data['note'].value_counts())

# Downsample each class to the minority size. Rows are drawn WITHOUT
# replacement so no review is duplicated (the minority class is kept whole),
# and with a fixed random_state so the selection is repeatable across runs.
train_data = pd.concat([
    resample(
        train_data[train_data['note'] == note],
        replace=False,
        n_samples=train_minority_class_size,
        random_state=123,
    )
    for note in train_data['note'].unique()
])

dev_data = pd.concat([
    resample(
        dev_data[dev_data['note'] == note],
        replace=False,
        n_samples=dev_minority_class_size,
        random_state=123,
    )
    for note in dev_data['note'].unique()
])


In [None]:
# Class counts of the training set after the per-class resampling
train_data['note'].value_counts()

In [None]:
# Class counts of the dev set after the per-class resampling
dev_data['note'].value_counts()

In [None]:
# Force the 'commentaire' column of every split to the string type.
for frame in (train_data, dev_data, test_data):
    frame['commentaire'] = frame['commentaire'].astype(str)

In [None]:
# Plain Python lists of the comments of each split, in row order.
train_comments_list, dev_comments_list, test_comments_list = (
    frame['commentaire'].tolist()
    for frame in (train_data, dev_data, test_data)
)

In [None]:
# Full corpus: train comments, then dev comments, then test comments.
corpus_all_comments = [*train_comments_list, *dev_comments_list, *test_comments_list]

In [None]:
# Preview only the first few comments: displaying the whole list would write
# every comment of the corpus into the cell output.
corpus_all_comments[:5]

# Corpus Tokenization

In [None]:
from nltk.tokenize import word_tokenize

# Tokenize every comment of the corpus with NLTK's word_tokenize
# (default language settings); `tokens` is a list of token lists.
tokens = [word_tokenize(sentence) for sentence in corpus_all_comments]

# Preview the first few token lists only, instead of dumping the whole corpus
# into the cell output.
tokens[:3]

# W2V_Skip-gram

In [None]:
#  objectif : Prédire le contexte (les mots qui entourent) à partir d'un mot cible.
#  Utilisation : Performant pour des corpus textuels où le sens des mots est relativement stable sur de courtes fenêtres.

In [None]:
# from gensim.models import Word2Vec

# # Modèle Word2Vec avec Skip-gram
# Word2Vec_model_skipgram = Word2Vec(sentences=tokens, vector_size=100, window=5, sg=1, min_count=1, epochs=10)

# # Sauvegarde du modèle
# Word2Vec_model_skipgram.save("Word2Vec_Skipgram.model")


In [None]:
#!pip install gensim

# W2V_CBOW

In [None]:
#  Prédire un mot cible à partir de son contexte (les mots qui l'entourent).
#  Utilisation : Performant pour des corpus textuels où le sens des mots est relativement stable sur de courtes fenêtres. 

In [None]:
from gensim.models import Word2Vec

# Train a Word2Vec model on the tokenized corpus: 100-dimensional vectors,
# every token kept (min_count=1), 10 epochs, sg=0 (the CBOW variant announced
# by this section's heading).
Word2Vec_model_cbow = Word2Vec(
    sentences=tokens,
    vector_size=100,
    sg=0,
    min_count=1,
    epochs=10,
)

# Persist the trained model to disk.
Word2Vec_model_cbow.save("Word2Vec_CBOW.model")

In [None]:
# Reload the CBOW model from the "Word2Vec_CBOW.model" file written by the training cell
Word2Vec_CBOW = Word2Vec.load("Word2Vec_CBOW.model")

In [None]:
# Sanity check: fetch the NumPy embedding vector of the word 'film'
# (the lookup fails if the word is not in the model's vocabulary)
vector = Word2Vec_CBOW.wv['film']
vector

In [None]:
def limit_comment_length(comment, max_words=100):
    """Truncate a comment to its first `max_words` whitespace-separated words.

    Args:
        comment: Text to truncate.
        max_words: Maximum number of words to keep (default 100).

    Returns:
        The kept words joined by single spaces. Leading, trailing and
        repeated whitespace of the input is therefore collapsed.
    """
    return ' '.join(comment.split()[:max_words])

# Truncate every comment of the train and dev lists to its first 100 words.
# No comment is filtered out: these lists are stored further down as a new
# column of train_data / dev_data, so each list must keep exactly one entry
# per row, in the original order. Dropping empty comments here would make the
# lengths disagree and the column assignment fail.
train_list = [limit_comment_length(comment) for comment in train_comments_list]
dev_list = [limit_comment_length(comment) for comment in dev_comments_list]


In [None]:
def count_words(message):
    """Return the number of whitespace-separated words in `message`."""
    return len(message.split())

# Example usage: word count of the first truncated training comment
result = count_words(train_list[0])

print("Nombre de mots dans le message :", result)


In [None]:
# Store the truncated comments as a new column; each list must have exactly
# one entry per row of the corresponding frame.
train_data['nouvelle commentaire'] = train_list
dev_data['nouvelle commentaire']   = dev_list

In [None]:
# Inspect the truncated training comments
train_data['nouvelle commentaire']

In [None]:
# train data size: 665962

In [None]:
# Model inputs: the truncated comments of each split.
X_train = train_data['nouvelle commentaire']
X_dev = dev_data['nouvelle commentaire']

# NOTE(review): the vocabulary cap (num_words) is set to the number of
# DISTINCT TRAINING COMMENTS, not to a number of words - confirm this is the
# intended limit.
distinct_train_comments = len(X_train.unique())
tokenizer = Tokenizer(num_words=distinct_train_comments)

# Fit the tokenizer on the training comments only.
tokenizer.fit_on_texts(X_train.tolist())

# Word -> integer id mapping learned from the training comments.
word_index = tokenizer.word_index

# Convert each comment to its sequence of word ids, then force every sequence
# to length 100 (pad_sequences defaults are used for padding/truncation).
X_train, X_dev = (
    pad_sequences(tokenizer.texts_to_sequences(texts), maxlen=100)
    for texts in (X_train, X_dev)
)

# save tokenizer

In [None]:
import pickle

# Save the fitted tokenizer to 'Cnn_tokenizer.pkl' so the same word -> id
# mapping can be reloaded later (see the commented-out loading cell below).
with open('Cnn_tokenizer.pkl', 'wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)


In [None]:
# Preview the first entries of the word -> id mapping only; displaying the
# whole dictionary would write the entire vocabulary into the cell output.
dict(list(word_index.items())[:20])

In [None]:
# Inspect the padded id sequences of the training set
X_train

In [None]:
# Size of the embedding table: one row per word_index entry plus one extra
# row (presumably for id 0, which the padding step uses - confirm).
vocab_size = len(word_index) + 1
vocab_size

In [None]:
import numpy as np

# Return a matrix of vocabulary word embeddings looked up in a Word2Vec model
def create_embedding_matrix(tokenizer, word2vec_model, embedding_dim):
    """Build the embedding matrix for the tokenizer's vocabulary.

    Args:
        tokenizer: Object exposing a `word_index` mapping of word -> row id.
        word2vec_model: Object exposing `wv`, indexable by word and raising
            KeyError for unknown words.
        embedding_dim: Number of columns of the matrix (vector length).

    Returns:
        A NumPy array of shape (len(word_index) + 1, embedding_dim). Row `i`
        holds the vector of the word whose id is `i`; rows of words missing
        from the Word2Vec model, and any row with no word, stay all-zero.
    """
    vocabulary = tokenizer.word_index
    matrix = np.zeros((len(vocabulary) + 1, embedding_dim))
    keyed_vectors = word2vec_model.wv

    for token, row in vocabulary.items():
        try:
            token_vector = keyed_vectors[token]
        except KeyError:
            # Unknown to Word2Vec: leave this row as zeros.
            continue
        else:
            matrix[row] = token_vector

    return matrix

In [None]:
# Build the 100-dimensional embedding matrix for the Keras tokenizer's
# vocabulary from the reloaded CBOW model, then display it.
embedding_matrix = create_embedding_matrix(tokenizer, Word2Vec_CBOW, 100)
embedding_matrix

In [None]:
# Training targets: the 'note' column as an integer NumPy array.
# NOTE(review): the integer cast truncates any fractional value - this assumes
# 'note' already holds integer class ids; confirm against the cleaning step.
y_train = train_data['note'].to_numpy().astype(int)
y_train

In [None]:
# Dev targets: the 'note' column as an integer NumPy array.
# NOTE(review): the integer cast truncates any fractional value - this assumes
# 'note' already holds integer class ids; confirm against the cleaning step.
y_dev = dev_data['note'].to_numpy().astype(int)
y_dev

# CNN + Word2Vec_CBOW

In [None]:
# Number of rating classes (matches the 10 units of the softmax output layer defined below)
classes_num = 10

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import Dense, Dropout, Flatten, Conv1D, GlobalMaxPool1D

# Dropout probability shared by both Dropout layers
dropout_rate = 0.25

# CNN text classifier on top of the pre-trained Word2Vec embeddings.
model = Sequential([
    Embedding(
        input_dim=vocab_size,         # Number of rows of the embedding table (vocabulary size)
        output_dim=100,               # Each word id is mapped to a 100-dimensional vector
        input_length=100,             # Every input sequence is padded/truncated to 100 ids
        weights=[embedding_matrix],   # Start from the Word2Vec-based embedding matrix
        trainable=True                # Embeddings keep being updated during training
    ),
    # The L2 factor is passed positionally: the legacy `l=` keyword is not
    # accepted by every Keras version, the positional form is.
    Conv1D(filters=128, kernel_size=5, activation='relu',
           kernel_regularizer=regularizers.l2(0.01)),
    BatchNormalization(),
    GlobalMaxPool1D(),
    Dropout(dropout_rate),
    Dense(64, activation='relu'),
    BatchNormalization(),
    Dropout(dropout_rate),
    # One output unit per class; uses the `classes_num` constant (10) rather
    # than a second hard-coded literal.
    Dense(classes_num, activation='softmax'),
])



# IMPROVEMENT
# Per-class weights computed with scikit-learn's 'balanced' heuristic over the
# labels actually present in y_train.
observed_labels = np.unique(y_train)
class_weights = class_weight.compute_class_weight('balanced', classes=observed_labels, y=y_train)
# Keras expects a {class position: weight} dictionary; positions follow the
# sorted order of the observed labels.
class_weight_dict = {position: weight for position, weight in enumerate(class_weights)}


# Compile the model with the Adam optimizer.
# The sparse categorical cross-entropy loss takes integer class ids as
# targets; per the original author's note, the marks 0.5, 1, ..., 5 were
# converted to the ids 0, 1, ..., 9 for that reason.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
model.summary()




# File that receives the best model seen during training.
# NOTE(review): the file name says "SkipGram", but the embedding matrix used
# by this model is built from the CBOW Word2Vec model - confirm the name.
filepath = 'CNN_Word2Vec_SkipGram_Best_One_improved.hdf5'

# Save the model each time the validation accuracy reaches a new maximum.
checkpoint = ModelCheckpoint(
    filepath=filepath,
    monitor='val_accuracy',
    mode='max',
    save_best_only=True,
    verbose=1,
)

# Stop once validation accuracy has not improved for 2 consecutive epochs and
# roll the model back to its best weights.
early_stopping = EarlyStopping(
    monitor='val_accuracy',
    patience=2,
    restore_best_weights=True,
)

callbacks = [checkpoint, early_stopping]

# Training settings: up to 20 epochs, mini-batches of 32, validation on the
# dev split, with the checkpoint / early-stopping callbacks and class weights.
fit_options = dict(
    validation_data=(X_dev, y_dev),
    epochs=20,
    batch_size=32,
    verbose=1,
    callbacks=callbacks,
    class_weight=class_weight_dict,
)

# Time the whole training run (wall clock).
start_time = time.time()
history = model.fit(X_train, y_train, **fit_options)
execution_time = time.time() - start_time

# Report the elapsed time as HH:MM:SS.
elapsed_hms = time.strftime("%H:%M:%S", time.gmtime(execution_time))
print('Execution time:', elapsed_hms)




# save train and validation accuracy/loss

In [None]:
import matplotlib.pyplot as plt

# Per-epoch metrics recorded by model.fit
acc = history.history['accuracy']
val_acc = history.history['val_accuracy']
loss = history.history['loss']
val_loss = history.history['val_loss']

# Epoch numbers start at 1 on the x axis
epochs = range(1, len(acc) + 1)


def _save_curves(train_values, val_values, train_label, val_label, title, image_path):
    """Draw training (dots) and validation (line) curves on a new figure and save it."""
    fig, ax = plt.subplots()
    ax.plot(epochs, train_values, 'bo', label=train_label)
    ax.plot(epochs, val_values, 'b', label=val_label)
    ax.set_title(title)
    ax.legend()
    fig.savefig(image_path)


# Accuracy curves
_save_curves(acc, val_acc, 'Training acc', 'Validation acc',
             'Training and validation accuracy', 'Cnn_accuracy_plot.png')

# Loss curves (on their own figure)
_save_curves(loss, val_loss, 'Training loss', 'Validation loss',
             'Training and validation loss', 'Cnn_loss_plot.png')


In [None]:
# # Charger le tokenizer depuis le fichier
# with open('Cnn_tokenizer.pkl', 'rb') as tokenizer_file:
#     loaded_tokenizer = pickle.load(tokenizer_file)

# Test and saving predictions

In [None]:
# from tensorflow.keras.models import load_model

# # model = load_model('MLP_Best_One.hdf5'),this command doesn't work for me, i had to change the saved model path

# modelpath = "C:/trained_Models/CNN_Word2Vec_SkipGram_Best_One_improved.hdf5"

# # Load the model with compile=False
# CNN_Word2Vec_SkipGram_Best_One = load_model(modelpath)


In [None]:
# model

In [None]:
# test_data['commentaire'] = test_data['commentaire'].astype(str)
# X_test = test_data['commentaire']

In [None]:
# # X_test = tokenizer.texts_to_sequences(X_test)

# X_test = loaded_tokenizer.texts_to_sequences(X_test)
# X_test = pad_sequences(X_test, maxlen=100)

In [None]:
# predictions = CNN_Word2Vec_SkipGram_Best_One.predict(X_test)
# predictions

In [None]:
# argmax_predictions = np.argmax(predictions,axis =1)
# argmax_predictions

In [None]:
# #  generate the plateform test data format        
# with open("CNN_Word2Vec_SkipGram_ID_Prediction.txt", "w") as f:
#     for i in range(len(test_data['review_id'])):
#         prediction = (argmax_predictions[i] + 1) / 2
#         line = f"{test_data['review_id'].iloc[i]} {str(prediction).replace('.', ',')}\n"
#         f.write(line)