In [1]:
import tensorflow as tf 
print(tf.__version__)

2.12.0


# Preprocessing of data 

In this part we will get the necessary data for training/test/validation parts and check if the data has : 
- zeros (lack of value) 
- shape of data 

In [3]:
import pandas as pd



train=pd.read_table('../data/train.txt', delimiter =';', header=None, )
val=pd.read_table('../data/val.txt', delimiter =';', header=None, )
test=pd.read_table('../data/test.txt', delimiter =';', header=None, )

data = pd.concat([train ,  val , test])
data.columns = ["text", "label"]
print(data)



                                                   text    label
0                               i didnt feel humiliated  sadness
1     i can go from feeling so hopeless to so damned...  sadness
2      im grabbing a minute to post i feel greedy wrong    anger
3     i am ever feeling nostalgic about the fireplac...     love
4                                  i am feeling grouchy    anger
...                                                 ...      ...
1995  i just keep feeling like someone is being unki...    anger
1996  im feeling a little cranky negative after this...    anger
1997  i feel that i am useful to my people and that ...      joy
1998  im feeling more comfortable with derby i feel ...      joy
1999  i feel all weird when i have to meet w people ...     fear

[20000 rows x 2 columns]


In [4]:
data.shape

(20000, 2)

In [5]:
data.isna().any(axis=1).sum()

0

## Stemming of the text

Here we need to preprocess each line of text using steeming 

In [6]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import re 

#text preprocessing
ps = PorterStemmer()

def preprocess(line):
    review = re.sub('[^a-zA-Z]', ' ', line) #leave only characters from a to z
    review = review.lower() #lower the text
    review = review.split() #turn string into list of words
    #apply Stemming 
    review = [ps.stem(word) for word in review if not word in stopwords.words('english')] #delete stop words like I, and ,OR   review = ' '.join(review)
    #trun list into sentences
    return " ".join(review)

We apply the stemming process over all the data 

In [7]:
data['text']=data['text'].apply(lambda x: preprocess(x))

Now we will transform categorical labels into numerical labels.

In [9]:
from sklearn import preprocessing  # Importing the preprocessing module from scikit-learn to use preprocessing tools

# Creating a LabelEncoder object to convert categorical labels into numerical labels
label_encoder = preprocessing.LabelEncoder()

# Using the encoder to transform the data in the 'label' column into numerical values
# 'fit_transform' learns the unique categories and converts them into integers
data['N_label'] = label_encoder.fit_transform(data['label'])

# A new column 'N_label' is added to the DataFrame 'data', containing the encoded labels as numbers


In [10]:
data

Unnamed: 0,text,label,N_label
0,didnt feel humili,sadness,4
1,go feel hopeless damn hope around someon care ...,sadness,4
2,im grab minut post feel greedi wrong,anger,0
3,ever feel nostalg fireplac know still properti,love,3
4,feel grouchi,anger,0
...,...,...,...
1995,keep feel like someon unkind wrong think get b...,anger,0
1996,im feel littl cranki neg doctor appoint,anger,0
1997,feel use peopl give great feel achiev,joy,2
1998,im feel comfort derbi feel though start step s...,joy,2


In [11]:
# Importing CountVectorizer from scikit-learn to create a Bag of Words (BoW) model
# BoW converts textual data into numerical data by counting word occurrences
from sklearn.feature_extraction.text import CountVectorizer

# Initializing CountVectorizer with specific parameters:
# - max_features=5000: Limit the vocabulary to the 5000 most frequent words or n-grams
# - ngram_range=(1,3): Extract unigrams (single words), bigrams (two consecutive words), and trigrams (three consecutive words)
cv = CountVectorizer(max_features=5000, ngram_range=(1, 3))

# Example: For the text "the course was long", the resulting n-grams will include:
# ['the', 'the course', 'the course was', 'course', 'course was', 'course was long', 'was', 'was long', 'long']

# Transforming the 'text' column in the dataset into a numerical matrix:
# - fit_transform learns the vocabulary from the text and transforms each document into a numerical vector
# - toarray converts the sparse matrix into a dense NumPy array
data_cv = cv.fit_transform(data['text']).toarray()

# The resulting 'data_cv' is a matrix where:
# - Each row corresponds to a document in the dataset
# - Each column corresponds to a word or n-gram in the vocabulary
# - The values represent the count of occurrences of the word or n-gram in the document


We split data into two sets : training and test

In [12]:
from sklearn.model_selection import train_test_split

#X_train, X_test, y_train, y_test=data_cv,test_cv,train['N_label'],test['N_label']
X_train, X_test, y_train, y_test =train_test_split(data_cv, data['N_label'], test_size=0.25, random_state=42)

## Naive model
We will in this model doesn't try to search for the best hyperparameters

In [13]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import (Dense)

# load the dataset
# split into input (X) and output (y) variables
# define the keras model
model = Sequential()
model.add(Dense(12, input_shape=(X_train.shape[1],), activation='relu'))
model.add(Dense(8, activation='relu'))
model.add(Dense(6, activation='softmax'))
# compile the keras model
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# fit the keras model on the dataset
model.fit(X_train, y_train, epochs=10, batch_size=10)
# evaluate the keras model
_, accuracy = model.evaluate(X_train, y_train)
print('Accuracy: %.2f' % (accuracy*100))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Accuracy: 99.59


In [14]:
_, accuracy = model.evaluate(X_test, y_test)
print('Accuracy: %.2f' % (accuracy*100))

Accuracy: 85.28


In [15]:
import numpy as np

text='I feel Happy'
text=preprocess(text)
array = cv.transform([text]).toarray()
pred = model.predict(array)
a=np.argmax(pred, axis=1)
label_encoder.inverse_transform(a)[0]



'joy'

We save the naive model

In [50]:
import os

# create the repertory if needed
if not os.path.exists('models'):
    os.makedirs('models')

# save the model in the models repertory
tf.keras.models.save_model(model, 'models/naive_model.h5')


In [54]:
import pickle
pickle.dump(label_encoder, open('../utils/encoder.pkl', 'wb'))
pickle.dump(cv, open('../utils/CountVectorizer.pkl', 'wb'))

# Study on the number of word to obtain a good guess

In [18]:
# Fonction pour obtenir la prédiction du modèle
def model_predict(text):
    text = preprocess(text)  # Prétraitement
    array = cv.transform([text]).toarray()  # Transformation en vecteur
    pred = model.predict(array,verbose=0)  # Prédiction
    a = np.argmax(pred, axis=1)  # Obtenir l'index de la classe prédite
    return label_encoder.inverse_transform(a)[0]  # Décoder la classe prédite

# Fonction pour tester à partir de combien de mots le modèle devine correctement
def test_prediction_threshold(dataframe):
    results = []
    for _, row in dataframe.iterrows():
        text = row['text']
        true_label = row['label']
        words = text.split()
        correct_at = None  # Enregistre le seuil où la prédiction est correcte

        for i in range(1, len(words) + 1):
            subset = " ".join(words[:i])  # Texte partiel avec i mots
            prediction = model_predict(subset)
            if prediction == true_label:
                correct_at = i
                break

        results.append({
            "text": text,
            "label": true_label,
            "pred_correct_at_words": correct_at
        })

    return pd.DataFrame(results)




# Appliquer la fonction au DataFrame
#results_df = test_prediction_threshold(data)
# Appliquer la fonction uniquement à la première ligne du DataFrame
rows = data.iloc[:1000]  # Extraire la première ligne

# Passer la première ligne à la fonction
result_rows = test_prediction_threshold(rows)

# Afficher le résultat
print(result_rows)


# Afficher les résultats
#print(results_df)

                                                  text    label  \
0                                    didnt feel humili  sadness   
1    go feel hopeless damn hope around someon care ...  sadness   
2                 im grab minut post feel greedi wrong    anger   
3       ever feel nostalg fireplac know still properti     love   
4                                         feel grouchi    anger   
..                                                 ...      ...   
995                         depress actual feel inspir      joy   
996  feel like enough peopl age actual think pretti...  sadness   
997           get home laze around pajama feel grouchi    anger   
998                       feel pretti homesick weekend  sadness   
999  start feel realli optimist driven paper coz go...      joy   

     pred_correct_at_words  
0                      3.0  
1                      3.0  
2                      1.0  
3                      3.0  
4                      2.0  
..                   

In [19]:
# Sauvegarder les résultats en CSV
result_rows.to_csv('../data/result_rows.csv', index=False)

print("results have been saved in 'result_rows.csv'.")

results have been saved in 'result_rows.csv'.


## K cross validation model


In [42]:
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.base import BaseEstimator, ClassifierMixin
from scikeras.wrappers import KerasClassifier
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Fonction pour créer un modèle Keras
def create_model(optimizer='adam', neurons=12):
    model_local = Sequential()
    model_local.add(Dense(neurons, input_shape=(X_train.shape[1],), activation='relu'))
    model_local.add(Dense(8, activation='relu'))
    model_local.add(Dense(6, activation='softmax'))
    model_local.compile(loss='sparse_categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    return model_local

# Créer une classe personnalisée avec les mixins de scikit-learn
class CustomKerasClassifier(KerasClassifier, BaseEstimator, ClassifierMixin):
    """A KerasClassifier that explicitly inherits from BaseEstimator and ClassifierMixin."""
    pass

# Créer une instance de la classe personnalisée
model = CustomKerasClassifier(model=create_model, epochs=10, batch_size=10, verbose=0)

# Définir la grille d'hyperparamètres à rechercher
param_grid = {
    'model__optimizer': ['adam', 'sgd'],  # Optimizers à tester
    'model__neurons': [12, 14, 16],        # Nombre de neurones dans la première couche
    'batch_size': [10, 20],               # Tailles de batch à tester
    'epochs': [10, 20],                   # Nombres d'époques
}

# Définir le nombre de K pour la cross-validation
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

# Recherche des meilleurs hyperparamètres avec GridSearchCV
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=kfold, verbose=2)

# Entraîner le modèle avec la recherche de grille
grid_result = grid.fit(X_train, y_train)

# Afficher les meilleurs hyperparamètres trouvés
print("Best Hyperparameters:", grid_result.best_params_)

# Évaluer le modèle avec les meilleurs hyperparamètres
best_model = grid_result.best_estimator_
accuracy = best_model.score(X_test, y_test)
print(f"Test accuracy: {accuracy*100:.2f}%")


Fitting 5 folds for each of 4 candidates, totalling 20 fits
Best Hyperparameters: {'model__neurons': 12, 'model__optimizer': 'sgd'}


AttributeError: 'CustomKerasClassifier' object has no attribute 'evaluate'

In [43]:
accuracy = best_model.score(X_test, y_test)
print(f"Test accuracy: {accuracy*100:.2f}%")

Test accuracy: 86.44%


In [41]:
import numpy as np

text='I am sad'
text=preprocess(text)
array = cv.transform([text]).toarray()
pred = best_model.predict(array)
label_encoder.inverse_transform(a)[0]

'joy'