In [83]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from gensim.models import Word2Vec

In [84]:
df=pd.read_csv('/content/movie_review.csv')

In [85]:
def preprocess_text(text):
    # Convertir en minuscules
    text = text.lower()

    # Tokenization
    tokens = word_tokenize(text)

    # Supprimer la ponctuation
    tokens = [word for word in tokens if word not in string.punctuation]

    # Chargement des mots vides (stop words)
    stop_words = set(stopwords.words('english'))

    # Supprimer les mots vides
    tokens = [word for word in tokens if word not in stop_words]

    return tokens


In [86]:
df['preprocessed_text']=df['text'].apply(preprocess_text)
df['preprocessed_text']

0        [films, adapted, comic, books, plenty, success...
1        [starters, created, alan, moore, eddie, campbe...
2        [say, moore, campbell, thoroughly, researched,...
3        [book, ``, graphic, novel, ``, 500, pages, lon...
4                      [words, n't, dismiss, film, source]
                               ...                        
64715    [lack, inspiration, traced, back, insipid, cha...
64716    [like, many, skits, current, incarnation, _sat...
64717    [watching, one, ``, roxbury, ``, skits, snl, c...
64718                      [bump, unsuspecting, women, 's]
64719    [watching, _a_night_at_the_roxbury_, 'll, left...
Name: preprocessed_text, Length: 64720, dtype: object

In [87]:
# Entraîner le modèle Word2Vec
word2vec_model = Word2Vec(sentences=df['preprocessed_text'], vector_size=100, window=5, min_count=1, workers=4)

In [88]:
# Fonction pour vectoriser les critiques de films
def vectorize_review(review, model):
    word_vectors = []
    for word in review:
        if word in model.wv:
            word_vectors.append(model.wv[word])
    if len(word_vectors) > 0:
        review_vector = np.mean(word_vectors, axis=0)
        return review_vector
    else:
        return None

# Vectoriser les critiques de films
df['review_vector'] = df['preprocessed_text'].apply(lambda x: vectorize_review(x, word2vec_model))

# Supprimer les lignes pour lesquelles la vectorisation a échoué
df.dropna(subset=['review_vector'], inplace=True)

# Afficher les premières lignes du DataFrame avec les reviews vectorisées

df['review_vector']

0        [-0.477141, 0.7070446, 0.41120675, 0.1622094, ...
1        [-0.26166272, 0.52340794, 0.2711009, 0.3359048...
2        [-0.33333933, 0.78850126, 0.38362736, 0.195701...
3        [-0.52882695, 0.6873177, 0.4559792, 0.3531918,...
4        [-0.59851027, 1.0088172, 0.12203376, -0.166611...
                               ...                        
64715    [0.0205277, 0.63933307, 0.061103567, -0.115975...
64716    [-0.39885756, 0.67953736, 0.41460565, 0.179713...
64717    [-0.74225986, 1.0223514, 0.846617, 0.34616452,...
64718    [-0.2873286, 0.5512699, 0.46645766, 0.17928302...
64719    [-0.6013006, 0.92944163, 0.38602832, -0.134092...
Name: review_vector, Length: 64423, dtype: object

In [89]:
# Diviser le dataset en ensembles d'entraînement et de test
X = df['review_vector'].to_numpy()  # Features (vecteurs de critiques)
y = df['tag']  # Labels (étiquettes de sentiment)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [90]:
# Entraîner un modèle de Logistic Regression
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train.tolist(), y_train)

In [91]:
# Faire des prédictions sur l'ensemble de test
y_pred = clf.predict(X_test.tolist())

In [92]:

# Évaluer les performances du modèle
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)


Accuracy: 0.5660845944897167
Precision: 0.5690909445895584
Recall: 0.5660845944897167
F1-score: 0.558972599669998
