In [None]:
#import packages


import time
import re
import logging
import warnings
import nltk
import numpy as np 
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
from collections import Counter
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.stem import WordNetLemmatizer 
from gensim.models import word2vec
from gensim.parsing.preprocessing import remove_stopwords
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.svm import SVC

In [None]:
# Download the NLTK 'popular' resource collection
# (presumably provides the tokenizer / WordNet data used further down — TODO confirm).
nltk.download('popular')
# Silence all Python warnings for the rest of the session.
warnings.filterwarnings('ignore')

 # **1. Dataset exploration and preprocessing**

In [None]:
# Labelled training reviews, stored as a zipped tab-separated file.
path_train_data = r'../input/word2vec-nlp-tutorial/labeledTrainData.tsv.zip'
df = pd.read_csv(path_train_data, sep='\t')

 # # 1.1 Exploring the dataset

In [None]:
df.head()

In [None]:
df.shape

In [None]:
# Show the first review together with its sentiment label.
print("Exemple d'une critique de film :\n")
print(df['review'][0])
sent = df['sentiment'][0]
print(
    f'\nlabel de cette critique :{sent}'
    '\n\nRappel : 1 = positif, 0 = négatif'
)

In [None]:
print(len(df['review'][0]))

In [None]:
# Work on a copy so the exploration column added below does not alter df.
df_copy = df.copy()

# Start the 'lenght' column as a copy of the review text.
df_copy['lenght'] = df['review']

In [None]:
# Character count of each review. It is computed from the 'review' column
# rather than from 'lenght' itself, so the cell can be re-run safely
# (the previous self-referential form failed on the second run because
# 'lenght' already held integers).
df_copy['lenght'] = df_copy['review'].str.len()
df_copy.head()

In [None]:
# Shared matplotlib text styles so that all figures use the same look.

# Style applied to figure titles.
font_title = dict(color='steelblue', weight='normal', size=16)

# Style applied to axis labels.
font_label = dict(color='darkslategray', weight='normal', size=12)

In [None]:
# Histogram of review lengths. 'lenght' holds len() of the raw review
# strings, i.e. a number of characters, so the labels say characters.
fig = plt.figure(1, figsize=(16, 9))

# plt.get_cmap is the supported accessor; plt.cm.get_cmap was deprecated and
# later removed from Matplotlib.
cm = plt.get_cmap('Accent')

n, bins, patches = plt.hist(df_copy['lenght'], bins=50)

# Colour each bar from the colormap; arguments above 1 (bars past the 25th)
# are clamped by the colormap to its last colour.
for i, p in enumerate(patches):
    plt.setp(p, 'facecolor', cm(i/25))

plt.title('Distribution de la longueur des critiques '
          '(en caractères) sur le dataset d\'entraînement',
          fontdict=font_title)

plt.xlabel('longueur des critiques en caractères',
           fontdict=font_label)
plt.ylabel('Nombres de critiques',
           fontdict=font_label)

plt.show()

In [None]:
# Label distribution

# Derive the number of reviews from the data instead of hard-coding 25000,
# so the slice sizes and the title stay correct if the dataset changes.
NB_DONNEES = len(df)

# 'sentiment' is 1 for positive reviews, so its sum is the positive count.
nb_positive = int(df['sentiment'].sum())
x = [nb_positive, NB_DONNEES - nb_positive]

labels = ['critique positive', 'critique négative']

colors = ['seagreen', 'firebrick']

plt.figure(1, figsize=(16, 9))

plt.pie(x, labels=labels, autopct='%1.1f%%',
        shadow=True, startangle=90, colors=colors)

plt.title(f'Répartition des critiques de films '
          f'positives et négatives pour '
          f'{NB_DONNEES} critiques', fontdict=font_title)

plt.show()

In [None]:
# Density of review lengths (in characters) split by sentiment label.
plt.figure(1, figsize=(16, 9))

# Each curve carries its own label so the legend cannot drift out of sync
# with the plotting order.
sns.kdeplot(df_copy[df_copy['sentiment'] == 0]['lenght'],
            color='firebrick', label='critique négative')

sns.kdeplot(df_copy[df_copy['sentiment'] == 1]['lenght'],
            color='seagreen', label='critique positive')

plt.legend()

plt.title('Répartition des longueurs des critiques en fonction de leur labélisation',
          fontdict=font_title)

plt.xlabel('longueur des critiques en caractères',
           fontdict=font_label)

plt.ylabel('Densité', fontdict=font_label)

plt.show()

# # 1.2 Preprocessing

In [None]:
# Index of the review displayed after each preprocessing step below.
J = 6

df['review'][J]  # Example review

## # # 1.2.1 suppression des balises HTML

In [None]:
# Remove HTML tags


def remove_balise_html(html_doc, parser='html.parser'):
    """Strip HTML markup from a document and return its text.

    Args:
        html_doc: HTML document (string) to clean.
        parser: name of the parser handed to BeautifulSoup.

    Returns:
        The ``.text`` of the parsed document.
    """
    return BeautifulSoup(html_doc, parser).text

In [None]:
# Strip the HTML markup from every review, then show the sample review.
df['review'] = df['review'].map(remove_balise_html)


df.loc[J, 'review']

# # # 1.2.2 Stopwords

In [None]:
# Drop stopwords from every review with gensim's remove_stopwords,
# then show the sample review.
df['review'] = df['review'].apply(remove_stopwords)

df.loc[J, 'review']

# # # 1.2.3 Tokenization

In [None]:
# Turn every review string into a list of tokens.
df['review'] = df['review'].map(word_tokenize)

print(df.loc[J, 'review'])

# # # 1.2.4 Normalisation

In [None]:
def clean_txt(t):
    """Normalise the tokens of one sentence.

    Each token is lower-cased, stripped of every character that is neither a
    word character nor whitespace, and stripped of digits. Tokens that end up
    empty are kept as '' (filtering them is left to the caller).

    Args:
        t: a list of strings.

    Returns:
        A new list of cleaned strings, same length as ``t``. The input list
        is left untouched.
    """
    cleaned = []
    for token in t:
        # Lower-case, then drop punctuation/symbols.
        token = re.sub(r'[^\w\s]', '', token.lower())
        # Drop digit characters.
        cleaned.append(''.join(ch for ch in token if not ch.isdigit()))
    return cleaned

In [None]:
# Normalise every token list, then drop the tokens that became empty.
df['review'] = (
    df['review']
    .apply(clean_txt)
    .apply(lambda tokens: [tok for tok in tokens if tok])
)

In [None]:
print(df['review'][J])

# # # 1.2.5 Lemmatisation

In [None]:
lemmatizer = WordNetLemmatizer()

# Lemmatise each token, join the result into one string and split it back on
# whitespace, giving the final token list of each review.
df_final = df['review'].apply(
    lambda tokens: ' '.join(lemmatizer.lemmatize(tok) for tok in tokens).split()
)

In [None]:
# Sample review before and after lemmatisation, separated by blank lines.
print(df['review'][J], '\n', df_final[J], sep='\n')

In [None]:
df['review'] = df_final

# 2. **Utilisation du modèle Word2Vec**

Choix des paramètres pour l'entraînement du modèle Word2Vec

In [None]:
# Emit timestamped log records at INFO level and above.
logging.basicConfig(
    format='%(asctime)s : %(levelname)s : %(message)s',
    level=logging.INFO,
)

# Hyperparameters handed to Word2Vec in the training cell.
num_features = 300     # dimensionality of the word vectors
min_word_count = 40    # minimum word count
num_workers = 4        # number of worker threads
context = 10           # context window size
downsampling = 1e-3    # downsampling setting for frequent words

In [None]:
# Preprocessed reviews, used as the Word2Vec training corpus.
X = df['review']

# Sentiment labels.
Y = df['sentiment']

# # 2.1 Entraînement du modèle

In [None]:
print("Training model...")
# sg=1 is passed explicitly; every other value comes from the parameter cell.
model = word2vec.Word2Vec(
    X,
    vector_size=num_features,
    window=context,
    min_count=min_word_count,
    sample=downsampling,
    workers=num_workers,
    sg=1,
)

# Save the trained model under a fixed name so it can be reloaded later with
# Word2Vec.load().
model_name = "My_Word2Vec"
model.save(model_name)
print("Model Trained !")

In [None]:
# Words kept in the trained vocabulary, and how many there are.
keys_vocab = model.wv.key_to_index.keys()
print(len(keys_vocab))

# # 2.2 Exemple sur un mot

In [None]:
word = 'publicity'

In [None]:
# Unit-normalised vector of the example word.
normed_vector = model.wv.get_vector(word, norm=True)

print(normed_vector.shape)

# Read the dimensionality from the vector itself instead of hard-coding 300,
# so it stays correct if the model's vector size changes.
shape_normed_vector = normed_vector.shape[0]

print(f'\n10 premières composantes du vecteurs du mot \'{word}\' normalisées : \n')
print(normed_vector[0:10])

In [None]:
model.wv.most_similar(word)

In [None]:
# Nearest neighbours of `word`, converted to a 2-D array whose first column
# holds the neighbour tokens.
ex_l = np.array(model.wv.most_similar(word))

ex = ex_l[:, 0]

# One row per neighbour, filled with that neighbour's word vector.
ex_array = np.zeros((len(ex_l), shape_normed_vector))
for row, token in enumerate(ex):
    ex_array[row, :] = model.wv.get_vector(token)

In [None]:
# Reshape the example word's vector into two rows (its two halves) and fit a
# 2-component PCA on those two rows.
# NOTE(review): the PCA is fitted on the two halves of a single vector rather
# than on a set of word vectors — confirm this is the intended projection.
fitter = normed_vector.reshape(2,int(shape_normed_vector/2))

pca = PCA(n_components=2)
fitter  = pca.fit_transform(fitter)


In [None]:
# Project the example word and its nearest neighbours into 2-D with a single
# PCA fitted on all of their vectors. Fitting on the whole set (one row per
# word, every row unit-normalised) puts all points in the same coordinate
# system, so their relative positions are meaningful.
neighbour_vectors = [model.wv.get_vector(token, norm=True) for token in ex]
all_vectors = np.vstack(neighbour_vectors + [normed_vector])

pca_words = PCA(n_components=2)

# One 2-D point per word; the example word itself is the last point.
ex_plot = list(pca_words.fit_transform(all_vectors))

# Labels in the same order as the points: the neighbours, then the example word.
labels = [e for e in ex]

labels.append(word)



In [None]:
fig = plt.figure(1, figsize=(16, 9))

ex_plot = np.array(ex_plot)

# Blue for every neighbour, red for the example word (the last point). The
# count comes from the data rather than a hard-coded 10, so the colours
# always match the number of plotted points.
c = ['b'] * (len(ex_plot) - 1) + ['r']

plt.scatter(ex_plot[:, 0], ex_plot[:, 1], s=100, c=c)

for i, label in enumerate(labels):
    plt.annotate(label, (ex_plot[i, 0], ex_plot[i, 1]), fontsize=20)

plt.title(f'Scatter plot des mots les plus proches du mot \'{word}\'',
          fontdict=font_title)

plt.xlabel('X', fontdict=font_label)
plt.ylabel('Y', fontdict=font_label)

# plt.show() renders with inline notebook backends; Figure.show() needs a GUI backend.
plt.show()

# 3. Predictions

In [None]:
def get_vect(word, model):
    """Return the vector of ``word`` from ``model.wv``.

    When the lookup raises KeyError, a zero vector of length
    ``model.vector_size`` is returned instead.
    """
    try:
        vector = model.wv[word]
    except KeyError:
        vector = np.zeros((model.vector_size,))
    return vector

In [None]:
def sum_vectors(phrase, model):
    """Return the element-wise sum of the vectors of the words in ``phrase``.

    Args:
        phrase: iterable of words.
        model: object passed through to ``get_vect``; its ``vector_size`` is
            used for the empty case.

    Returns:
        The summed vector. An empty ``phrase`` yields a zero vector of length
        ``model.vector_size`` instead of the integer 0 that ``sum()`` would
        give, so the result can always be stacked with other rows.
    """
    vectors = [get_vect(w, model) for w in phrase]
    if not vectors:
        return np.zeros((model.vector_size,))
    return sum(vectors)

In [None]:
def word2vec_features(X, model):
    """Build a feature matrix with one row per phrase of ``X``.

    Each row is the result of ``sum_vectors`` for that phrase; the rows are
    stacked vertically into a single array.
    """
    rows = []
    for phrase in X:
        rows.append(sum_vectors(phrase, model))
    return np.vstack(rows)

In [None]:
wv_train_feat = word2vec_features(df["review"], model)
wv_train_feat.shape

In [None]:
# Hold out 20% of the labelled reviews for evaluation (fixed seed).
# train_test_split is already imported in the notebook's import cell, so it
# is not re-imported here.
X_train, X_test, y_train, y_test = train_test_split(
    wv_train_feat, df['sentiment'], test_size=0.2, random_state=42
)

# # 3.1 Logistic Regressor

In [None]:
# Logistic regression with the lbfgs solver, fitted on the training split.
clfwv = LogisticRegression(solver='lbfgs', max_iter=1000)
clfwv.fit(X_train, y_train)

In [None]:
clfwv.score(X_test, y_test)

Utilisation d'un solver liblinear pour ajuster le problème d'optimisation

In [None]:
# Same model with the liblinear solver and a higher iteration cap.
clfwv_tuned = LogisticRegression(solver='liblinear', max_iter=2000)
clfwv_tuned.fit(X_train, y_train)

# Score on the held-out split.
clfwv_tuned.score(X_test, y_test)

# # 3.2 Random Forest et tuning d'hyperparamètres

In [None]:
# RandomizedSearchCV is not in the notebook's import cell, so it is imported
# here; RandomForestClassifier already is and does not need a second import.
from sklearn.model_selection import RandomizedSearchCV

# Baseline forest with hand-picked hyperparameters.
clfRF = RandomForestClassifier(n_estimators=300, min_samples_split=5, min_samples_leaf=2)
clfRF.fit(X_train, y_train)
# Printed explicitly: a bare expression in the middle of a cell is not displayed.
print("Test score for baseline RandomForestClassifier model:", clfRF.score(X_test, y_test))

# Hyperparameter grid sampled by the randomized search.
params = {'n_estimators': [200, 250, 300],
          'min_samples_split': [2, 3, 5],
          'min_samples_leaf': [1, 2, 4]
          }

# This is a classification task, so candidates are scored with accuracy
# instead of a regression metric (negative mean squared error / RMSE).
clfRF_tuned = RandomizedSearchCV(estimator=RandomForestClassifier(),
                                 param_distributions=params,
                                 scoring='accuracy',
                                 n_iter=25,
                                 verbose=1)
clfRF_tuned.fit(X_train, y_train)
print("Best parameters for RandomForestClassifier model:", clfRF_tuned.best_params_)
print("Best cross-validated accuracy for RandomForestClassifier model : ", clfRF_tuned.best_score_)

Fitting 5 folds for each of 25 candidates, totalling 125 fits
Best parameters for RandomForestClassifier model: {'n_estimators': 300, 'min_samples_split': 5, 'min_samples_leaf': 2}
Lowest RMSE for RandomForestClassifier model :  0.4189272013130682

# 3.3 Support Vector classification

In [None]:
from sklearn.svm import LinearSVC
from sklearn.svm import SVC


# Support vector classifier with default settings, fitted on the training
# split and scored on the held-out split.
# NOTE(review): the variable is named "linear" but SVC() is instantiated and
# the imported LinearSVC is not used in this cell — confirm which classifier
# is intended.
clf_linear_svc = SVC()
clf_linear_svc.fit(X_train, y_train)
clf_linear_svc.score(X_test,y_test)

# 4 . **Sample submission**

In [None]:
# Unlabelled test reviews, stored as a zipped tab-separated file.
path_test_data = r'../input/word2vec-nlp-tutorial/testData.tsv.zip'
df_test = pd.read_csv(path_test_data, sep='\t')

In [None]:
df_test.head()

In [None]:
# Cleaning text 

df_test['review'] = df_test['review'].apply(remove_balise_html)
df_test['review'] = df_test['review'].apply(lambda row : remove_stopwords(row))
df_test['review'] = df_test['review'].apply(word_tokenize)
df_test['review'] = df_test['review'].apply(lambda row : clean_txt(row))
df_test['review'] = df_test['review'].apply(lambda row : list(filter(None,row))) #retrait des éléments vides
df_test['review'] = df_test['review'].apply(lambda row : ' '.join([lemmatizer.lemmatize(w) for w in row]))
df_test['review'] = df_test['review'].apply(lambda row : row.split())

In [None]:
df_test['review'].head()

In [None]:
# Build the feature matrix of the test reviews with the trained Word2Vec
# model, then predict their sentiment with the fitted SVC.
new_x_test =  word2vec_features(df_test["review"], model)
new_y_pred = clf_linear_svc.predict(new_x_test)
new_y_pred

In [None]:
output = pd.DataFrame(data={'id':df_test['id'], 'sentiment':new_y_pred})

In [None]:
output.head()

In [None]:
output.to_csv('submission.csv',index = False)