In [54]:
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.metrics import classification_report, accuracy_score
import pandas as pd  
import numpy as np
import pickle
import codecs
import spacy
import glob
import re

all_files = glob.glob("/corpus/groupe*.txt")
df = pd.DataFrame()
for file_ in all_files: 
    list_of_lists = []
    with open(file_, 'rU') as f:
        for line in f:
            tweet = ' '.join(line.split(")")[1:])
            id = line.split(")")[0].split(",")[0][1:]
            sentiment = line.split(")")[0].split(",")[1]
            try:
                group = line.split(")")[0].split(",")[2]
            except:
                group = ''
            list_of_lists.append([id, sentiment, group, tweet])
    next_df = pd.DataFrame(list_of_lists, columns=['Id','Sentiment', 'Group', 'Tweet'])
    df = df.append(next_df)
df = df.drop_duplicates()
df = df.reset_index(drop=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4998 entries, 0 to 4997
Data columns (total 4 columns):
Id           4998 non-null object
Sentiment    4998 non-null object
Group        4998 non-null object
Tweet        4998 non-null object
dtypes: object(4)
memory usage: 156.3+ KB




In [55]:
print(df.Sentiment.value_counts())

neg    2507
neu    1744
irr    385 
pos    362 
Name: Sentiment, dtype: int64


In [56]:
contraction_mapping = {
    "gÃ´che": "gauche",
    "gauchos": "gauche",
    "rÃ©ac": "rÃ©action",
    "co": "companie",
    "cie": "companie",
    "and": "et",
    "but": "mais",
    "ui": "oui",
    "@macron": "Macron",
    "@EmmanuelMacron": "Emmanuel Macron",
    "GJ": "gilet jaune",
    " 1 ": " un ",
    "PR": "prÃ©sident",
    "RT": "retweet",
    "LREM": "la rÃ©publique en marche",
    "LAREM": "la rÃ©publique en marche",
    "#LREM": "la rÃ©publique en marche",
    "#LAREM": "la rÃ©publique en marche",
    "telma": "tellement",
    "ctr": "contre",
    "5e": "cinquieme",
    "foute": "foutre",
    "my god": "mon dieu",
    "nn": "non",
    "b8en": "bien",
    "good": "bien",
    "bad": "mauvais",
    "lui": "il"
}

In [57]:
nlp = spacy.load('fr')

def tweet_cleaner(text):
    decoded = str(text)
    apostrophe_handled = re.sub("â€™", "'", decoded)
    expanded = ' '.join([contraction_mapping[t] if t in contraction_mapping else t for t in apostrophe_handled.split(" ")])
    parsed = nlp(expanded)
    final_tokens = []
    for t in parsed:
        if t.is_punct or t.is_space or t.like_num or t.like_url or str(t).startswith('@'):
            pass
        else:
            if t.lemma_ == '-PRON-':
                final_tokens.append(str(t))
            else:
                sc_removed = re.sub("[^a-zA-ZÃ©Ã¨ÃªÃ¹Ã»Ã Ã¢Å“Ã§Ã®]", '', str(t.lemma_))
                if len(sc_removed) > 1:
                    final_tokens.append(sc_removed)
    joined = ' '.join(final_tokens)
    spell_corrected = re.sub(r'(.)\1+', r'\1\1', joined)
    return spell_corrected

In [58]:
pd.set_option('display.max_colwidth', -1)
print(df.Tweet[131:135])

131     ðŸ˜‚ ðŸ˜‚ ðŸ˜‚ #Macron n'a peur de rien Il vit dans son monde... Le dialogue est inutile... https://t.co/mT9GBDM59D\n                                                                                                                          
132     macron ne reconnait pas une personne comme un Ãªtre humain Ã  qui l'on doit le respect jusque dans la mort assassinÃ©e par sa police https://t.co/o7vKzXWQxe\n                                                                           
133     @PublicsenatPro @publicsenat @MartinGenier @fitouss J'ai vu Ã  la tÃ©lÃ© le Roi Jupiter #Macron houlala il est bien en campagne pour les EuropÃ©en ðŸ¤¨ https://t.co/QLpa4BEj8x\n                                                            
134     #algÃ©rie ILS SONT INDEPENDANTS NOUS N AVONS PAS A INTERVENIR CONTRAIREMENT A CE QU A FAIT #macron AU VENEZUELA AUX ALGERIENS DE GERER A Paris, des AlgÃ©riens partagÃ©s sur un rÃ´le de la France dans la crise https://t.co/hcCGz4tfdG\n
Name: Tweet, dtype: o

In [59]:
print([tweet_cleaner(t) for t in df.Tweet[131:135]])

['Macron ne avoir peur de rien il voir dan son monde le dialogue Ãªtre inutile', 'macron ne reconner pas un personne comme Ãªtre humain qui le on devoir le respect jusqu dan le mort assassiner par son police', 'il avoir voir le tÃ©lÃ© le Roi Jupiter Macron houlala il Ãªtre bien en campagne pour le europÃ©en', 'algÃ©rie il Ãªtre independant nous ne avoir pas INTERVENIR contrairement ce que avoir faire macron au venezuela AUX algerien DE gerer avoir Paris un algÃ©rien partagÃ© sur rle de le France dan le crise']


In [60]:
df['Clean_tweet'] = [tweet_cleaner(t) for t in df.Tweet]

In [61]:
df.head()

Unnamed: 0,Id,Sentiment,Group,Tweet,Clean_tweet
0,1102616518876168193,neu,Groupe9,"Emmanuel #Macron a dÃ©cidÃ© de prendre une sorte de leadership europÃ©en. Il lui reste maintenant Ã le dÃ©montrer et c'est loin d'Ãªtre gagnÃ© quand on voit les revers qu'a pu essuyer la #France cette semaine, je pense d'abord Ã #AirFranceKLM. @JeudyBruno #Europe #cdanslair https://t.co/9LhQ9XNOi3\n",Emmanuel Macron avoir dÃ©cider de prendre un sorte de leadership europÃ©en il luire rester maintenant le dÃ©montrer et ce Ãªtre loin de Ãªtre gagner quand on voir le revers que avoir pouvoir essuyer le France ce semaine il penser de abord airfranceklm Europe cdanslair
1,1102616470125793281,neu,Groupe9,Grand dÃ©bat national : Emmanuel Macron se rendra dans les Alpes-de-Haute-Provence jeudi https://t.co/WMuciXR82L\n,grand dÃ©bat national Emmanuel Macron se rendre dan le alpesdehauteprovence jeudi
2,1102616467764260864,neu,Groupe9,Emmanuel Macron lance une Â«Â acadÃ©mieÂ Â» europÃ©enneÂ du renseignement https://t.co/RjJRsvO700 https://t.co/eF2W3epcWB\n,Emmanuel Macron lancer un acadÃ©mie europÃ©en du renseignement
3,1102616455877668864,neg,Groupe9,"@BrunoPaumard ben si tes comptes 1 et 2 ont Ã©tÃ© bloquÃ©s c'est que t'as fait le con ,ou alors tu sais pas compter jusqu'Ã 1 ! bon laisse tomber ton gilet jaune vomis et convertis toi vers Macron ton prÃ©sident tu feras plus ta jaunisse tu te sentiras mieux dans ton froc ! https://t.co/O8zPqKryAj\n",ben si compte et avoir Ãªtre bloquer ce Ãªtre que avoir faire le con ou alors tu savoir pas compter jusque bon laisser tomber ton gilet jaune vomir et convertir lui ver Macron ton prÃ©sident tu faire plus ton jaunisse tu se sentira mieux dan ton froc
4,1102616439222079490,neu,Groupe9,Emmanuel Macron lance une Â«Â acadÃ©mieÂ Â» europÃ©enneÂ du renseignement https://t.co/P5V8eiFJJV\n,Emmanuel Macron lancer un acadÃ©mie europÃ©en du renseignement


In [62]:
#Get stopwords from file
with open('./stop_words_fr.txt') as f:
    stopwords = f.read().splitlines()

In [63]:
X = df.Clean_tweet.values
y = df.Sentiment.values

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(X)

# split train & test
training_X, test_X, training_y, test_y = train_test_split(X, y, test_size = .3, random_state=42)

sm = SMOTE(random_state=42)
X_train_s, y_train_s = sm.fit_sample(training_X, training_y)

X_train, X_test, y_train, y_test = train_test_split(X_train_s, y_train_s, test_size = .3, random_state=42)

# model tuning & validation
cv = StratifiedKFold()




In [64]:
rf_grid = {  
     #'max_depth': list(range(10,100,10)),
     #'max_features': ['auto', 'sqrt'],
     #'min_samples_leaf': [1, 2, 4],
     #'min_samples_split': [2, 5, 10],
     'n_estimators': list(range(100,500,100))
}
                  
rf_clf = RandomForestClassifier(n_estimators=35, max_depth=8, min_samples_split=4,n_jobs=-1)

best_rf_clf = GridSearchCV(rf_clf, rf_grid, verbose=2,cv=cv)
best_rf_clf.fit(X_train, y_train)
rf_preds = best_rf_clf.predict(test_X)

print("Random Forest")
print(best_rf_clf.best_params_)
print(classification_report(test_y, rf_preds))
print("Accuracy", accuracy_score(test_y, rf_preds))

Fitting 3 folds for each of 4 candidates, totalling 12 fits
[CV] n_estimators=100 ................................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ................................. n_estimators=100, total=   1.7s
[CV] n_estimators=100 ................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.8s remaining:    0.0s


[CV] ................................. n_estimators=100, total=   0.3s
[CV] n_estimators=100 ................................................
[CV] ................................. n_estimators=100, total=   0.3s
[CV] n_estimators=200 ................................................
[CV] ................................. n_estimators=200, total=   0.6s
[CV] n_estimators=200 ................................................
[CV] ................................. n_estimators=200, total=   0.5s
[CV] n_estimators=200 ................................................
[CV] ................................. n_estimators=200, total=   0.5s
[CV] n_estimators=300 ................................................
[CV] ................................. n_estimators=300, total=   0.6s
[CV] n_estimators=300 ................................................
[CV] ................................. n_estimators=300, total=   0.6s
[CV] n_estimators=300 ................................................
[CV] .

[Parallel(n_jobs=1)]: Done  12 out of  12 | elapsed:   10.8s finished


Random Forest
{'n_estimators': 300}
              precision    recall  f1-score   support

         irr       0.17      0.16      0.17       111
         neg       0.65      0.81      0.72       764
         neu       0.72      0.45      0.55       526
         pos       0.33      0.36      0.34        99

   micro avg       0.61      0.61      0.61      1500
   macro avg       0.47      0.45      0.45      1500
weighted avg       0.62      0.61      0.60      1500

Accuracy 0.6066666666666667


In [65]:
lr_grid = {
    "C": np.logspace(-4,4,20), 
    "penalty": ["l1","l2"] # l1 lasso l2 ridge
}

lr_clf = LogisticRegression()

best_lr_clf = GridSearchCV(lr_clf, lr_grid, verbose=2,cv=cv)
best_lr_clf.fit(X_train, y_train)
lr_preds = best_lr_clf.predict(test_X)

print("Logistic Regression")
print(best_lr_clf.best_params_)
print(classification_report(test_y, lr_preds))
print("Accuracy", accuracy_score(test_y, lr_preds))

Fitting 3 folds for each of 40 candidates, totalling 120 fits
[CV] C=0.0001, penalty=l1 ............................................
[CV] ............................. C=0.0001, penalty=l1, total=   0.0s
[CV] C=0.0001, penalty=l1 ............................................
[CV] ............................. C=0.0001, penalty=l1, total=   0.0s
[CV] C=0.0001, penalty=l1 ............................................
[CV] ............................. C=0.0001, penalty=l1, total=   0.0s
[CV] C=0.0001, penalty=l2 ............................................
[CV] ............................. C=0.0001, penalty=l2, total=   0.0s
[CV] C=0.0001, penalty=l2 ............................................
[CV] ............................. C=0.0001, penalty=l2, total=   0.0s
[CV] C=0.0001, penalty=l2 ............................................
[CV] ............................. C=0.0001, penalty=l2, total=   0.0s
[CV] C=0.00026366508987303583, penalty=l1 ............................
[CV] ..........

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s


[CV] ............. C=0.00026366508987303583, penalty=l2, total=   0.0s
[CV] C=0.00026366508987303583, penalty=l2 ............................
[CV] ............. C=0.00026366508987303583, penalty=l2, total=   0.0s
[CV] C=0.00026366508987303583, penalty=l2 ............................
[CV] ............. C=0.00026366508987303583, penalty=l2, total=   0.0s
[CV] C=0.0006951927961775605, penalty=l1 .............................
[CV] .............. C=0.0006951927961775605, penalty=l1, total=   0.0s
[CV] C=0.0006951927961775605, penalty=l1 .............................
[CV] .............. C=0.0006951927961775605, penalty=l1, total=   0.0s
[CV] C=0.0006951927961775605, penalty=l1 .............................
[CV] .............. C=0.0006951927961775605, penalty=l1, total=   0.0s
[CV] C=0.0006951927961775605, penalty=l2 .............................
[CV] .............. C=0.0006951927961775605, penalty=l2, total=   0.0s
[CV] C=0.0006951927961775605, penalty=l2 .............................
[CV] .

[CV] .................. C=4.281332398719396, penalty=l1, total=   0.2s
[CV] C=4.281332398719396, penalty=l1 .................................
[CV] .................. C=4.281332398719396, penalty=l1, total=   0.2s
[CV] C=4.281332398719396, penalty=l2 .................................
[CV] .................. C=4.281332398719396, penalty=l2, total=   0.1s
[CV] C=4.281332398719396, penalty=l2 .................................
[CV] .................. C=4.281332398719396, penalty=l2, total=   0.1s
[CV] C=4.281332398719396, penalty=l2 .................................
[CV] .................. C=4.281332398719396, penalty=l2, total=   0.1s
[CV] C=11.288378916846883, penalty=l1 ................................
[CV] ................. C=11.288378916846883, penalty=l1, total=   0.2s
[CV] C=11.288378916846883, penalty=l1 ................................
[CV] ................. C=11.288378916846883, penalty=l1, total=   0.3s
[CV] C=11.288378916846883, penalty=l1 ................................
[CV] .

[Parallel(n_jobs=1)]: Done 120 out of 120 | elapsed:   18.5s finished


Logistic Regression
{'C': 29.763514416313132, 'penalty': 'l2'}
              precision    recall  f1-score   support

         irr       0.23      0.23      0.23       111
         neg       0.70      0.69      0.70       764
         neu       0.60      0.59      0.60       526
         pos       0.28      0.33      0.30        99

   micro avg       0.60      0.60      0.60      1500
   macro avg       0.45      0.46      0.46      1500
weighted avg       0.60      0.60      0.60      1500

Accuracy 0.5993333333333334


In [66]:
# Save rf classifier to a file
save_classifier = open("rf_classifier.pickle", 'wb')
pickle.dump(best_rf_clf, save_classifier)
save_classifier.close()

# Save lr classifier to a file
save_classifier = open("lr_classifier.pickle", 'wb')
pickle.dump(best_lr_clf, save_classifier)
save_classifier.close()

In [67]:
# Retrieve the saved file and uplaod it to an object
vec = open("rf_classifier.pickle", 'rb')
rf_clf = pickle.load(vec)
vec.close()

# Retrieve the saved file and uplaod it to an object
vec = open("lr_classifier.pickle", 'rb')
lr_clf = pickle.load(vec)
vec.close()

In [68]:
#Example of oversamling with SMOTE

sent1 = "emmanuel macron etre bon"
sent2 = "macron etre un monstre"
sent3 = "macron sale merde"
sent4 = "comme il etre mauvais ce macron"
sent5 = "il avoir bien aimer macron"

testing_text = pd.Series([sent1, sent2, sent3, sent4, sent5])
testing_target = pd.Series([1,0,0,0,1])

tv = TfidfVectorizer(stop_words=None, max_features=100000)
testing_tfidf = tv.fit_transform(testing_text)

smt = SMOTE(random_state=777, k_neighbors=1)
X_SMOTE, y_SMOTE = smt.fit_sample(testing_tfidf, testing_target)
df_topredict = pd.DataFrame(X_SMOTE.todense(), columns=tv.get_feature_names())
df_topredict

Unnamed: 0,aimer,avoir,bien,bon,ce,comme,emmanuel,etre,il,macron,mauvais,merde,monstre,sale,un
0,0.0,0.0,0.0,0.611353,0.0,0.0,0.611353,0.40943,0.0,0.291313,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.40943,0.0,0.291313,0.0,0.0,0.611353,0.0,0.611353
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.319302,0.0,0.670092,0.0,0.670092,0.0
3,0.0,0.0,0.0,0.0,0.480764,0.480764,0.0,0.321974,0.387878,0.229087,0.480764,0.0,0.0,0.0,0.0
4,0.507806,0.507806,0.507806,0.0,0.0,0.0,0.0,0.0,0.409694,0.241972,0.0,0.0,0.0,0.0,0.0
5,0.179535,0.179535,0.179535,0.395209,0.0,0.0,0.395209,0.264676,0.144848,0.273868,0.0,0.0,0.0,0.0,0.0


In [69]:
print(y_SMOTE)

[1 0 0 0 1 1]


In [70]:
pd.DataFrame(test_X.todense(), columns=vectorizer.get_feature_names()).info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1500 entries, 0 to 1499
Columns: 10332 entries, aah to Å“uvre
dtypes: float64(10332)
memory usage: 118.2 MB


In [71]:
container_df = pd.DataFrame(0.0, index=np.arange(len(X_SMOTE.todense())), columns=vectorizer.get_feature_names())

In [72]:
for column in df_topredict:
    if column in vectorizer.get_feature_names():
        container_df[column] = df_topredict[column]

In [73]:
preds = best_lr_clf.predict(container_df)
preds

array(['neu', 'neg', 'neu', 'pos', 'neu', 'neu'], dtype=object)

In [74]:
preds = best_rf_clf.predict(container_df)
preds

array(['neu', 'neu', 'neg', 'neg', 'neg', 'neu'], dtype=object)