# Reconnaissance du locuteur (Chirac/Mitterrand)

### SOYKOK Aylin 28711545 - CELIK Simay 28713301 </b>

## Meilleurs paramètres

<b>
Ce notebook est créé afin de : <br>
    -tester quelle moyenne de BoW est plus exacte selon les paramètres
</b>

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import codecs
import re
import os.path
import string
import nltk
from nltk.stem.snowball import FrenchStemmer
from wordcloud import WordCloud
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from collections import Counter
from nltk.corpus import stopwords
from utils_donnee import *
from evaluation import *

# Chargement des données
fname = "./datasets/AFDpresidentutf8/corpus.tache1.learn.utf8"
alltxts,alllabs = load_pres(fname)

### Test des parametres pour TfidfVectorizer

test avec suppression de la ponctuation, des accents et des chiffres, transformation des mots entièrement en majuscule en marqueurs spécifiques, supression des balises

In [3]:
preprocessor = lambda text: remove_tags(transform_uppercase(accent_suppression(chiffre_suppression(ponc_suppression(text)))))
model_params = {'C': 100.0, 'solver': 'liblinear'}

#### sublinear_tf=True ou False

In [25]:
vect_params = {'sublinear_tf': True}
res1 = eval_test(preprocessor=preprocessor, vectorizer=TfidfVectorizer, vect_params=vect_params,
model=LogisticRegression, model_params=model_params, over_sample=True)

Accuracy: 0.8652
F1 Score: 0.9211
Precision: 0.9368
ROC AUC sur Mitterrand (minoritaire): 0.1464
-----Metrics du serveur--------
F1 Score sur Mitterrand (minoritaire): 0.5362
ROC AUC sur Chirac: 0.8536
AP sur Mitterrand (minoritaire): 0.7306


In [26]:
vect_params = {'sublinear_tf': False}
res2 = eval_test(preprocessor=preprocessor, vectorizer=TfidfVectorizer, vect_params=vect_params,
model=LogisticRegression, model_params=model_params, over_sample=True)

Accuracy: 0.8645
F1 Score: 0.9207
Precision: 0.9364
ROC AUC sur Mitterrand (minoritaire): 0.1467
-----Metrics du serveur--------
F1 Score sur Mitterrand (minoritaire): 0.5339
ROC AUC sur Chirac: 0.8533
AP sur Mitterrand (minoritaire): 0.7307


In [27]:
print(f'Nb de fois res1 était plus accurate que res2 : {accuracy_difference(res1,res2)}')

Nb de fois res1 était plus accurate que res2 : ([1, 1, 1, 1, 2], 0.8)


Sublinear_tf est utile sauf pour le score de AP sur Mitterrand.

#### ngram_range = (1,2)

In [40]:
vect_params = {'ngram_range': (1,2)}
res3 = eval_test(preprocessor=preprocessor, vectorizer=TfidfVectorizer, vect_params=vect_params,
model=LogisticRegression, model_params=model_params, over_sample=True)

Accuracy: 0.9105
F1 Score: 0.9488
Precision: 0.9428
ROC AUC sur Mitterrand (minoritaire): 0.0967
-----Metrics du serveur--------
F1 Score sur Mitterrand (minoritaire): 0.6433
ROC AUC sur Chirac: 0.9033
AP sur Mitterrand (minoritaire): 0.7157


En général, augmente F1 Score sur Mitterrand mais diminue ROC AUC et AP sur Mitterrand.

#### ngram_range = (1,3)

In [41]:
vect_params = {'ngram_range': (1,3)}
res4 = eval_test(preprocessor=preprocessor, vectorizer=TfidfVectorizer, vect_params=vect_params,
model=LogisticRegression, model_params=model_params, over_sample=True)

Accuracy: 0.9081
F1 Score: 0.9473
Precision: 0.9446
ROC AUC sur Mitterrand (minoritaire): 0.0920
-----Metrics du serveur--------
F1 Score sur Mitterrand (minoritaire): 0.6427
ROC AUC sur Chirac: 0.9080
AP sur Mitterrand (minoritaire): 0.7144


In [42]:
print(f'Nb de fois (1,3) était plus accurate que (1,2) : {accuracy_difference(res3,res4)}')

Nb de fois (1,3) était plus accurate que (1,2) : ([1, 1, 1, 2, 1], 0.8)


ngram_range = (1,3) est mieux que ngram_range = (1,2)

#### ngram_range = (1,4)

In [43]:
vect_params = {'ngram_range': (1,4)}
res5 = eval_test(preprocessor=preprocessor, vectorizer=TfidfVectorizer, vect_params=vect_params,
model=LogisticRegression, model_params=model_params, over_sample=True)

Accuracy: 0.9008
F1 Score: 0.9425
Precision: 0.9499
ROC AUC sur Mitterrand (minoritaire): 0.0940
-----Metrics du serveur--------
F1 Score sur Mitterrand (minoritaire): 0.6401
ROC AUC sur Chirac: 0.9060
AP sur Mitterrand (minoritaire): 0.7149


In [44]:
print(f'Nb de fois (1,3) était plus accurate que (1,4) : {accuracy_difference(res3,res5)}')

Nb de fois (1,3) était plus accurate que (1,4) : ([1, 1, 1, 2, 1], 0.8)


ngram_range = (1,3) est mieux que ngram_range = (1,4)

#### Test dans le serveur avec oversampling, ngram_range = (1,3) 

In [38]:
#suppression de la ponctuation et des chiffres, transformation des mots entièrement en majuscule en marqueurs spécifiques, 
# supression des balises
vect_params = {'ngram_range': (1,3)}
prediction_generator(preprocessor=preprocessor, vectorizer=TfidfVectorizer, vect_params=vect_params,
model=LogisticRegression, model_params=model_params, over_sample=True, save=True)

1 0.0002687374985183011


array([0.00026874, 0.07773948, 0.06311755, ..., 0.00605084, 0.0006643 ,
       0.12114972])

## Trouver les meilleures paramètres

In [9]:
#stop words preprocessed
french_stop_words = stopwords.words('french')
preprocessed_fr_stop_words = [unicodedata.normalize('NFKD', word).encode('ascii', 'ignore').decode('utf-8') for word in french_stop_words]

In [10]:
vect_params = {
        'stop_words': [stopwords.words('french'), None],
        'max_df': [0.5, 0.75, 1.0], 
        'min_df': [2, 3, 5], 
        'ngram_range': [(1, 3), (2, 3)], 
        'binary': [True, False],
        'use_idf': [True, False],
        'sublinear_tf': [True, False],
        'max_features': [None, 1000, 5000, 10000]
}

new_vect_params, new_best_score = find_best_params(preprocessor=preprocessor,vectorizer=TfidfVectorizer,vect_params=vect_params,
model=LogisticRegression,model_params=model_params)
print("vect params:",new_vect_params)
print("score:",new_best_score)

KeyboardInterrupt: 

In [6]:
# Régression Logistique
vect_params = {'stop_words': None, 'max_df': 0.5, 'min_df': 2, 'ngram_range': (1, 3), 'binary': True, 'use_idf': True, 'sublinear_tf': True, 'max_features': None}
preprocessor = lambda text: remove_tags(transform_uppercase(chiffre_suppression(ponc_suppression(text))))

# En maximisant f1 sur Mitterand
lr_params_f1, best_score_f1 = best_params_lr(preprocessor,vect_params,f1=True,auc=False)

Best Score:  0.6202060991099231
Best Logistic Regression Params:  {'lr__C': 10, 'lr__penalty': 'l2', 'lr__solver': 'liblinear'}


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/aylinsoykok/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/aylinsoykok/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/aylinsoykok/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/aylinsoykok/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/aylinsoykok/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/aylinsoykok/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/aylinsoykok/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Download

In [7]:
# En maximisant roc auc
best_score_auc, lr_params_auc = best_params_lr(preprocessor,vect_params,f1=False,auc=True)

Best Score:  0.8932702913044734
Best Logistic Regression Params:  {'lr__C': 10, 'lr__penalty': 'l2', 'lr__solver': 'liblinear'}


In [8]:
# Test avec les nouvelles paramètres pour le regression logistique et tf-idf
preprocessor = lambda text: remove_tags(transform_uppercase(chiffre_suppression(ponc_suppression(text))))
vect_params = {'stop_words': None, 'max_df': 0.5, 'min_df': 2, 'ngram_range': (1, 3), 'binary': True, 'use_idf': True, 'sublinear_tf': True, 'max_features': None}
model_params = {'C': 10, 'penalty': 'l2', 'solver': 'liblinear'}
eval_test(preprocessor=preprocessor, vectorizer=TfidfVectorizer, vect_params=vect_params,
model=LogisticRegression, model_params=model_params, over_sample=True)

F1 Score sur Mitterrand (minoritaire): 0.6411103767349637
ROC AUC sur Chirac: 0.9063612039510178
AP sur Mitterrand (minoritaire): 0.7147973172985861


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/aylinsoykok/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/aylinsoykok/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/aylinsoykok/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/aylinsoykok/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/aylinsoykok/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/aylinsoykok/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/aylinsoykok/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Download

C'est presque la même que ce qu'on a eu avec le test qu'on a fait avant.

In [11]:
# Naive Bayes
vect_params = {'stop_words': None, 'max_df': 0.5, 'min_df': 2, 'ngram_range': (1, 3), 'binary': True, 'use_idf': True, 'sublinear_tf': True, 'max_features': None}

# En maximisant f1 sur Mitterand
nb_best_score_f1, nb_params_f1 = best_params_nb(preprocessor,vect_params,f1=True,auc=False)

KeyboardInterrupt: 

In [5]:
# En maximisant roc auc
nb_best_score_auc, nb_params_auc = best_params_nb(preprocessor,vect_params,f1=False,auc=True)

Best Score:  0.895120168369494
Best Naive Bayes Params:  {'mnb__alpha': 1.3, 'mnb__fit_prior': True}


In [6]:
# Test avec Naive Bayes
preprocessor = lambda text: remove_tags(transform_uppercase(chiffre_suppression(ponc_suppression(text))))
vect_params = {'stop_words': None, 'max_df': 0.5, 'min_df': 2, 'ngram_range': (1, 3), 'binary': True, 'use_idf': True, 'sublinear_tf': True, 'max_features': None}
model_params = {'alpha': 0.5, 'fit_prior': True}
eval_test(preprocessor=preprocessor, vectorizer=TfidfVectorizer, vect_params=vect_params,
model=MultinomialNB, model_params=model_params, over_sample=True)

F1 Score sur Mitterrand (minoritaire): 0.577560975609756
ROC AUC sur Chirac: 0.908147126335746
AP sur Mitterrand (minoritaire): 0.7146147036339288


In [7]:
# Test 2 avec Naive Bayes
preprocessor = lambda text: remove_tags(transform_uppercase(chiffre_suppression(ponc_suppression(text))))
vect_params = {'stop_words': None, 'max_df': 0.5, 'min_df': 2, 'ngram_range': (1, 3), 'binary': True, 'use_idf': True, 'sublinear_tf': True, 'max_features': None}
model_params = {'alpha': 1.3, 'fit_prior': True}
eval_test(preprocessor=preprocessor, vectorizer=TfidfVectorizer, vect_params=vect_params,
model=MultinomialNB, model_params=model_params, over_sample=True)

F1 Score sur Mitterrand (minoritaire): 0.5466666666666666
ROC AUC sur Chirac: 0.909089798220537
AP sur Mitterrand (minoritaire): 0.7143688766452229


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/aylinsoykok/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/aylinsoykok/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/aylinsoykok/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/aylinsoykok/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/aylinsoykok/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/aylinsoykok/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/aylinsoykok/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Download

Les résultats avec NB sont moins biens que la regression logistique.

In [5]:
# XGBoost
preprocessor = lambda text: remove_tags(transform_uppercase(chiffre_suppression(ponc_suppression(text))))
vect_params = {'stop_words': None, 'max_df': 0.5, 'min_df': 2, 'ngram_range': (1, 3), 'binary': True, 'use_idf': True, 'sublinear_tf': True, 'max_features': None}

# En maximisant f1 sur Mitterand
xg_best_score_f1, xg_params_f1 = best_params_xgb(preprocessor,vect_params,f1=True,auc=False)

Best Score:  0.48326117719998635
Best XGBoost Params:  {'xgb__subsample': 1.0, 'xgb__min_child_weight': 5, 'xgb__max_depth': 5, 'xgb__gamma': 1.5, 'xgb__colsample_bytree': 0.6}


In [6]:
# En maximisant roc auc
xg_best_score_auc, xg_params_auc = best_params_xgb(preprocessor,vect_params,f1=False,auc=True)

Best Score:  0.831587573751312
Best XGBoost Params:  {'xgb__subsample': 1.0, 'xgb__min_child_weight': 5, 'xgb__max_depth': 5, 'xgb__gamma': 1.5, 'xgb__colsample_bytree': 0.6}


In [4]:
# Test avec Xgboost
preprocessor = lambda text: remove_tags(transform_uppercase(chiffre_suppression(ponc_suppression(text))))
vect_params = {'stop_words': None, 'max_df': 0.5, 'min_df': 2, 'ngram_range': (1, 3), 'binary': True, 'use_idf': True, 'sublinear_tf': True, 'max_features': None}
model_params = {'subsample': 1.0, 'min_child_weight': 5, 'max_depth': 5, 'gamma': 1.5, 'colsample_bytree': 0.6}
eval_test(preprocessor=preprocessor, vectorizer=TfidfVectorizer, vect_params=vect_params,
model=xgb.XGBClassifier, model_params=model_params, over_sample=True)

F1 Score sur Mitterrand (minoritaire): 0.48168542015800814
ROC AUC sur Chirac: 0.8375408623223584
AP sur Mitterrand (minoritaire): 0.7362307998735431


In [9]:
# Test avec SVM
preprocessor = lambda text: remove_tags(transform_uppercase(chiffre_suppression(ponc_suppression(text))))
vect_params = {'stop_words': None, 'ngram_range': (1, 3), 'binary': True, 'use_idf': True, 'sublinear_tf': True, 'max_features': None}
model_params = {'random_state': 42}
eval_test(preprocessor=preprocessor, vectorizer=TfidfVectorizer, vect_params=vect_params,
model=LinearSVC, model_params=model_params, over_sample=True)



F1 Score sur Mitterrand (minoritaire): 0.6528169014084507


LinearSVC seems promising

In [None]:
# Trouver les meilleures paramètres pour le tf-idf avec SVM
vect_params = {
        'stop_words': [stopwords.words('french'), None],
        'max_df': [0.5, 0.75, 1.0], 
        'min_df': [1, 2, 3, 5], 
        'ngram_range': [(1, 3), (2, 3)], 
        'binary': [True, False],
        'use_idf': [True, False],
        'sublinear_tf': [True, False],
        'max_features': [None, 1000, 5000, 10000]
}

model_params_svm = {'random_state': 42, 'dual': True} # on met dual car sinon il y a une warning
preprocessor = lambda text: remove_tags(transform_uppercase(chiffre_suppression(ponc_suppression(text))))
new_vect_params_svm, new_best_score_svm = find_best_params(preprocessor=preprocessor,vectorizer=TfidfVectorizer,vect_params=vect_params,
model=LinearSVC,model_params=model_params_svm)
print("vect params avec svm:",new_vect_params_svm)
print("score:",new_best_score_svm)

 ### ---------------------