# Data Importation

In [3]:
import pandas as pd
import numpy as np

df = pd.read_csv('./data/training_data.csv')
df_pred = pd.read_csv('./data/unlabelled_test_data.csv')

In [4]:
df.head()

Unnamed: 0,id,sentence,difficulty
0,0,Les coûts kilométriques réels peuvent diverger...,C1
1,1,"Le bleu, c'est ma couleur préférée mais je n'a...",A1
2,2,Le test de niveau en français est sur le site ...,A1
3,3,Est-ce que ton mari est aussi de Boston?,A1
4,4,"Dans les écoles de commerce, dans les couloirs...",B1


In [5]:
df_pred.head()
X_pred = df_pred['sentence']
X_pred

0       Nous dûmes nous excuser des propos que nous eû...
1       Vous ne pouvez pas savoir le plaisir que j'ai ...
2       Et, paradoxalement, boire froid n'est pas la b...
3       Ce n'est pas étonnant, car c'est une saison my...
4       Le corps de Golo lui-même, d'une essence aussi...
                              ...                        
1195    C'est un phénomène qui trouve une accélération...
1196    Je vais parler au serveur et voir si on peut d...
1197    Il n'était pas comme tant de gens qui par pare...
1198        Ils deviennent dangereux pour notre économie.
1199    Son succès a généré beaucoup de réactions néga...
Name: sentence, Length: 1200, dtype: object

In [None]:
np.random.seed = 0

In [6]:
y = df['difficulty']
X = df['sentence']

# Baseline

In [7]:
base_rate = (df['difficulty'].value_counts().max()/df['difficulty'].shape[0]).round(4)
base_rate

0.1694

# Logistic Regression (whithout data cleaning)


In [11]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score


In [12]:
# Using default tokenizer in TfidfVectorizer
tfidf = TfidfVectorizer(ngram_range=(1, 1))


In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [None]:
# Define classifier
classifier = LogisticRegressionCV(solver='lbfgs', cv=5, max_iter=1000, random_state=0)


# Create pipeline
## The purpose of the pipeline is to assemble several steps that can be cross-validated together while setting different parameters.
pipe = Pipeline([('vectorizer', tfidf),
                 ('classifier', classifier)])

# Fit model on training set
pipe.fit(X_train, y_train)

KeyboardInterrupt: ignored

In [None]:
################################
#WHICH AVERAGE MODE IS BETTER ?#
################################

# Evaluate the model
def evaluate(true, pred):
    precision = precision_score(true, pred,average='macro')
    recall = recall_score(true, pred,average='macro')
    f1 = f1_score(true, pred,average='macro')
    print(f"CONFUSION MATRIX:\n{confusion_matrix(true, pred)}")
    print(f"ACCURACY SCORE:\n{accuracy_score(true, pred):.4f}")
    print(f"CLASSIFICATION REPORT:\n\tPrecision: {precision:.4f}\n\tRecall: {recall:.4f}\n\tF1_Score: {f1:.4f}")

In [None]:
# Predictions
y_pred = pipe.predict(X_test)

# Evaluation - test set
evaluate(y_test, y_pred)

CONFUSION MATRIX:
[[93 31 21 10  4  2]
 [54 60 30  6  6  8]
 [12 38 64 17  9 20]
 [ 6  6 15 66 27 24]
 [ 4  4 10 37 73 45]
 [ 7  8  8 19 24 92]]
ACCURACY SCORE:
0.4667
CLASSIFICATION REPORT:
	Precision: 0.4645
	Recall: 0.4677
	F1_Score: 0.4640


In [None]:
predictions = pipe.predict(X_pred)

predictions = pd.DataFrame(predictions,columns=['difficulty'])

predictions.to_csv("LogisticRegression.csv")

# KNN (whithout data cleaning)

In [None]:
# Import KNeighborsClassifier
from sklearn.neighbors import KNeighborsClassifier

# Define classifier
classifier_knn = KNeighborsClassifier()

# Create pipeline
pipe_knn = Pipeline([('vectorizer', tfidf),
                 ('classifier', classifier_knn)])

# Fit model on training set
pipe_knn.fit(X_train, y_train)

# Predictions
y_pred_knn = pipe_knn.predict(X_test)

evaluate(y_test, y_pred_knn)

CONFUSION MATRIX:
[[121  28   8   1   1   2]
 [ 98  51  12   1   1   1]
 [ 81  39  33   3   1   3]
 [ 49  30  19  29   3  14]
 [ 48  36  29  15  29  16]
 [ 37  29  17  23   9  43]]
ACCURACY SCORE:
0.3187
CLASSIFICATION REPORT:
	Precision: 0.4007
	Recall: 0.3183
	F1_Score: 0.3022


In [None]:
predictions_knn = pipe_knn.predict(X_pred)

predictions_knn = pd.DataFrame(predictions_knn,columns=['difficulty'])

predictions_knn.to_csv("KNN.csv")

## KNN improved

In [None]:
# Grid Search - hyperparameter tuning
from sklearn.model_selection import GridSearchCV

# Define parameters to test
grid = {'n_neighbors':np.arange(1,100),
        'p':np.arange(1,3),
        'weights':['uniform','distance']}

# Define and fit model
knn = KNeighborsClassifier()
classifier_knn_plus = GridSearchCV(knn, grid, cv=10)

pipe_knn_plus = Pipeline([('vectorizer', tfidf),
                 ('classifier', classifier_knn_plus)])

pipe_knn_plus.fit(X_train, y_train)

print("Hyperparameters:", classifier_knn_plus.best_params_)

Hyperparameters: {'n_neighbors': 4, 'p': 2, 'weights': 'distance'}


In [None]:
# Predictions
y_pred_knn_plus = pipe_knn_plus.predict(X_test)

evaluate(y_test, y_pred_knn_plus)


CONFUSION MATRIX:
[[116  27  14   1   1   2]
 [ 79  64  15   4   1   1]
 [ 65  38  44   8   2   3]
 [ 37  25  23  40   2  17]
 [ 37  30  23  26  36  21]
 [ 34  22  19  13  17  53]]
ACCURACY SCORE:
0.3677
CLASSIFICATION REPORT:
	Precision: 0.4227
	Recall: 0.3678
	F1_Score: 0.3575


In [None]:
predictions_knn_plus = pipe_knn_plus.predict(X_pred)

predictions_knn_plus = pd.DataFrame(predictions_knn_plus,columns=['difficulty'])

predictions_knn_plus.to_csv("Knn_plus.csv")

# Decision Tree Classifier (without data cleaning)

In [None]:
# Import Decision Tree Classifier
from sklearn.tree import DecisionTreeClassifier

# Define classifier
classifier_dtc = DecisionTreeClassifier()

# Create pipeline
pipe_dtc = Pipeline([('vectorizer', tfidf),
                 ('classifier', classifier_dtc)])

# Fit model on training set
pipe_dtc.fit(X_train, y_train)

# Predictions
y_pred_dtc = pipe_dtc.predict(X_test)

evaluate(y_test, y_pred_dtc)

CONFUSION MATRIX:
[[82 35 25  8  2  9]
 [44 54 39 15  4  8]
 [28 38 38 23 16 17]
 [ 7 20 27 40 28 22]
 [10 19 33 36 40 35]
 [12 12 28 38 32 36]]
ACCURACY SCORE:
0.3021
CLASSIFICATION REPORT:
	Precision: 0.3021
	Recall: 0.3022
	F1_Score: 0.2994


In [None]:
predictions_dtc = pipe_dtc.predict(X_pred)

predictions_dtc = pd.DataFrame(predictions_dtc,columns=['difficulty'])

predictions_dtc.to_csv("DecisionTreeClassifier.csv")

# Random Forest Classifier (without data cleaning)

In [None]:
# Use random forest
from sklearn.ensemble import RandomForestClassifier

# Define classifier
classifier_rfc = RandomForestClassifier()

# Create pipeline
pipe_rfc = Pipeline([('vectorizer', tfidf),
                 ('classifier', classifier_rfc)])

# Fit model on training set
pipe_rfc.fit(X_train, y_train)

# Predictions
y_pred_rfc = pipe_rfc.predict(X_test)

evaluate(y_test, y_pred_rfc)

CONFUSION MATRIX:
[[121  22  11   4   2   1]
 [ 82  51  18  11   2   0]
 [ 35  39  54  18   7   7]
 [ 18  11  13  65  24  13]
 [ 12  10  29  52  47  23]
 [ 15  10  17  32  23  61]]
ACCURACY SCORE:
0.4156
CLASSIFICATION REPORT:
	Precision: 0.4250
	Recall: 0.4182
	F1_Score: 0.4059


In [None]:
predictions_rfc = pipe_rfc.predict(X_pred)

predictions_rfc = pd.DataFrame(predictions_rfc,columns=['difficulty'])

predictions_rfc.to_csv("RandomForestClassifier.csv")

# Remove stopwords



In [None]:
# Install and update spaCy
!pip install -U spacy

# Download the French language model
!python -m spacy download fr

!python -m spacy download fr_core_news_sm

!pip install git+https://github.com/ClaudeCoulombe/FrenchLefffLemmatizer.git &> /dev/null


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
2022-12-05 16:09:13.532373: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
[38;5;3m⚠ As of spaCy v3.0, shortcuts like 'fr' are deprecated. Please use the
full pipeline package name 'fr_core_news_sm' instead.[0m
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting fr-core-news-sm==3.4.0
  Downloading https://github.com/explosion/spacy-models/releases/download/fr_core_news_sm-3.4.0/fr_core_news_sm-3.4.0-py3-none-any.whl (16.3 MB)
[K     |████████████████████████████████| 16.3 MB 165 kB/s 
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('fr_core_news_sm')
2022-12-05 16:09:24.133914: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable devi

In [None]:
# Import required packages
import spacy
from spacy import displacy
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import pandas as pd
import nltk
import string
from french_lefff_lemmatizer.french_lefff_lemmatizer import FrenchLefffLemmatizer

nltk.download('wordnet')
nltk.download('punkt')
nltk.download('stopwords')

nlp = spacy.load('fr_core_news_sm')


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
#Convert to lowercase
df['sentence'] = df['sentence'].str.lower()

# Define the function to remove the punctuation
def remove_punctuations(text):
    for punctuation in string.punctuation:
        text = text.replace(punctuation, " ")
    return text

# Remove punctuation
df['sentence'] = df['sentence'].apply(remove_punctuations)
df.head()

Unnamed: 0,id,sentence,difficulty
0,0,les coûts kilométriques réels peuvent diverger...,C1
1,1,le bleu c est ma couleur préférée mais je n a...,A1
2,2,le test de niveau en français est sur le site ...,A1
3,3,est ce que ton mari est aussi de boston,A1
4,4,dans les écoles de commerce dans les couloirs...,B1


In [None]:
#Define list of stopwords

french_stopwords = spacy.lang.fr.stop_words.STOP_WORDS

In [None]:
#Remove stopwords
df['sentence'] = df['sentence'].apply(lambda x: ' '.join([word for word in x.split() if word not in (french_stopwords)]))
df.head(10)

Unnamed: 0,id,sentence,difficulty
0,0,coûts kilométriques réels diverger sensiblemen...,C1
1,1,bleu c couleur préférée n aime vert,A1
2,2,test niveau français site internet l école,A1
3,3,mari boston,A1
4,4,écoles commerce couloirs places financières ar...,B1
5,5,histoire j beaucoup aimée,A2
6,6,médecins disent qu boire verre vin rouge repas,A2
7,7,particulièrement observé personnes besoin popu...,B2
8,8,j retrouvé plaisir manger oeuf coque,A2
9,9,bien habitons petite maison ancienne beau jardin,B1


In [None]:
#tokenize
nlp = spacy.load('fr_core_news_sm')

df['tocken_without_stopwords'] = [nlp(text) for text in df.sentence]
df.head()

Unnamed: 0,id,sentence,difficulty,tocken_without_stopwords,len
0,0,coûts kilométriques réels diverger sensiblemen...,C1,"(coûts, kilométriques, réels, diverger, sensib...",23
1,1,bleu c couleur préférée n aime vert,A1,"(bleu, c, couleur, préférée, n, aime, vert)",7
2,2,test niveau français site internet l école,A1,"(test, niveau, français, site, internet, l, éc...",7
3,3,mari boston,A1,"(mari, boston)",2
4,4,écoles commerce couloirs places financières ar...,B1,"(écoles, commerce, couloirs, places, financièr...",22


In [None]:
#add a columns with the lengh 
df['len'] = [len(token) for token in df.tocken_without_stopwords]
df.head()

Unnamed: 0,id,sentence,difficulty,tocken_without_stopwords,len
0,0,coûts kilométriques réels diverger sensiblemen...,C1,"(coûts, kilométriques, réels, diverger, sensib...",23
1,1,bleu c couleur préférée n aime vert,A1,"(bleu, c, couleur, préférée, n, aime, vert)",7
2,2,test niveau français site internet l école,A1,"(test, niveau, français, site, internet, l, éc...",7
3,3,mari boston,A1,"(mari, boston)",2
4,4,écoles commerce couloirs places financières ar...,B1,"(écoles, commerce, couloirs, places, financièr...",22


#Stemming

In [None]:
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer(language='french')

def return_stem(sentence):
    doc = nlp(sentence)
    return [stemmer.stem(X.text) for X in doc]

In [None]:
df['stemmed'] = [return_stem(text) for text in df.sentence]


In [None]:
df.head()

Unnamed: 0,id,sentence,difficulty,tocken_without_stopwords,len,stemmed
0,0,coûts kilométriques réels diverger sensiblemen...,C1,"(coûts, kilométriques, réels, diverger, sensib...",23,"[coût, kilometr, réel, diverg, sensibl, valeur..."
1,1,bleu c couleur préférée n aime vert,A1,"(bleu, c, couleur, préférée, n, aime, vert)",7,"[bleu, c, couleur, préfer, n, aim, vert]"
2,2,test niveau français site internet l école,A1,"(test, niveau, français, site, internet, l, éc...",7,"[test, niveau, franc, sit, internet, l, écol]"
3,3,mari boston,A1,"(mari, boston)",2,"[mar, boston]"
4,4,écoles commerce couloirs places financières ar...,B1,"(écoles, commerce, couloirs, places, financièr...",22,"[écol, commerc, couloir, plac, financi, arriv,..."


#Ensemble TEST

In [8]:
#https://scikit-learn.org/stable/modules/classes.html#module-sklearn.ensemble
#https://towardsdatascience.com/ensemble-methods-or-democracy-for-ai-bac2fa129f61

#https://github.com/crownpku/text2vec

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [14]:
# Define classifier
classifier = LogisticRegressionCV(solver='lbfgs', cv=5, max_iter=1000, random_state=0)


# Create pipeline
## The purpose of the pipeline is to assemble several steps that can be cross-validated together while setting different parameters.
pipe_LRCV1 = Pipeline([('vectorizer', tfidf),
                 ('classifier', classifier)])

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=43)

In [16]:
# Define classifier
classifier = LogisticRegressionCV(solver='lbfgs', cv=5, max_iter=1000, random_state=43)


# Create pipeline
## The purpose of the pipeline is to assemble several steps that can be cross-validated together while setting different parameters.
pipe_LRCV2 = Pipeline([('vectorizer', tfidf),
                 ('classifier', classifier)])

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=12)

In [18]:
# Define classifier
classifier = LogisticRegressionCV(solver='lbfgs', cv=5, max_iter=1000, random_state=12)


# Create pipeline
## The purpose of the pipeline is to assemble several steps that can be cross-validated together while setting different parameters.
pipe_LRCV3 = Pipeline([('vectorizer', tfidf),
                 ('classifier', classifier)])

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=33)

In [20]:
# Define classifier
classifier = LogisticRegressionCV(solver='lbfgs', cv=5, max_iter=1000, random_state=33)


# Create pipeline
## The purpose of the pipeline is to assemble several steps that can be cross-validated together while setting different parameters.
pipe_LRCV4 = Pipeline([('vectorizer', tfidf),
                 ('classifier', classifier)])

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=20)

In [22]:
# Define classifier
classifier = LogisticRegressionCV(solver='lbfgs', cv=5, max_iter=1000, random_state=20)


# Create pipeline
## The purpose of the pipeline is to assemble several steps that can be cross-validated together while setting different parameters.
pipe_LRCV5 = Pipeline([('vectorizer', tfidf),
                 ('classifier', classifier)])

In [23]:
from sklearn.ensemble import VotingClassifier

In [None]:
soft_voting_clf = VotingClassifier(estimators=[('LCRV1', pipe_LRCV1), ('LCRV2', pipe_LRCV2), ('LCRV3', pipe_LRCV3), ('LCRV4', pipe_LRCV4), ('LCRV5', pipe_LRCV5)], voting='soft')

In [None]:
soft_voting_clf.fit(X_train, y_train) # training

In [26]:
predictions = soft_voting_clf.predict(X_pred)

predictions = pd.DataFrame(predictions,columns=['difficulty'])

predictions.to_csv("ensembleLRCV5x5v2.csv")