<a href="https://colab.research.google.com/github/stefarine/DMML2022_ROLEX/blob/main/code/UNIL_ROLEX.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Data Importation

In [48]:
import pandas as pd
import numpy as np

# Labelled sentences (with a `difficulty` column) used for training and validation
df = pd.read_csv(filepath_or_buffer='training_data.csv')
# Unlabelled sentences whose difficulty has to be predicted
df_pred = pd.read_csv(filepath_or_buffer='unlabelled_test_data.csv')

In [49]:
# Preview the first rows of the training data (columns: id, sentence, difficulty)
df.head()

Unnamed: 0,id,sentence,difficulty
0,0,Les coûts kilométriques réels peuvent diverger...,C1
1,1,"Le bleu, c'est ma couleur préférée mais je n'a...",A1
2,2,Le test de niveau en français est sur le site ...,A1
3,3,Est-ce que ton mari est aussi de Boston?,A1
4,4,"Dans les écoles de commerce, dans les couloirs...",B1


In [50]:
# Sentences of the unlabelled set: the input of every final prediction below.
# A bare `df_pred.head()` used to precede this assignment; a notebook cell only
# renders its last expression, so that call produced no output and was dropped.
X_pred = df_pred['sentence']
X_pred

0       Nous dûmes nous excuser des propos que nous eû...
1       Vous ne pouvez pas savoir le plaisir que j'ai ...
2       Et, paradoxalement, boire froid n'est pas la b...
3       Ce n'est pas étonnant, car c'est une saison my...
4       Le corps de Golo lui-même, d'une essence aussi...
                              ...                        
1195    C'est un phénomène qui trouve une accélération...
1196    Je vais parler au serveur et voir si on peut d...
1197    Il n'était pas comme tant de gens qui par pare...
1198        Ils deviennent dangereux pour notre économie.
1199    Son succès a généré beaucoup de réactions néga...
Name: sentence, Length: 1200, dtype: object

In [51]:
# Seed NumPy's global random generator.
# `np.random.seed` is a function and must be called: assigning `= 0` to it
# replaced the function with an integer and seeded nothing.
np.random.seed(0)

In [52]:
# Features are the raw sentences, the target is their difficulty label
X, y = df['sentence'], df['difficulty']

Baseline

In [53]:
# Share of the most frequent difficulty level, i.e. the accuracy obtained by
# always predicting that level.
base_rate = (
    df['difficulty'].value_counts().max() / len(df['difficulty'])
).round(4)
base_rate

0.1694

Logistic Regression (without data cleaning)

In [54]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score


In [55]:
# TF-IDF features over single-word tokens (ngram_range=(1, 1)), using the
# vectorizer's default tokenizer.
# This one instance is passed to every pipeline built in this notebook, so
# fitting any of those pipelines refits this same object.
tfidf = TfidfVectorizer(ngram_range=(1, 1))


In [56]:
# Hold out 20% of the labelled data for evaluation; random_state=0 pins the
# shuffle so the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [57]:
# Logistic regression with scikit-learn's default settings
classifier = LogisticRegression()

# Chain vectorisation and classification into one estimator so that both
# steps are always fitted and applied together.
pipe = Pipeline(steps=[
    ('vectorizer', tfidf),
    ('classifier', classifier),
])

# Train on the training split; the fitted pipeline is the cell's output
pipe.fit(X_train, y_train)

Pipeline(steps=[('vectorizer', TfidfVectorizer()),
                ('classifier', LogisticRegression())])

In [58]:
# Which averaging mode?
# 'macro' (the default below) gives every difficulty level the same weight.
# The base rate computed earlier (0.1694 for six levels) shows the classes are
# close to balanced, so 'weighted' would give nearly the same figures. The mode
# is a parameter so that other modes can be compared without editing this cell.

def evaluate(true, pred, average='macro'):
    """Print the confusion matrix, accuracy and averaged precision/recall/F1.

    Parameters
    ----------
    true : array-like
        Ground-truth labels.
    pred : array-like
        Predicted labels, in the same order as ``true``.
    average : str, default 'macro'
        Averaging mode forwarded unchanged to ``precision_score``,
        ``recall_score`` and ``f1_score``. The default reproduces the
        previously hard-coded behaviour.

    Returns
    -------
    None
        The metrics are printed, not returned.
    """
    precision = precision_score(true, pred, average=average)
    recall = recall_score(true, pred, average=average)
    f1 = f1_score(true, pred, average=average)
    print(f"CONFUSION MATRIX:\n{confusion_matrix(true, pred)}")
    print(f"ACCURACY SCORE:\n{accuracy_score(true, pred):.4f}")
    print(f"CLASSIFICATION REPORT:\n\tPrecision: {precision:.4f}\n\tRecall: {recall:.4f}\n\tF1_Score: {f1:.4f}")

In [59]:
# Predict the difficulty of the held-out sentences with the fitted
# logistic-regression pipeline
y_pred = pipe.predict(X_test)

# Print confusion matrix, accuracy and precision/recall/F1 for the held-out split
evaluate(y_test, y_pred)

CONFUSION MATRIX:
[[93 31 21 10  4  2]
 [54 60 30  6  6  8]
 [12 38 64 17  9 20]
 [ 6  6 15 66 27 24]
 [ 4  4 10 37 73 45]
 [ 7  8  8 19 24 92]]
ACCURACY SCORE:
0.4667
CLASSIFICATION REPORT:
	Precision: 0.4645
	Recall: 0.4677
	F1_Score: 0.4640


In [60]:
# Predict the unlabelled sentences with the logistic-regression pipeline and
# store the labels in a single-column frame
predictions = pd.DataFrame({'difficulty': pipe.predict(X_pred)})

# Write the predictions (row index included) to disk
predictions.to_csv("LogisticRegression.csv")

KNN (without data cleaning)

In [61]:
# KNeighborsClassifier is first needed in this section
from sklearn.neighbors import KNeighborsClassifier

# k-nearest neighbours with scikit-learn's default settings
classifier_knn = KNeighborsClassifier()

# Same TF-IDF vectorizer as before, followed by the KNN classifier
pipe_knn = Pipeline(steps=[
    ('vectorizer', tfidf),
    ('classifier', classifier_knn),
])

# Train on the training split, then predict the held-out split
pipe_knn.fit(X_train, y_train)
y_pred_knn = pipe_knn.predict(X_test)

# Print the metrics for the held-out split
evaluate(y_test, y_pred_knn)

CONFUSION MATRIX:
[[121  28   8   1   1   2]
 [ 98  51  12   1   1   1]
 [ 81  39  33   3   1   3]
 [ 49  30  19  29   3  14]
 [ 48  36  29  15  29  16]
 [ 37  29  17  23   9  43]]
ACCURACY SCORE:
0.3187
CLASSIFICATION REPORT:
	Precision: 0.4007
	Recall: 0.3183
	F1_Score: 0.3022


In [62]:
# Predict the unlabelled sentences with the KNN pipeline
predictions_knn = pipe_knn.predict(X_pred)

# Wrap the KNN output itself. The previous code wrapped `predictions` (the
# logistic-regression results), so KNN.csv did not contain KNN predictions.
predictions_knn = pd.DataFrame(predictions_knn, columns=['difficulty'])

predictions_knn.to_csv("KNN.csv")

KNN improved

In [63]:
# Hyperparameter tuning for KNN through an exhaustive grid search
from sklearn.model_selection import GridSearchCV

# Candidate values: 1..99 neighbours, Minkowski power p in {1, 2},
# and both neighbour-weighting schemes
grid = dict(
    n_neighbors=np.arange(1, 100),
    p=np.arange(1, 3),
    weights=['uniform', 'distance'],
)

# 10-fold cross-validated search wrapped around a default KNN classifier
knn = KNeighborsClassifier()
classifier_knn_plus = GridSearchCV(estimator=knn, param_grid=grid, cv=10)

# The search is the final step of the pipeline, so it runs on TF-IDF features
pipe_knn_plus = Pipeline(steps=[
    ('vectorizer', tfidf),
    ('classifier', classifier_knn_plus),
])

pipe_knn_plus.fit(X_train, y_train)

# Report the best parameter combination found by the search
print("Hyperparameters:", classifier_knn_plus.best_params_)

Hyperparameters: {'n_neighbors': 4, 'p': 2, 'weights': 'distance'}


In [71]:
# Predict the held-out sentences with the grid-searched KNN pipeline
y_pred_knn_plus = pipe_knn_plus.predict(X_test)

# Print confusion matrix, accuracy and precision/recall/F1 for the held-out split
evaluate(y_test, y_pred_knn_plus)


CONFUSION MATRIX:
[[116  27  14   1   1   2]
 [ 79  64  15   4   1   1]
 [ 65  38  44   8   2   3]
 [ 37  25  23  40   2  17]
 [ 37  30  23  26  36  21]
 [ 34  22  19  13  17  53]]
ACCURACY SCORE:
0.3677
CLASSIFICATION REPORT:
	Precision: 0.4227
	Recall: 0.3678
	F1_Score: 0.3575


In [65]:
# Predict the unlabelled sentences with the tuned KNN pipeline
predictions_knn_plus = pipe_knn_plus.predict(X_pred)

# Wrap the tuned-KNN output itself. The previous code wrapped `predictions`
# (the logistic-regression results), so Knn_plus.csv held the wrong model's output.
predictions_knn_plus = pd.DataFrame(predictions_knn_plus, columns=['difficulty'])

predictions_knn_plus.to_csv("Knn_plus.csv")

Decision Tree Classifier (without data cleaning)

In [77]:
# Import Decision Tree Classifier
from sklearn.tree import DecisionTreeClassifier

# Define classifier.
# random_state is fixed (same value as the train/test split) so that the tree,
# and therefore the scores printed below, are identical on every run.
classifier_dtc = DecisionTreeClassifier(random_state=0)

# Create pipeline: shared TF-IDF vectorizer followed by the decision tree
pipe_dtc = Pipeline([('vectorizer', tfidf),
                 ('classifier', classifier_dtc)])

# Fit model on training set
pipe_dtc.fit(X_train, y_train)

# Predictions on the held-out split
y_pred_dtc = pipe_dtc.predict(X_test)

# Print the metrics for the held-out split
evaluate(y_test, y_pred_dtc)

CONFUSION MATRIX:
[[81 38 23  7  5  7]
 [46 57 32 20  5  4]
 [25 39 39 24 17 16]
 [ 7 20 29 39 32 17]
 [12 18 28 36 45 34]
 [13 12 31 34 34 34]]
ACCURACY SCORE:
0.3073
CLASSIFICATION REPORT:
	Precision: 0.3063
	Recall: 0.3068
	F1_Score: 0.3038


In [67]:
# Predict the unlabelled sentences with the decision-tree pipeline
predictions_dtc = pipe_dtc.predict(X_pred)

# Wrap the decision-tree output itself. The previous code wrapped `predictions`
# (the logistic-regression results), so DecisionTreeClassifier.csv held the
# wrong model's output.
predictions_dtc = pd.DataFrame(predictions_dtc, columns=['difficulty'])

predictions_dtc.to_csv("DecisionTreeClassifier.csv")

Random Forest Classifier (without data cleaning)

In [68]:
# Use random forest
from sklearn.ensemble import RandomForestClassifier

# Define classifier.
# random_state is fixed (same value as the train/test split) so that the
# forest, and therefore the scores printed below, are identical on every run.
classifier_rfc = RandomForestClassifier(random_state=0)

# Create pipeline: shared TF-IDF vectorizer followed by the random forest
pipe_rfc = Pipeline([('vectorizer', tfidf),
                 ('classifier', classifier_rfc)])

# Fit model on training set
pipe_rfc.fit(X_train, y_train)

# Predictions on the held-out split
y_pred_rfc = pipe_rfc.predict(X_test)

# Print the metrics for the held-out split
evaluate(y_test, y_pred_rfc)

CONFUSION MATRIX:
[[121  27   6   6   0   1]
 [ 77  53  23   9   1   1]
 [ 32  43  48  22   6   9]
 [ 16  15  15  70  16  12]
 [ 17   9  24  56  40  27]
 [ 13  14  17  26  28  60]]
ACCURACY SCORE:
0.4083
CLASSIFICATION REPORT:
	Precision: 0.4140
	Recall: 0.4120
	F1_Score: 0.3965


In [69]:
# Predict the unlabelled sentences with the random-forest pipeline
predictions_rfc = pipe_rfc.predict(X_pred)

# Wrap the random-forest output itself. The previous code wrapped `predictions`
# (the logistic-regression results), so RandomForestClassifier.csv held the
# wrong model's output.
predictions_rfc = pd.DataFrame(predictions_rfc, columns=['difficulty'])

predictions_rfc.to_csv("RandomForestClassifier.csv")