In [None]:
from google.colab import drive
# Attach Google Drive to the Colab filesystem; the CSV paths used below live under this mount point
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
import pandas as pd

# Locations of the three competition CSV files on the mounted Drive
training_data_path = '/content/drive/MyDrive/training_data.csv'
unlabelled_test_data_path = '/content/drive/MyDrive/unlabelled_test_data.csv'
sample_submission_path = '/content/drive/MyDrive/sample_submission.csv'

# Parse each CSV into its own DataFrame, in the same order as the paths above
training_data, unlabelled_test_data, sample_submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        training_data_path,
        unlabelled_test_data_path,
        sample_submission_path,
    )
)

# The loaded DataFrames are displayed in the next cell

In [None]:
# Render the three loaded frames one after another (labelled training set,
# unlabelled test set, sample submission)
display(training_data, unlabelled_test_data, sample_submission)

Unnamed: 0,id,sentence,difficulty
0,0,Les coûts kilométriques réels peuvent diverger...,C1
1,1,"Le bleu, c'est ma couleur préférée mais je n'a...",A1
2,2,Le test de niveau en français est sur le site ...,A1
3,3,Est-ce que ton mari est aussi de Boston?,A1
4,4,"Dans les écoles de commerce, dans les couloirs...",B1
...,...,...,...
4795,4795,"C'est pourquoi, il décida de remplacer les hab...",B2
4796,4796,Il avait une de ces pâleurs splendides qui don...,C1
4797,4797,"Et le premier samedi de chaque mois, venez ren...",A2
4798,4798,Les coûts liés à la journalisation n'étant pas...,C2


Unnamed: 0,id,sentence
0,0,Nous dûmes nous excuser des propos que nous eû...
1,1,Vous ne pouvez pas savoir le plaisir que j'ai ...
2,2,"Et, paradoxalement, boire froid n'est pas la b..."
3,3,"Ce n'est pas étonnant, car c'est une saison my..."
4,4,"Le corps de Golo lui-même, d'une essence aussi..."
...,...,...
1195,1195,C'est un phénomène qui trouve une accélération...
1196,1196,Je vais parler au serveur et voir si on peut d...
1197,1197,Il n'était pas comme tant de gens qui par pare...
1198,1198,Ils deviennent dangereux pour notre économie.


Unnamed: 0,id,difficulty
0,0,A1
1,1,A1
2,2,A1
3,3,A1
4,4,A1
...,...,...
1195,1195,A1
1196,1196,A1
1197,1197,A1
1198,1198,A1


# KNN

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder

# Re-read the labelled sentences so this section does not depend on earlier cells
training_data = pd.read_csv('/content/drive/MyDrive/training_data.csv')

# Inputs are the raw sentences; targets are their difficulty labels
X, y = training_data['sentence'], training_data['difficulty']

# Turn the string labels into integer codes (label_encoder.classes_ keeps the names)
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    random_state=42,
)

# TF-IDF features: vocabulary/IDF are learned on the training split only,
# then applied unchanged to the validation split
vectorizer = TfidfVectorizer()
X_train_transformed = vectorizer.fit_transform(X_train)
X_val_transformed = vectorizer.transform(X_val)

# Base KNN estimator and the grid of settings to try
knn = KNeighborsClassifier()
param_grid = {
    'n_neighbors': [3, 5, 7, 9],
    'weights': ['uniform', 'distance'],
}

# Grid search with cv=5, selecting the combination with the best accuracy
grid_search = GridSearchCV(
    estimator=knn,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
)
grid_search.fit(X_train_transformed, y_train)

# Keep the best estimator found by the search
best_knn = grid_search.best_estimator_

# Score the held-out validation split and collect the per-class report as a dict
y_pred = best_knn.predict(X_val_transformed)
report = classification_report(
    y_val,
    y_pred,
    target_names=label_encoder.classes_,
    output_dict=True,
)

In [None]:
# Validation accuracy of the predictions made in the previous cell
accuracy = accuracy_score(y_val, y_pred)

# Read precision, recall and F1 from the report's 'weighted avg' entry
weighted_avg = report['weighted avg']
precision = weighted_avg['precision']
recall = weighted_avg['recall']
f1_score = weighted_avg['f1-score']

In [None]:
# One-row summary of the KNN run, indexed by model name
results_df = pd.DataFrame(
    {
        "Model": ["KNN"],
        "Precision": [precision],
        "Recall": [recall],
        "F1-Score": [f1_score],
        "Accuracy": [accuracy],
    }
).set_index("Model")

# Print the summary table as plain text
print(results_df)

       Precision    Recall  F1-Score  Accuracy
Model                                         
KNN     0.404224  0.358333   0.34642  0.358333


# Logistic Regression Model

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder

# Re-read the labelled sentences so this section does not depend on earlier cells
training_data = pd.read_csv('/content/drive/MyDrive/training_data.csv')

# Inputs are the raw sentences; targets are their difficulty labels
X, y = training_data['sentence'], training_data['difficulty']

# Turn the string labels into integer codes (label_encoder.classes_ keeps the names)
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    random_state=42,
)

# TF-IDF features: fitted on the training split, reused as-is on the validation split
vectorizer = TfidfVectorizer()
X_train_transformed = vectorizer.fit_transform(X_train)
X_val_transformed = vectorizer.transform(X_val)

# Base estimator; max_iter is raised to 1000 iterations
logistic_model = LogisticRegression(max_iter=1000)

# Grid of regularisation strengths and solvers to compare
param_grid = {
    'C': [0.1, 1, 10],
    'solver': ['liblinear', 'lbfgs'],
}

# Grid search with cv=5, selecting the combination with the best accuracy
grid_search = GridSearchCV(
    estimator=logistic_model,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
)
grid_search.fit(X_train_transformed, y_train)

# Keep the best estimator found by the search
best_logistic = grid_search.best_estimator_

# Score the held-out validation split and collect the per-class report as a dict
y_pred = best_logistic.predict(X_val_transformed)
report = classification_report(
    y_val,
    y_pred,
    target_names=label_encoder.classes_,
    output_dict=True,
)

# Validation accuracy plus the report's 'weighted avg' precision / recall / F1
accuracy = accuracy_score(y_val, y_pred)
weighted_avg = report['weighted avg']
precision = weighted_avg['precision']
recall = weighted_avg['recall']
f1_score = weighted_avg['f1-score']

# One-row summary of the Logistic Regression run, indexed by model name
results_df = pd.DataFrame(
    {
        "Model": ["Logistic Regression"],
        "Precision": [precision],
        "Recall": [recall],
        "F1-Score": [f1_score],
        "Accuracy": [accuracy],
    }
).set_index("Model")

# Print the summary table as plain text
print(results_df)

                     Precision    Recall  F1-Score  Accuracy
Model                                                       
Logistic Regression   0.464049  0.466667  0.462684  0.466667


# Decision Tree

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder

# Load the training data
training_data = pd.read_csv('/content/drive/MyDrive/training_data.csv')

# Separate features and labels
X = training_data['sentence']  # Features (text data)
y = training_data['difficulty']  # Labels (difficulty level)

# Encode the string labels as integer codes (label_encoder.classes_ keeps the names)
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Split the data into training (80%) and validation (20%) sets with a fixed seed
X_train, X_val, y_train, y_val = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# Text vectorization: fit TF-IDF on the training split only, then reuse it on validation
vectorizer = TfidfVectorizer()
X_train_transformed = vectorizer.fit_transform(X_train)
X_val_transformed = vectorizer.transform(X_val)

# Decision Tree model.
# random_state is fixed so that tree construction, the selected hyperparameters
# and the metrics reported below are reproducible from one run to the next.
decision_tree = DecisionTreeClassifier(random_state=42)

# Hyperparameter tuning with cv=5, selecting on accuracy
param_grid = {'max_depth': [None, 10, 20, 30], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4]}
grid_search = GridSearchCV(decision_tree, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train_transformed, y_train)

# Best Decision Tree model
best_decision_tree = grid_search.best_estimator_

# Predictions and evaluation on the held-out validation split
y_pred = best_decision_tree.predict(X_val_transformed)
report = classification_report(y_val, y_pred, target_names=label_encoder.classes_, output_dict=True)

# Extract weighted average values for precision, recall, and F1-score
precision = report['weighted avg']['precision']
recall = report['weighted avg']['recall']
f1_score = report['weighted avg']['f1-score']
accuracy = accuracy_score(y_val, y_pred)

# Creating a DataFrame to display results.
# "Accuracy" must be the validation accuracy computed above, like the other
# three columns. grid_search.best_score_ is the mean cross-validation score on
# the training folds and is not comparable with the validation metrics.
results_df = pd.DataFrame({
    "Model": ["Decision Tree"],
    "Precision": [precision],
    "Recall": [recall],
    "F1-Score": [f1_score],
    "Accuracy": [accuracy]
})

# Formatting and displaying the DataFrame
results_df.set_index("Model", inplace=True)
print(results_df)

               Precision    Recall  F1-Score  Accuracy
Model                                                 
Decision Tree   0.289112  0.294792  0.288156  0.327865


# Random Forest

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder

# Load the training data
training_data = pd.read_csv('/content/drive/MyDrive/training_data.csv')

# Separate features and labels
X = training_data['sentence']  # Features (text data)
y = training_data['difficulty']  # Labels (difficulty level)

# Encode the string labels as integer codes (label_encoder.classes_ keeps the names)
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Split the data into training (80%) and validation (20%) sets with a fixed seed
X_train, X_val, y_train, y_val = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# Text vectorization: fit TF-IDF on the training split only, then reuse it on validation
vectorizer = TfidfVectorizer()
X_train_transformed = vectorizer.fit_transform(X_train)
X_val_transformed = vectorizer.transform(X_val)

# Random Forest model.
# random_state is fixed so that the forest, the selected hyperparameters and
# the metrics reported below are reproducible from one run to the next.
random_forest = RandomForestClassifier(random_state=42)

# Hyperparameter tuning
param_grid = {
    'n_estimators': [100, 200, 300],  # Number of trees
    'max_depth': [None, 10, 20, 30],  # Maximum depth of the tree
    'min_samples_split': [2, 5, 10],  # Minimum number of samples required to split an internal node
    'min_samples_leaf': [1, 2, 4]     # Minimum number of samples required to be at a leaf node
}
# The grid has 3 * 4 * 3 * 3 = 108 candidates, each fitted 5 times (cv=5);
# n_jobs=-1 spreads those fits over all available cores without changing the results.
grid_search = GridSearchCV(random_forest, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train_transformed, y_train)

# Best Random Forest model
best_random_forest = grid_search.best_estimator_

# Predictions and evaluation on the held-out validation split
y_pred = best_random_forest.predict(X_val_transformed)
report = classification_report(y_val, y_pred, target_names=label_encoder.classes_, output_dict=True)

# Extract weighted average values for precision, recall, and F1-score
precision = report['weighted avg']['precision']
recall = report['weighted avg']['recall']
f1_score = report['weighted avg']['f1-score']
accuracy = accuracy_score(y_val, y_pred)

# Creating a DataFrame to display results (all four metrics are validation-set metrics)
results_df = pd.DataFrame({
    "Model": ["Random Forest"],
    "Precision": [precision],
    "Recall": [recall],
    "F1-Score": [f1_score],
    "Accuracy": [accuracy]
})

# Formatting and displaying the DataFrame
results_df.set_index("Model", inplace=True)
print(results_df)

               Precision    Recall  F1-Score  Accuracy
Model                                                 
Random Forest   0.407051  0.404167  0.391149  0.404167


In [None]:
# Report what the most recently fitted grid search selected: the winning
# hyperparameter combination and its mean cross-validation accuracy
print(f"Best Random Forest Parameters: {grid_search.best_params_!s}")
print(f"Best Random Forest Accuracy: {grid_search.best_score_!s}")


# BERT Model

Which is the best model?

Show the confusion matrix.

Show some examples of erroneous predictions. Can you understand where the errors are coming from?

Do some more analysis to better understand how your model behaves.

Submit your predictions so that you have a position on this competition's leaderboard.