In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
import joblib

In [None]:
# Chargement des données d'entraînement et de test
train_data = pd.read_csv("train_submission.csv", na_filter=False)
test_data = pd.read_csv("test_without_labels.csv")

In [None]:
import re
#Pretraitement du dataset
def preprocess_text(text):
    # Convertir en minuscules
    text = text.lower()
    # Supprimer la ponctuation et les caractères spéciaux
    text = re.sub(r'[^\w\s]', '', text)
    return text

train_data['Text'] = train_data['Text'].apply(preprocess_text)
test_data['Text'] = test_data['Text'].apply(preprocess_text)

In [None]:
! pip install --upgrade scikit-learn==1.6.1


In [None]:
# Séparation des données en ensembles d'entraînement et de test
X_train, X_valid, y_train, y_valid = train_test_split(train_data['Text'], train_data['Label'], test_size=0.2, random_state=42)

In [5]:
# Création du pipeline avec TfidfVectorizer et LogisticRegression
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(analyzer='char')),
    ('clf', LogisticRegression())
])

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform, randint
# definition des parametres
param_distributions = {
    'tfidf__ngram_range': [(1, 2), (1, 3), (1, 4), (2, 3), (2, 4)],
    'tfidf__max_features': randint(5000, 15000),
    'clf__C': uniform(0.1, 10)
}

# utilisation de RandomizedSearchCV
random_search = RandomizedSearchCV(
    pipeline,
    param_distributions=param_distributions,
    n_iter=10,  # nombre de combinaison a essayer
    cv=3,      
    scoring='accuracy',
    verbose=3,
    n_jobs=-1, #utiliser 100% du cpu
    random_state=42
)

# entrainement
random_search.fit(X_train, y_train)
# meuilleurs hyperparametres
print("Best Hyperparameters:", random_search.best_params_)

Fitting 3 folds for each of 10 candidates, totalling 30 fits




Best Hyperparameters: {'clf__C': 9.83755518841459, 'tfidf__max_features': 14998, 'tfidf__ngram_range': (1, 4)}


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [4]:
import sklearn
import pandas as pd

print("scikit-learn version:", sklearn.__version__)
print("pandas version:", pd.__version__)

scikit-learn version: 1.6.1
pandas version: 2.2.3


In [None]:


# evaluation sur le set de validation
best_model = random_search.best_estimator_
y_pred = best_model.predict(X_valid)
accuracy = accuracy_score(y_valid, y_pred)
print(f"Accuracy on validation Set: {accuracy:.3f}")

Accuracy on Test Set: 0.839


In [None]:
# prediction des labels sur le test set
test_data["ID"] = test_data.index + 1  # IDs uniques
test_data["Label"] = best_model.predict(test_data["Text"])

# sauvegarder les resultats dans un csv file
test_data[["ID", "Label"]].to_csv("test_predictions3.csv", index=False)

print(test_data.head())

     Usage                                               Text  ID Label
0  Private  hüttwilen el xe on comune del canton turgovia ...   1   ven
1  Private  la leĝo zorgas pri kompenso de nur la plej gra...   2   epo
2  Private               پک اپ پر اپنے ڈرائیور سے پہلے پہنچیں   3   urd
3  Private  mukmu  chitana mukmu icha butun nisqaqa nisqaq...   4   quy
4  Private  iwe   lon ena fansoun   lupwen ra aleani än mo...   5   chk
