In [3]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
import joblib

In [None]:

train_data = pd.read_csv("train_submission.csv", na_filter=False)
test_data = pd.read_csv("test_without_labels.csv")

In [None]:


import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download("stopwords")
nltk.download("wordnet")

stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    text = text.lower()  # Lowercase
    text = re.sub(r"http\S+|www\S+|https\S+", "", text)  # Remove URLs
    text = re.sub(r"\d+", "", text)  # Remove digits
    text = re.sub(r"[^\w\s]", "", text)  # Remove punctuation
    text = " ".join([lemmatizer.lemmatize(word) for word in text.split() if word not in stop_words])  # Lemmatization + Stopwords
    return text.strip()


train_data["Text"] = train_data["Text"].apply(preprocess_text)
test_data["Text"] = test_data["Text"].apply(preprocess_text)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...


In [None]:

X_train, X_valid, y_train, y_valid = train_test_split(train_data['Text'], train_data['Label'], test_size=0.2, random_state=42)

In [7]:
# Création du pipeline avec TfidfVectorizer et LogisticRegression
pipeline = Pipeline([
    ('tfidf',TfidfVectorizer(
    analyzer="word",  # Switch from char to word-level analysis
    ngram_range=(1, 3),  # Include unigrams, bigrams, trigrams
    max_features=20000,  # Increase feature count
    stop_words="english"  # Remove common words
)),
    ('clf', LogisticRegression())
])

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform, randint
param_distributions = {
    'tfidf__ngram_range': [(1, 2), (1, 3), (1, 4), (2, 3), (2, 4)],
    'tfidf__max_features': randint(5000, 15000),
    'clf__C': uniform(0.1, 10)
}

random_search = RandomizedSearchCV(
    pipeline,
    param_distributions=param_distributions,
    n_iter=15,  
    cv=3,       
    scoring='accuracy',
    verbose=3,
    n_jobs=-1,
    random_state=42
)

random_search.fit(X_train, y_train)
print("Best Hyperparameters:", random_search.best_params_)

Fitting 3 folds for each of 15 candidates, totalling 45 fits




Best Hyperparameters: {'clf__C': 9.83755518841459, 'tfidf__max_features': 14998, 'tfidf__ngram_range': (1, 4)}


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:

best_model = random_search.best_estimator_
y_pred = best_model.predict(X_valid)
accuracy = accuracy_score(y_valid, y_pred)
print(f"Accuracy on validation Set: {accuracy:.3f}")

Accuracy on validation Set: 0.686
