In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

# Load the training set; the file is tab-separated.
df = pd.read_csv('train.tsv', sep='\t')

# General information about the dataset

In [None]:
# Quick overview: column names, rows per category, and the first rows.
for summary in (df.keys(), df['category'].value_counts(), df.head()):
    print(summary)

In [None]:
# Keep at most the first 76 rows of every category.
ROWS_PER_CATEGORY = 76
df = df.groupby('category').head(ROWS_PER_CATEGORY)

## Let's see how the lengths of the headlines and texts are distributed

In [None]:
# Histogram of headline lengths in characters. `.str.len()` is vectorized and
# yields NaN for missing values instead of raising like `apply(len)` would.
ax = df['headline'].str.len().plot.hist(bins=50)
ax.set(title='Headline length distribution', xlabel='Headline length (characters)');

In [None]:
# Histogram of article text lengths in characters. `.str.len()` is vectorized
# and yields NaN for missing values instead of raising like `apply(len)` would.
ax = df["text"].str.len().plot.hist(bins=200)
ax.set(title='Text length distribution', xlabel='Text length (characters)');

### Findings
- The URL column is not relevant, so we can drop it
- Technology is clearly underrepresented in the dataset (about 1/5 of the support of the other categories)
- Text lengths are mostly in the range 0-10_000 characters; a very small minority goes up to 40_000 characters.

We will need to take this information into account to train the best model possible.

In [2]:
import nltk
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
import string
nltk.download("stopwords")
nltk.download('punkt')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\spoto\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\spoto\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [5]:
# Features are the two text columns; the target is the article category.
X, y = df[['headline', 'text']], df["category"]

# The whole (sub-sampled) frame is used for training.
X_train, y_train = X, y

NameError: name 'df' is not defined

## Vectorization with Stemming

In [6]:

# French Snowball stemmer shared by the tokenizer below and the stop-word list
stemmer = SnowballStemmer('french')


def tokenize_with_stemming(text):
    """Lowercase ``text``, split it into French tokens and return their stems.

    Args:
        text: the raw document string to tokenize.

    Returns:
        A list with one stemmed token per token produced by ``word_tokenize``.
    """
    words = word_tokenize(text.lower(), language='french')
    return list(map(stemmer.stem, words))


# Stop-word candidates: NLTK's French list, ASCII punctuation and a few
# quote / ellipsis tokens.
extra_tokens = ["''", '``', '...', '’', '``', '«', '»', '``']
raw_stopwords = stopwords.words('french') + list(string.punctuation) + extra_tokens

# in order for the stop words to be consistent with preprocessing
french_stopwords = list(map(stemmer.stem, raw_stopwords))

# TF-IDF over stemmed tokens, limited to 800 features.
vectorizer = TfidfVectorizer(
    max_features=800,
    stop_words=french_stopwords,
    tokenizer=tokenize_with_stemming,
)

# X_train_tfidf = vectorizer.fit_transform(
#     X_train['headline'] + ' ' + X_train['text'])

In [None]:
# Fit the stemming vectorizer on headline + text here: the only earlier
# assignment of X_train_tfidf is commented out, so on a fresh kernel this cell
# would otherwise fail with a NameError.
X_train_tfidf = vectorizer.fit_transform(
    X_train['headline'] + ' ' + X_train['text'])

# Dense bag-of-words view: one row per document, one column per vocabulary term.
desc_bow = pd.DataFrame(X_train_tfidf.toarray(), columns=vectorizer.get_feature_names_out())
# desc_bow

In [14]:
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline
from imblearn.ensemble import BalancedRandomForestClassifier

# Apply the TF-IDF vectorizer to each text column separately.
text_columns = ('headline', 'text')
col_trans = ColumnTransformer(
    [(column, vectorizer, column) for column in text_columns],
)

# Vectorization followed by a random forest with default hyper-parameters.
pipe = make_pipeline(col_trans, RandomForestClassifier())

In [None]:
# Train the vectorizer + random forest pipeline on the training data.
pipe.fit(X_train, y_train)

In [None]:
# Load the held-out test set (tab-separated).
df_test = pd.read_csv('test.tsv', sep='\t')

# show count of each category
category_counts = df_test['category'].value_counts()
print(category_counts)

# Same feature / target columns as for the training data.
X_test, y_test = df_test[['headline', 'text']], df_test["category"]

In [None]:
# Predict the test categories and print per-class precision / recall / F1.
y_pred = pipe.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)

In [None]:
# Flag every test row whose predicted category differs from the true one.
df_test['predicted_category'] = y_pred
df_test['correct'] = df_test['category'] == df_test['predicted_category']

# Copy the first few misclassified rows to the clipboard for inspection.
shown_columns = ['headline', "category", "predicted_category"]
df_test.loc[~df_test['correct'], shown_columns].head().to_clipboard()

In [None]:
import seaborn as sns
# confusion_matrix orders its rows/columns by sorted label value, whereas
# df['category'].unique() is in order of first appearance. Use one explicit,
# sorted label list for both the matrix and the tick labels so they always agree.
category_labels = sorted(set(y_test) | set(y_pred))
conf_mat = confusion_matrix(y_test, y_pred, labels=category_labels)

fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(conf_mat, annot=True, fmt='d',
            xticklabels=category_labels, yticklabels=category_labels, ax=ax)
ax.set_ylabel('Actual')
ax.set_xlabel('Predicted')
plt.show()

Let's see which type of model performs best

In [None]:
from sklearn.dummy import DummyClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
import warnings
# Silence all library warnings for the comparison below.
warnings.filterwarnings('ignore')

# Candidate classifiers, all with default hyper-parameters; the dummy
# classifier provides the majority-class baseline.
models = [
    ('Baseline', DummyClassifier(strategy='most_frequent')),
    ('Multinomial NB', MultinomialNB()),
    ('CART', DecisionTreeClassifier()),
    ('LR', LogisticRegression()),
    ('KNN', KNeighborsClassifier()),
    ('Random forest', RandomForestClassifier()),
]

# Fit each candidate behind the shared column transformer and report its
# accuracy on the test set.
for name, model in models:
    pipe = make_pipeline(col_trans, model).fit(X_train, y_train)
    y_pred = pipe.predict(X_test)
    print(f'{name}: {accuracy_score(y_test, y_pred)}')

# Cross validation for LR and Random Forest

In [None]:
from sklearn.model_selection import cross_val_score

def evaluate_model(name, model, X, y):
    """Print the mean 5-fold cross-validation score of ``model`` on (X, y).

    The model is wrapped in a pipeline behind the notebook-level ``col_trans``
    before being cross-validated.

    Args:
        name: label printed in front of the score.
        model: estimator placed after ``col_trans`` in the pipeline.
        X: feature frame passed to ``cross_val_score``.
        y: target values passed to ``cross_val_score``.
    """
    fold_scores = cross_val_score(make_pipeline(col_trans, model), X, y, cv=5)
    print(f'{name}: Mean Accuracy: {fold_scores.mean()}')


# Cross-validate the two candidates on the training data.
evaluate_model("LR", LogisticRegression(), X_train, y_train)
evaluate_model("Random Forest", RandomForestClassifier(), X_train, y_train)


## Lemmatization

In [None]:
%%capture
!python -m spacy download fr_core_news_sm

In [None]:
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer

# Load the small French spaCy pipeline.
nlp = spacy.load("fr_core_news_sm")

# spaCy's default French stop words, used by the custom tokenizer for filtering.
spacy_stopwords = nlp.Defaults.stop_words

In [None]:
def spacy_tokenizer(text):
    """Tokenize ``text`` with spaCy and return the lemmas of the kept tokens.

    A token is dropped when its lowercased text is in ``spacy_stopwords`` or
    when it is not purely alphabetic.
    """
    lemmas = []
    for token in nlp(text):
        if token.text.lower() in spacy_stopwords:
            continue
        if not token.is_alpha:
            continue
        lemmas.append(token.lemma_)
    return lemmas

# TF-IDF over spaCy lemmas, limited to 1000 features.
vectorizer = TfidfVectorizer(max_features=1000, tokenizer=spacy_tokenizer)

X, y = df[['headline', 'text']], df["category"]

# Fit on headline and text joined into a single document per row.
combined_text = X['headline'] + ' ' + X['text']
X_train_tfidf = vectorizer.fit_transform(combined_text)

In [None]:
# Dense view of the TF-IDF matrix with one column per vocabulary term.
feature_names = vectorizer.get_feature_names_out()
desc_bow = pd.DataFrame(X_train_tfidf.toarray(), columns=feature_names)
desc_bow

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline

# Rebuild the per-column transformer around the current `vectorizer`.
col_trans = ColumnTransformer(
    transformers=[
        ('headline', vectorizer, 'headline'),
        ('text', vectorizer, 'text'),
    ],
)

# Vectorization followed by a default random forest, trained on the training data.
forest = RandomForestClassifier()
pipe = make_pipeline(col_trans, forest)
pipe.fit(X_train, y_train)

In [12]:
import pandas as pd
# Reload the tab-separated test set.
df_test = pd.read_csv('test.tsv', sep='\t')

# show count of each category
test_category_counts = df_test['category'].value_counts()
print(test_category_counts)

# Text columns as features, category as target.
X_test, y_test = df_test[['headline', 'text']], df_test["category"]

category
business      100
health        100
politics      100
sports        100
technology     22
Name: count, dtype: int64


In [None]:
# Predict the test categories with the current pipeline and print the
# per-class report.
y_pred = pipe.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)

In [None]:
import seaborn as sns
# Tick labels must follow the same order as the matrix rows/columns.
# confusion_matrix sorts labels, while df['category'].unique() keeps order of
# first appearance, so build one sorted label list and use it everywhere.
category_labels = sorted(set(y_test) | set(y_pred))
conf_mat = confusion_matrix(y_test, y_pred, labels=category_labels)

fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(conf_mat, annot=True, fmt='d',
            xticklabels=category_labels, yticklabels=category_labels, ax=ax)
ax.set_ylabel('Actual')
ax.set_xlabel('Predicted')
plt.show()

In [None]:
from sklearn.dummy import DummyClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
import warnings
# Silence all library warnings for the comparison below.
warnings.filterwarnings('ignore')

# Same candidates as before, except that the decision tree uses balanced
# class weights.
models = [
    ('Baseline', DummyClassifier(strategy='most_frequent')),
    ('Multinomial NB', MultinomialNB()),
    ('CART', DecisionTreeClassifier(class_weight='balanced')),
    ('LR', LogisticRegression()),
    ('KNN', KNeighborsClassifier()),
    ('Random forest', RandomForestClassifier()),
]

# Train every candidate behind the column transformer and print its test accuracy.
for name, model in models:
    pipe = make_pipeline(col_trans, model).fit(X_train, y_train)
    y_pred = pipe.predict(X_test)
    print(f'{name}: {accuracy_score(y_test, y_pred)}')

# Cross validation for LR and Random Forest

In [15]:
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression

def evaluate_model(name, model, X, y):
    """Cross-validate ``model`` behind ``col_trans`` and print its mean score.

    Args:
        name: label printed in front of the score.
        model: estimator appended to ``col_trans`` in a pipeline.
        X: feature frame to cross-validate on.
        y: target values matching ``X``.
    """
    candidate = make_pipeline(col_trans, model)
    cv_scores = cross_val_score(candidate, X, y, cv=5)  # 5-fold cross-validation
    print(f'{name}: Mean Accuracy: {cv_scores.mean()}')


# NOTE(review): this cross-validates on the test split (X_test, y_test), not on
# the training data; confirm this is intended.
evaluate_model("LR", LogisticRegression(), X_test, y_test)





LR: Mean Accuracy: 0.8080952380952381


In [None]:
# 5-fold cross-validation of logistic regression on the training data.
evaluate_model("LR", LogisticRegression(), X_train, y_train)

In [None]:
# 5-fold cross-validation of the random forest on the test split.
# NOTE(review): this evaluates on X_test / y_test rather than the training data; confirm this is intended.
evaluate_model("Random Forest", RandomForestClassifier(), X_test, y_test)

In [None]:
# 5-fold cross-validation of the random forest on the training data.
evaluate_model("Random Forest", RandomForestClassifier(), X_train, y_train)

## TO DO : Try different token-frequency thresholds (Tf-Idf)


## TO DO : la validation croisée stratifiée (pour les ensembles de données déséquilibrés en termes de distribution de classes, la validation croisée stratifiée garantit que chaque sous-ensemble de données conserve la même distribution de classes que l'ensemble de données original.)


## TO DO : autres métriques pour comparer les modèles avec la CV (rapport)