# Task blablabla

import all needed libraries:

In [76]:
import os 
import pandas as pd
import numpy as np
from langdetect import detect
from sklearn.model_selection import train_test_split, ParameterGrid
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.compose import ColumnTransformer

from sklearn.linear_model import LogisticRegression

# set random seed
seed = 666


## Inital data preprocessing 

Load all csv files and merge them all into one data frame:

In [77]:
folder_path = "./data" 
csv_files = [f for f in os.listdir(folder_path) if f.endswith('.csv')]

In [78]:
df = pd.concat([pd.read_csv(os.path.join(folder_path, f)) for f in csv_files],
                      ignore_index=True)

# show the some of the df data
df.head()

Unnamed: 0,title,perex,label
0,Realu se vážně zranil Asensio. Ceballos by měl...,Real Madrid potenciálně na celý ročník 2019/20...,Injuries
1,Problémy s koleny dohnaly Richardse k brzkému ...,Už ve 31 letech dohnaly neustálé problémy s ko...,Injuries
2,Verletzungspech bei Real - Kreuzband-Schock fü...,Schock bei Real Madrid! Marco Asensio erlitt i...,Injuries
3,Man Utd star Eric Bailly to miss ‘at least six...,MANCHESTER UNITED star Eric Bailly is set to m...,Injuries
4,Вутов под въпрос за мача с Левски,Един от основните футболисти на Ботев (Пловдив...,Injuries


In [79]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1500 entries, 0 to 1499
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   title   1500 non-null   object
 1   perex   1499 non-null   object
 2   label   1500 non-null   object
dtypes: object(3)
memory usage: 35.3+ KB


Preprocess the data:

Show number of NaN values:

In [80]:
print(df.isna().sum())

title    0
perex    1
label    0
dtype: int64


Replace NaN values with "" for better data consistency:

In [81]:
df['perex'] = df['perex'].fillna("") 
print(df.isna().sum())

title    0
perex    0
label    0
dtype: int64


Generate more parameters:

In [82]:
# add perex world count
df['perex_word_count'] = df['perex'].str.split().apply(len)

In [83]:
df.head()

Unnamed: 0,title,perex,label,perex_word_count
0,Realu se vážně zranil Asensio. Ceballos by měl...,Real Madrid potenciálně na celý ročník 2019/20...,Injuries,40
1,Problémy s koleny dohnaly Richardse k brzkému ...,Už ve 31 letech dohnaly neustálé problémy s ko...,Injuries,34
2,Verletzungspech bei Real - Kreuzband-Schock fü...,Schock bei Real Madrid! Marco Asensio erlitt i...,Injuries,24
3,Man Utd star Eric Bailly to miss ‘at least six...,MANCHESTER UNITED star Eric Bailly is set to m...,Injuries,57
4,Вутов под въпрос за мача с Левски,Един от основните футболисти на Ботев (Пловдив...,Injuries,22


Try adding column representing language of the text:

In [84]:
# detect language from title and perex
df["language"] = (df['title'] + " " + df['perex']).apply(lambda x: detect(x) if x else 'unknown')

We got 27 distinct languages detected with 0 NaN values:

In [85]:
print(df["language"].unique())
print(len(df["language"].unique()))
print(df.isna().sum())
print(df.info())

['cs' 'de' 'en' 'bg' 'pl' 'nl' 'it' 'fr' 'el' 'pt' 'ru' 'sv' 'hu' 'hr'
 'ja' 'sk' 'vi' 'id' 'ro' 'sl' 'no' 'ko' 'da' 'tr' 'fi' 'et' 'ca' 'lt']
28
title               0
perex               0
label               0
perex_word_count    0
language            0
dtype: int64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1500 entries, 0 to 1499
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   title             1500 non-null   object
 1   perex             1500 non-null   object
 2   label             1500 non-null   object
 3   perex_word_count  1500 non-null   int64 
 4   language          1500 non-null   object
dtypes: int64(1), object(4)
memory usage: 58.7+ KB
None


One-hot encode the 'language' column

In [86]:
df = pd.get_dummies(df, columns=['language'], prefix='lang', dummy_na=False)
df.head()

Unnamed: 0,title,perex,label,perex_word_count,lang_bg,lang_ca,lang_cs,lang_da,lang_de,lang_el,...,lang_no,lang_pl,lang_pt,lang_ro,lang_ru,lang_sk,lang_sl,lang_sv,lang_tr,lang_vi
0,Realu se vážně zranil Asensio. Ceballos by měl...,Real Madrid potenciálně na celý ročník 2019/20...,Injuries,40,False,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,Problémy s koleny dohnaly Richardse k brzkému ...,Už ve 31 letech dohnaly neustálé problémy s ko...,Injuries,34,False,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,Verletzungspech bei Real - Kreuzband-Schock fü...,Schock bei Real Madrid! Marco Asensio erlitt i...,Injuries,24,False,False,False,False,True,False,...,False,False,False,False,False,False,False,False,False,False
3,Man Utd star Eric Bailly to miss ‘at least six...,MANCHESTER UNITED star Eric Bailly is set to m...,Injuries,57,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,Вутов под въпрос за мача с Левски,Един от основните футболисти на Ботев (Пловдив...,Injuries,22,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


Remove punctuation from title and perex columns:

In [89]:
import regex as re


def preprocess_text(text):
    if not isinstance(text, str):
        return ""
    
    # Remove punctuation but keep all letters/numbers
    text = re.sub(r'[^\p{L}\d\s]', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    
    return text.lower()


In [90]:
df['title'] = df['title'].apply(preprocess_text)
df['perex'] = df['perex'].apply(preprocess_text)

In [91]:
df['perex'].head()

0    real madrid potenciálně na celý ročník 2019 20...
1    už ve 31 letech dohnaly neustálé problémy s ko...
2    schock bei real madrid marco asensio erlitt im...
3    manchester united star eric bailly is set to m...
4    един от основните футболисти на ботев пловдив ...
Name: perex, dtype: object

Split dataframe into y(target value) and X(parameters) :

In [92]:
Xdata = df.drop(columns=['label'])
y = df['label']

Transform labels into numerics:

In [93]:
le = LabelEncoder()
y = le.fit_transform(y)
print(y)


[0 0 0 ... 5 5 5]


Split data into train,val,test 

In [94]:
Xtrain_raw, Xtest_raw, ytrain, ytest = train_test_split(Xdata, y, test_size=0.4, random_state=seed) 
Xtest_raw, Xval_raw, ytest, yval = train_test_split (Xtest_raw,ytest, test_size= 0.5, random_state=seed)


Transform the rest:

In [95]:
#scale numerical features
SS = StandardScaler()
Xtrain_num = SS.fit_transform(Xtrain_raw[['perex_word_count']])
Xval_num = SS.transform(Xval_raw[['perex_word_count']])
Xtest_num = SS.transform(Xtest_raw[['perex_word_count']])
Xtrain_num.shape


(900, 1)

In [96]:
# TF-IDF Vectorization for 'title' and 'perex'
title_vectorizer = TfidfVectorizer(max_features=1000, ngram_range=(1,2))
perex_vectorizer = TfidfVectorizer(max_features=2000, ngram_range=(1,2))

# Fit on training data
Xtrain_title = title_vectorizer.fit_transform(Xtrain_raw['title'])
Xtrain_perex = perex_vectorizer.fit_transform(Xtrain_raw['perex'])

# Transform validation and test data
Xval_title = title_vectorizer.transform(Xval_raw['title'])
Xval_perex = perex_vectorizer.transform(Xval_raw['perex'])

Xtest_title = title_vectorizer.transform(Xtest_raw['title'])
Xtest_perex = perex_vectorizer.transform(Xtest_raw['perex'])

combine all features:

In [97]:
from scipy.sparse import hstack,csr_matrix
import numpy as np


Xtrain_lang = Xtrain_raw.filter(like='lang_').to_numpy()
Xval_lang = Xval_raw.filter(like='lang_').to_numpy()
Xtest_lang = Xtest_raw.filter(like='lang_').to_numpy()
#convert numerical and language features to sparse matrices
Xtrain_num_sparse = csr_matrix(Xtrain_num)
Xval_num_sparse = csr_matrix(Xval_num)
Xtest_num_sparse = csr_matrix(Xtest_num)

Xtrain_lang_sparse = csr_matrix(Xtrain_lang)
Xval_lang_sparse = csr_matrix(Xval_lang)
Xtest_lang_sparse = csr_matrix(Xtest_lang)

Xtrain_full = hstack([Xtrain_title, Xtrain_perex, Xtrain_num,Xtrain_lang])
Xval_full = hstack([Xval_title, Xval_perex, Xval_num,Xval_lang])
Xtest_full = hstack([Xtest_title, Xtest_perex, Xtest_num,Xtest_lang])

In [98]:
Xtrain_full.shape

(900, 3029)

##  Train Different models:

### logistic regression model: 

In [None]:
param_grid_logr = {
    'max_iter' : [1200,1500,2000],
    'solver': ['saga'],
    'C': np.arange(0.1, 2.0, 0.2),
    'n_jobs': [-1]
}

param_comb = ParameterGrid(param_grid_logr)
val_metric = []
for params in param_comb:
    clf_logr = LogisticRegression(**params)
    clf_logr.fit(Xtrain_full,ytrain)
    val_metric.append(clf_logr.score(Xval_full, yval))



In [100]:
best_params_logr = param_comb[np.argmax(val_metric)]
clf_logr_fc = LogisticRegression(**best_params_logr)
clf_logr_fc.fit(Xtrain_full,ytrain)
print(f"We found the best params {best_params_logr} with accuracy score {max(val_metric):.5f}.")

We found the best params {'solver': 'saga', 'n_jobs': -1, 'max_iter': 1200, 'C': np.float64(1.9000000000000004)} with accuracy score 0.80333.


We got ~80% accuracy, now try different models 

### Decision tree model

In [101]:
from sklearn.tree import DecisionTreeClassifier

param_grid_dt = {
    'criterion': ['gini', 'entropy'],
    'max_depth': range(10, 50, 5),
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': [None, 'sqrt', 'log2']
}

param_comb = ParameterGrid(param_grid_dt)
val_metric = []
for params in param_comb:
    clf_dt = DecisionTreeClassifier(**params)
    clf_dt.fit(Xtrain_full,ytrain)
    val_metric.append(clf_dt.score(Xval_full, yval))


In [102]:
best_params_dt = param_comb[np.argmax(val_metric)]
clf_dt_fc = DecisionTreeClassifier(**best_params_dt)
clf_dt_fc.fit(Xtrain_full,ytrain)
print(f"We found the best params {best_params_dt} with accuracy score {max(val_metric):.5f}.")

We found the best params {'min_samples_split': 5, 'min_samples_leaf': 2, 'max_features': None, 'max_depth': 10, 'criterion': 'entropy'} with accuracy score 0.75333.


Decision tree didn't show a better result with only ~75.3% accuracy

### SVM model:

In [103]:
from sklearn.svm import SVC

param_grid = {
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'degree': range (2,10,2),
    'C': np.arange(0.1, 2.0, 0.2)
}
#find the best parameters
param_comb = ParameterGrid(param_grid)
val_metric = []
for params in param_comb:
    clf_SVC = SVC(**params)
    clf_SVC.fit(Xtrain_full,ytrain)
    val_metric.append(clf_SVC.score(Xval_full, yval))
    



In [104]:
best_params_SVC = param_comb[np.argmax(val_metric)]
clf_SVC_fc = SVC(**best_params_SVC)
clf_SVC_fc.fit(Xtrain_full,ytrain)
print(f"We found the best params {best_params_SVC} with accuracy score {max(val_metric):.5f}.")

We found the best params {'kernel': 'linear', 'degree': 2, 'C': np.float64(0.7000000000000001)} with accuracy score 0.83667.


SVC model showed the best result by far for now with ~83.6% accuracy score 

### Naive bayes model:

MultinomialNB is the sparse-friendly classifier,for it we'll use only TF-IDF features perex and title

In [107]:
Xtrain_text = hstack([Xtrain_title, Xtrain_perex])
Xval_text = hstack([Xval_title, Xval_perex])
Xtest_text = hstack([Xtest_title, Xtest_perex])

In [113]:
from sklearn.naive_bayes import MultinomialNB
param_grid = {
    'alpha': np.arange(0.01, 2.0, 0.1),
}

param_comb = ParameterGrid(param_grid)
val_metric = []
for params in param_comb:
    clf_MNB = MultinomialNB(**params)
    clf_MNB.fit(Xtrain_text, ytrain)
    val_metric.append(clf_MNB.score(Xval_text, yval))


In [114]:
best_params_MNB = param_comb[np.argmax(val_metric)]
clf_MNB_fc = MultinomialNB(**best_params_MNB)
clf_MNB_fc.fit(Xtrain_text,ytrain)
print(f"We found the best params {best_params_MNB} with accuracy score {max(val_metric):.5f}.")

We found the best params {'alpha': np.float64(0.31000000000000005)} with accuracy score 0.82667.


Naive Bayes model showed the second best accuracy score of 82.6%

### Try applying TruncatedSVD method

In [124]:
from sklearn.decomposition import TruncatedSVD

svd = TruncatedSVD(n_components=200, random_state=seed)
# transform text features using SVD
Xtrain_text_svd = svd.fit_transform(Xtrain_text)
Xval_text_svd = svd.transform(Xval_text)


#### Logistic Regression model:

In [125]:
param_grid_logr = {
    'max_iter' : [1200,1500,2000],
    'solver': ['saga'],
    'C': np.arange(0.1, 2.0, 0.2),
    'n_jobs': [-1]
}

param_comb = ParameterGrid(param_grid_logr)
val_metric = []
for params in param_comb:
    clf_logr_svd = LogisticRegression(**params)
    clf_logr_svd.fit(Xtrain_text_svd,ytrain)
    val_metric.append(clf_logr_svd.score(Xval_text_svd, yval))



In [None]:
best_params_logr_svd = param_comb[np.argmax(val_metric)]
clf_logr_svd_fc = LogisticRegression(**best_params_logr_svd)
clf_logr_svd_fc.fit(Xtrain_text_svd,ytrain)
print(f"We found the best params {best_params_logr_svd} with accuracy score {max(val_metric):.5f}.")

We found the best params {'solver': 'saga', 'n_jobs': -1, 'max_iter': 1200, 'C': np.float64(1.9000000000000004)} with accuracy score 0.78000.


SVD transform didn't improve the logistic regression model

#### SVM model with SVD transform

In [127]:


param_grid = {
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'degree': range (2,10,2),
    'C': np.arange(0.1, 2.0, 0.2)
}
#find the best parameters
param_comb = ParameterGrid(param_grid)
val_metric = []
for params in param_comb:
    clf_SVC_svd = SVC(**params)
    clf_SVC_svd.fit(Xtrain_text_svd,ytrain)
    val_metric.append(clf_SVC_svd.score(Xval_text_svd, yval))
    



In [129]:
best_params_SVC_svd = param_comb[np.argmax(val_metric)]
clf_SVC_svd_fc = SVC(**best_params_SVC_svd)
clf_SVC_svd_fc.fit(Xtrain_text_svd,ytrain)
print(f"We found the best params {best_params_SVC_svd} with accuracy score {max(val_metric):.5f}.")

We found the best params {'kernel': 'rbf', 'degree': 2, 'C': np.float64(1.7000000000000004)} with accuracy score 0.81667.


SVC model result also didn't improve with SVD method

## Conclusion:

from all attempted models, simple SVC model got the best accuracy score of ~83.6%, therefore it will be the final model

In [137]:
clf_final = clf_SVC_fc

# get the test accuracy
print(f" Accuracy score on the test data for the final model is: {clf_final.score(Xtest_full, ytest):.5f}.")


 Accuracy score on the test data for the final model is: 0.78000.


Accuracy of the final model is 0.78%