In [1]:
import os

# When the kernel starts inside a directory whose path ends in 'notebooks',
# step up one level so that relative references used later in this notebook
# ('data/train.csv', the `src` package) resolve from the parent directory.
_launched_from_notebooks = os.getcwd().endswith('notebooks')
if _launched_from_notebooks:
    os.chdir(os.pardir)

In [2]:
import pandas as pd
from src.features import KeywordProcessor

from loguru import logger

from sklearn import metrics
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.svm import SVC

In [3]:
def compute_cv_results(model, X, y, model_name=None, scoring=None, cv=5, n_jobs=-1):
    """Cross-validate ``model`` and average every reported column over the folds.

    Parameters
    ----------
    model : estimator
        Estimator (or pipeline) handed to ``cross_validate``.
    X, y
        Features and target forwarded unchanged to ``cross_validate``.
    model_name : str, optional
        When truthy, used as the name of the returned Series.
    scoring : list of str, optional
        Scorer names. Defaults to accuracy, balanced accuracy, precision,
        recall, F1 and ROC AUC.
    cv : int or splitter, default 5
        Forwarded to ``cross_validate``.
    n_jobs : int, default -1
        Forwarded to ``cross_validate``.

    Returns
    -------
    pandas.Series
        Mean over folds of each column returned by ``cross_validate``
        (timings as well as scores).
    """
    metrics_to_score = scoring
    if metrics_to_score is None:
        metrics_to_score = ['accuracy', 'balanced_accuracy', 'precision', 'recall', 'f1', 'roc_auc']

    # One row per fold, one column per timing / metric
    per_fold = pd.DataFrame(
        cross_validate(model, X, y, scoring=metrics_to_score, cv=cv, n_jobs=n_jobs)
    )
    fold_means = per_fold.mean()

    # Label the Series only when a (truthy) name was supplied
    return fold_means.rename(model_name) if model_name else fold_means


In [4]:
# Read the train / test CSVs, using the `id` column as the index
df_train = pd.read_csv('data/train.csv', index_col='id')
df_test = pd.read_csv('data/test.csv', index_col='id')

# Separate the label column from the feature columns
y = df_train['target']
X = df_train.drop(columns='target')

# Hold out 20% of the rows for validation (fixed seed for repeatability)
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [5]:
## Reference pipeline, kept around for comparison :)

# Keyword branch: custom KeywordProcessor, then one-hot encoding of its output
# (categories not seen during fit are ignored at transform time).
keyword_processor = Pipeline(steps=[
    ('keyword_processor', KeywordProcessor()),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
])

# Text branch: TF-IDF features with English stop words removed.
text_processor = TfidfVectorizer(stop_words='english')

# Route each column to its branch. 'keyword' is selected as a list and 'text'
# as a bare column name, exactly as each transformer receives it below.
preprocessor = ColumnTransformer(
    transformers=[
        ('keyword_processor', keyword_processor, ['keyword']),
        ('text_preprocessor', text_processor, 'text'),
    ]
)

# Full model: preprocessing followed by a class-weight-balanced
# logistic regression with built-in cross-validation.
model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('clf', LogisticRegressionCV(class_weight='balanced')),
])

# Train on the training split, then predict the held-out validation split
y_pred = model_pipeline.fit(X_train, y_train).predict(X_val)

# Per-class precision / recall / F1 on the validation split
print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

           0       0.82      0.79      0.80       874
           1       0.73      0.76      0.74       649

    accuracy                           0.78      1523
   macro avg       0.77      0.77      0.77      1523
weighted avg       0.78      0.78      0.78      1523



In [6]:
## -- Baseline Model -- ##

# Raw token counts of the 'text' column feeding a default logistic regression.
baseline_preprocessor = ColumnTransformer([
    ('text_processor', CountVectorizer(), 'text')
])

baseline_model = Pipeline(steps=[
    ('preprocessor', baseline_preprocessor),
    ('clf', LogisticRegression(random_state=42))
])

## -- Common Preprocessor -- ##

# Keyword branch: custom KeywordProcessor, then one-hot encoding of its output
# (categories not seen during fit are ignored at transform time).
keyword_processor = Pipeline(steps=[
    ('keyword_processor', KeywordProcessor()),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

# Text branch: TF-IDF features with English stop words removed.
text_processor = TfidfVectorizer(stop_words='english')


def make_preprocessor():
    """Return a new, unfitted keyword + text ColumnTransformer.

    Pipeline.fit fits its steps in place, so pipelines that embed the *same*
    ColumnTransformer object would overwrite each other's fitted state as soon
    as one of them is fitted directly. Building one instance per pipeline
    keeps every model self-contained. The branch templates can be shared
    because ColumnTransformer fits clones of the transformers it is given.
    """
    return ColumnTransformer(
        transformers=[
            ('keyword_processor', keyword_processor, ['keyword']),  # 'keyword' column, selected as a list
            ('text_preprocessor', text_processor, 'text'),          # 'text' column, selected by name
        ]
    )


# Stand-alone instance kept under the original name for any cell that uses it.
preprocessor = make_preprocessor()

## -- Logistic Regression -- ##

lr_model = Pipeline(steps=[
    ('preprocessor', make_preprocessor()),
    ('clf', LogisticRegressionCV(class_weight='balanced', random_state=42))
])

## -- SVM -- ##

svc_model = Pipeline(steps=[
    ('preprocessor', make_preprocessor()),
    ('clf', SVC(random_state=42))
])

## -- LightGBM -- ##

# `random_state` is the scikit-learn-style seed argument of LGBMClassifier,
# used here for consistency with the other estimators in this cell.
lgbm_model = Pipeline(steps=[
    ('preprocessor', make_preprocessor()),
    ('clf', LGBMClassifier(verbosity=-1, random_state=42))
])

In [7]:
# Registry of candidate pipelines, keyed by the short name used in logs and
# as the row label of the results table. Insertion order is the run order.
models = {
    'baseline': baseline_model,
    'lr': lr_model,
    'svc': svc_model,
    'lgbm': lgbm_model,
}

In [8]:
# Cross-validate every registered model on the full labelled set (X, y) and
# collect one Series of fold-averaged timings / scores per model.
# NOTE(review): integer `cv` in cross_validate uses unshuffled stratified
# folds; if train.csv is ordered (e.g. by keyword) that could explain why the
# CV scores differ from the hold-out report of the reference pipeline. Confirm.
cv_results = []

for model_name, model in models.items():
    logger.info(f'Training model {model_name}')
    cv_results.append(compute_cv_results(model, X, y, model_name=model_name))

logger.success('All models trained!')

# One row per model, best balanced accuracy first
df_cv_results = (
    pd.DataFrame(cv_results)
    .sort_values(by='test_balanced_accuracy', ascending=False)
)

[32m2024-10-16 21:34:03.600[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m4[0m - [1mTraining model baseline[0m
[32m2024-10-16 21:34:04.548[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m4[0m - [1mTraining model lr[0m
[32m2024-10-16 21:34:06.388[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m4[0m - [1mTraining model svc[0m
[32m2024-10-16 21:34:11.778[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m4[0m - [1mTraining model lgbm[0m
[32m2024-10-16 21:36:57.769[0m | [32m[1mSUCCESS [0m | [36m__main__[0m:[36m<module>[0m:[36m8[0m - [32m[1mAll models trained![0m


In [9]:
# Timing columns of the results table; every other column is treated as a score.
fit_score_cols = ['fit_time', 'score_time']
metric_cols = df_cv_results.columns[~df_cv_results.columns.isin(fit_score_cols)].tolist()

In [10]:
# Results table: highlight the lowest value in each timing column and the
# highest value in each score column.
df_cv_results.style.highlight_min(subset=fit_score_cols).highlight_max(subset=metric_cols)

Unnamed: 0,fit_time,score_time,test_accuracy,test_balanced_accuracy,test_precision,test_recall,test_f1,test_roc_auc
baseline,0.160165,0.028522,0.709845,0.692103,0.704152,0.565896,0.624368,0.752665
lgbm,164.179091,0.188268,0.637475,0.606341,0.636139,0.384916,0.475952,0.667382
svc,3.311143,1.206605,0.620004,0.604236,0.579266,0.491939,0.520372,0.571464
lr,1.077388,0.119373,0.604637,0.602035,0.540179,0.583347,0.554771,0.570438
