In [1]:
import os

# When the kernel starts inside a 'notebooks' directory, step up one level so the
# relative paths and the `src` package import used by later cells resolve.
current_dir = os.getcwd()
if current_dir.endswith('notebooks'):
    os.chdir(os.pardir)

In [2]:
import pandas as pd
from src.features import KeywordProcessor

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from lightgbm import LGBMClassifier

In [3]:
# Read the train and test CSV files, using the 'id' column as the row index
df_train, df_test = (
    pd.read_csv(csv_path, index_col='id')
    for csv_path in ('data/train.csv', 'data/test.csv')
)

# Separate the 'target' label from the remaining feature columns
y = df_train['target']
X = df_train.drop('target', axis=1)

# Hold out 20% of the labelled rows for validation; the fixed seed keeps the split repeatable
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [4]:
# Keyword branch: run the project's KeywordProcessor on the 'keyword' column, then
# one-hot encode its output.
keyword_processor = Pipeline(steps=[
    ('keyword_processor', KeywordProcessor()),  # Custom keyword processing transformer
    # handle_unknown='ignore': a keyword value that was not present during fit is encoded
    # as an all-zero row at transform time. With the encoder's default ('error'),
    # predicting on data containing an unseen keyword raises a ValueError.
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
])

# Text branch: TF-IDF vectorization with scikit-learn's built-in English stop-word list
text_processor = TfidfVectorizer(stop_words='english')

# Combine both branches. Columns not named here are dropped (ColumnTransformer's default
# `remainder`).
preprocessor = ColumnTransformer(
    transformers=[
        # List selector -> the branch receives a 2-D frame, as OneHotEncoder expects
        ('keyword_processor', keyword_processor, ['keyword']),
        # Scalar selector -> the branch receives a 1-D column of strings, as TfidfVectorizer expects
        ('text_preprocessor', text_processor, 'text'),
    ]
)

# Complete model: preprocessing followed by a cross-validated logistic regression.
# class_weight='balanced' reweights the classes inversely to their frequency in the training labels.
model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('clf', LogisticRegressionCV(class_weight='balanced')),
])

# Train on the training split, then predict labels for the held-out validation split
# (Pipeline.fit returns the fitted pipeline, so the two calls can be chained)
y_pred = model_pipeline.fit(X_train, y_train).predict(X_val)

# Show per-class precision, recall and F1 for the validation predictions
val_report = classification_report(y_val, y_pred)
print(val_report)

              precision    recall  f1-score   support

           0       0.82      0.79      0.80       874
           1       0.73      0.76      0.74       649

    accuracy                           0.78      1523
   macro avg       0.77      0.77      0.77      1523
weighted avg       0.78      0.78      0.78      1523

