In [1]:
import sys
from pathlib import Path

project_root = Path().resolve().parent   
sys.path.insert(0, str(project_root / "src"))

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from typing import List, Tuple
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.metrics import make_scorer, average_precision_score, recall_score, f1_score, accuracy_score, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold
import warnings
warnings.filterwarnings("ignore")
import os
import joblib 
from evaluator import ModelEvaluator, evaluate_param_list, average_precision_score_1
from get_cv_split import PredefinedKFold
import shap




In [3]:
train_df = pd.read_parquet("../Data/train_final_v3.parquet")
val_df = pd.read_parquet("../Data/validationA_v3.parquet")

In [4]:
X_train = train_df.drop(columns=['match'], axis=1)
X_val = val_df.drop(columns=['match'], axis=1)
y_train = train_df['match']
y_val = val_df['match']

## Columns Type Updated

In [5]:
for df in (X_train, X_val):
    df['review_span'] = (df['max_date'] - df['min_date']).dt.days
    df.drop(['min_date', 'max_date'], axis=1, inplace=True)

for df in (X_train, X_val):
    df['missing_price'] = df['missing_price'].astype(int)

for df in (X_train, X_val):
    df['product_lifespan_days'] = df['product_lifespan'].dt.days
    df.drop('product_lifespan', axis=1, inplace=True)

print(f"The shape of X_train: {X_train.shape}, X_val: {X_val.shape}")

The shape of X_train: (112768, 789), X_val: (24164, 789)


In [6]:
for df in (X_train, X_val):
    df.drop(['percent_positive', 'percent_negative', 'unique_reviewer_count', 'review_span'], axis=1, inplace=True)

In [7]:
def make_transformer(df,r, s, drop_first=True):
    rev_cols  = [c for c in df if c.startswith("embedding_")]
    summ_cols = [c for c in df if c.startswith("embed_")]
    numeric_cols  = [c for c in df if c not in rev_cols+summ_cols+["category"]]

    rev_pipe  = ("drop" if r == 0 else Pipeline([("scale",StandardScaler()), ("pca",PCA(n_components=r,random_state=42))]))
    sum_pipe  = ("drop" if s == 0 else Pipeline([("scale",StandardScaler()), ("pca",PCA(n_components=s,random_state=42))]))

    return ColumnTransformer(
        [('num', StandardScaler(), numeric_cols),
         ('cat', OneHotEncoder(handle_unknown="ignore", drop="first" if drop_first else None, sparse_output=False), ["category"]),
         ('rev', rev_pipe,  rev_cols),
         ('sum', sum_pipe,  summ_cols)
        ]).set_output(transform="pandas")

In [8]:
preprocessor = make_transformer(X_train, 0.95, 0.95)
X_train_proc = preprocessor.fit_transform(X_train)
X_val_proc = preprocessor.transform(X_val)

In [9]:
# Column Types
num_cols = [c for c in X_train_proc.columns if c.startswith("num__")]
cat_cols       = [c for c in X_train_proc.columns if c.startswith("cat__")]
rev_cols       = [c for c in X_train_proc.columns if c.startswith("rev__")]
sum_cols       = [c for c in X_train_proc.columns if c.startswith("sum__")]

## Custom KFold

In [10]:
split_data=pd.read_parquet("../Data/CV_val_split.parquet")
assert((split_data.index==X_train.index).all()) # Sanity check to verify indices of X_train match up with indices of split_data
kfold=PredefinedKFold(split_data)

In [11]:
X_train_proc_wo_cat = X_train_proc.drop(columns=cat_cols)
X_val_proc_wo_cat = X_val_proc.drop(columns=cat_cols)

In [12]:
# Define models
log_regs = {
    'lr1': LogisticRegression(penalty='l2', solver='newton-cg', class_weight={0:1, 1:250}, random_state=42, C=0.10),
    'lr2': LogisticRegression(penalty=None, solver='lbfgs', class_weight={0:1, 1:400}, random_state=42),
}

# SHAP feature counts to drop
n_values = [10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60]

# Cross-validation config
kfold = PredefinedKFold(split_data)
recall_macro_scorer = make_scorer(recall_score, average='macro')

# To store results
results = []

# Loop over models
for model_name, base_model in log_regs.items():

    # Train and evaluate on full feature set
    model = base_model
    model.fit(X_train_proc_wo_cat, y_train)
    y_pred = model.predict(X_val_proc_wo_cat)

    # Cross-validated recall
    cv_scores = cross_val_score(model, X_train_proc_wo_cat, y_train, cv=kfold, scoring=recall_macro_scorer)

    results.append({
        'model': model_name,
        'reduced': False,
        'n_dropped': 0,
        'parameters': model.get_params(),
        'accuracy': accuracy_score(y_val, y_pred),
        'macro_recall': recall_score(y_val, y_pred, average='macro'),
        'pr_auc': average_precision_score(y_val, y_pred),
        'confusion_matrix': confusion_matrix(y_val, y_pred).tolist(),
        'cv_macro_recall_mean': cv_scores.mean(),
        'cv_macro_recall_per_fold': cv_scores.tolist()
    })

    # SHAP values for original model
    explainer = shap.Explainer(model, X_train_proc_wo_cat)
    shap_values = explainer(X_train_proc_wo_cat)
    feature_importance = np.abs(shap_values.values).mean(axis=0)
    importance_df = pd.DataFrame({
        'feature': X_train_proc_wo_cat.columns,
        'importance': feature_importance
    }).sort_values(by='importance', ascending=True)  # Least important first

    # Loop over SHAP thresholds (features to drop)
    for n in n_values:
        low_shap = importance_df['feature'].head(n).tolist()
        X_train_reduced = X_train_proc_wo_cat.drop(columns=low_shap)
        X_val_reduced = X_val_proc_wo_cat.drop(columns=low_shap)

        model = base_model.__class__(**base_model.get_params())
        model.fit(X_train_reduced, y_train)
        y_pred = model.predict(X_val_reduced)

        cv_scores = cross_val_score(model, X_train_reduced, y_train, cv=kfold, scoring=recall_macro_scorer)

        results.append({
            'model': model_name,
            'reduced': True,
            'n_dropped': n,
            'parameters': model.get_params(),
            'accuracy': accuracy_score(y_val, y_pred),
            'macro_recall': recall_score(y_val, y_pred, average='macro'),
            'pr_auc': average_precision_score(y_val, y_pred),
            'confusion_matrix': confusion_matrix(y_val, y_pred).tolist(),
            'cv_macro_recall_mean': cv_scores.mean(),
            'cv_macro_recall_per_fold': cv_scores.tolist()
        })

# Convert to DataFrame
results_wo_cat_df = pd.DataFrame(results)


# Display summary
#display(results_df[['model', 'reduced', 'n_dropped', 'accuracy', 'macro_recall', 'cv_macro_recall_mean', 'pr_auc']])


In [13]:
results_wo_cat_df

Unnamed: 0,model,reduced,n_dropped,parameters,accuracy,macro_recall,pr_auc,confusion_matrix,cv_macro_recall_mean,cv_macro_recall_per_fold
0,lr1,False,0,"{'C': 0.1, 'class_weight': {0: 1, 1: 250}, 'du...",0.79457,0.730046,0.016713,"[[19091, 4909], [55, 109]]",0.631155,"[0.6784022453068148, 0.6104175119455135, 0.604..."
1,lr1,True,10,"{'C': 0.1, 'class_weight': {0: 1, 1: 250}, 'du...",0.794446,0.729984,0.016705,"[[19088, 4912], [55, 109]]",0.63387,"[0.6817839300263399, 0.614015412633948, 0.6058..."
2,lr1,True,15,"{'C': 0.1, 'class_weight': {0: 1, 1: 250}, 'du...",0.794405,0.729963,0.016702,"[[19087, 4913], [55, 109]]",0.632491,"[0.6798196440629255, 0.612132755064123, 0.6055..."
3,lr1,True,20,"{'C': 0.1, 'class_weight': {0: 1, 1: 250}, 'du...",0.794488,0.730005,0.016707,"[[19089, 4911], [55, 109]]",0.63219,"[0.6798084830995111, 0.6113738095519464, 0.605..."
4,lr1,True,25,"{'C': 0.1, 'class_weight': {0: 1, 1: 250}, 'du...",0.794695,0.730109,0.016722,"[[19094, 4906], [55, 109]]",0.631919,"[0.677788392319025, 0.6110166587226868, 0.6069..."
5,lr1,True,30,"{'C': 0.1, 'class_weight': {0: 1, 1: 250}, 'du...",0.794653,0.730088,0.016719,"[[19093, 4907], [55, 109]]",0.633029,"[0.6791388252946494, 0.6171110699687362, 0.602..."
6,lr1,True,35,"{'C': 0.1, 'class_weight': {0: 1, 1: 250}, 'du...",0.794115,0.729817,0.016682,"[[19080, 4920], [55, 109]]",0.634537,"[0.684183449965401, 0.6186700595995447, 0.6007..."
7,lr1,True,40,"{'C': 0.1, 'class_weight': {0: 1, 1: 250}, 'du...",0.79457,0.730046,0.016713,"[[19091, 4909], [55, 109]]",0.63512,"[0.6821633591849148, 0.6183240697336994, 0.604..."
8,lr1,True,45,"{'C': 0.1, 'class_weight': {0: 1, 1: 250}, 'du...",0.795026,0.730275,0.016745,"[[19102, 4898], [55, 109]]",0.636938,"[0.6879446074410143, 0.6163856073468026, 0.606..."
9,lr1,True,50,"{'C': 0.1, 'class_weight': {0: 1, 1: 250}, 'du...",0.795067,0.730296,0.016748,"[[19103, 4897], [55, 109]]",0.636037,"[0.6877660320263845, 0.6122296584876501, 0.608..."


In [14]:
# Define models
log_regs = {
    'lr3': LogisticRegression(penalty='l1', solver='liblinear', class_weight={0:1, 1:250}, random_state=42, C=1),
    'lr4': LogisticRegression(penalty='l2', solver='lbfgs', class_weight={0:1, 1:400}, random_state=42, C=1)
}


# SHAP feature counts to drop
n_values = [10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80]

# Cross-validation config
kfold = PredefinedKFold(split_data)
recall_macro_scorer = make_scorer(recall_score, average='macro')

# To store results
results = []

# Loop over models
for model_name, base_model in log_regs.items():
  

    # Train and evaluate on full feature set
    model = base_model
    model.fit(X_train_proc, y_train)
    y_pred = model.predict(X_val_proc)

    # Cross-validated recall
    cv_scores = cross_val_score(model, X_train_proc, y_train, cv=kfold, scoring=recall_macro_scorer)

    results.append({
        'model': model_name,
        'reduced': False,
        'n_dropped': 0,
        'parameters': model.get_params(),
        'accuracy': accuracy_score(y_val, y_pred),
        'macro_recall': recall_score(y_val, y_pred, average='macro'),
        'pr_auc': average_precision_score(y_val, y_pred),
        'confusion_matrix': confusion_matrix(y_val, y_pred).tolist(),
        'cv_macro_recall_mean': cv_scores.mean(),
        'cv_macro_recall_per_fold': cv_scores.tolist()
    })

    # SHAP values for original model
    explainer = shap.Explainer(model, X_train_proc)
    shap_values = explainer(X_train_proc)
    feature_importance = np.abs(shap_values.values).mean(axis=0)
    importance_df = pd.DataFrame({
        'feature': X_train_proc.columns,
        'importance': feature_importance
    }).sort_values(by='importance', ascending=True)  # Least important first

    # Loop over SHAP thresholds (features to drop)
    for n in n_values:
        low_shap = importance_df['feature'].head(n).tolist()
        X_train_reduced = X_train_proc.drop(columns=low_shap)
        X_val_reduced = X_val_proc.drop(columns=low_shap)

        model = base_model.__class__(**base_model.get_params())
        model.fit(X_train_reduced, y_train)
        y_pred = model.predict(X_val_reduced)

        cv_scores = cross_val_score(model, X_train_reduced, y_train, cv=kfold, scoring=recall_macro_scorer)

        results.append({
            'model': model_name,
            'reduced': True,
            'n_dropped': n,
            'parameters': model.get_params(),
            'accuracy': accuracy_score(y_val, y_pred),
            'macro_recall': recall_score(y_val, y_pred, average='macro'),
            'pr_auc': average_precision_score(y_val, y_pred),
            'confusion_matrix': confusion_matrix(y_val, y_pred).tolist(),
            'cv_macro_recall_mean': cv_scores.mean(),
            'cv_macro_recall_per_fold': cv_scores.tolist()
        })

# Convert to DataFrame
results_w_cat_df = pd.DataFrame(results)


# Display summary
#display(results_df[['model', 'reduced', 'n_dropped', 'accuracy', 'macro_recall', 'cv_macro_recall_mean', 'pr_auc']])


      


In [15]:
results_w_cat_df

Unnamed: 0,model,reduced,n_dropped,parameters,accuracy,macro_recall,pr_auc,confusion_matrix,cv_macro_recall_mean,cv_macro_recall_per_fold
0,lr3,False,0,"{'C': 1, 'class_weight': {0: 1, 1: 250}, 'dual...",0.82027,0.755096,0.019786,"[[19708, 4292], [51, 113]]",0.627076,"[0.676996312696712, 0.6139585573732607, 0.5902..."
1,lr3,True,10,"{'C': 1, 'class_weight': {0: 1, 1: 250}, 'dual...",0.820187,0.755054,0.019778,"[[19706, 4294], [51, 113]]",0.626474,"[0.6761369185138061, 0.615696122418699, 0.5875..."
2,lr3,True,15,"{'C': 1, 'class_weight': {0: 1, 1: 250}, 'dual...",0.82027,0.755096,0.019786,"[[19708, 4292], [51, 113]]",0.629622,"[0.6856681632821044, 0.6156514785650415, 0.587..."
3,lr3,True,20,"{'C': 1, 'class_weight': {0: 1, 1: 250}, 'dual...",0.82027,0.755096,0.019786,"[[19708, 4292], [51, 113]]",0.631523,"[0.6911815484162593, 0.6156626395284559, 0.587..."
4,lr3,True,25,"{'C': 1, 'class_weight': {0: 1, 1: 250}, 'dual...",0.82027,0.755096,0.019786,"[[19708, 4292], [51, 113]]",0.630152,"[0.6871302058918726, 0.6135902455805867, 0.589..."
5,lr3,True,30,"{'C': 1, 'class_weight': {0: 1, 1: 250}, 'dual...",0.820311,0.755116,0.01979,"[[19709, 4291], [51, 113]]",0.629949,"[0.6908132366235853, 0.6132554166781559, 0.585..."
6,lr3,True,35,"{'C': 1, 'class_weight': {0: 1, 1: 250}, 'dual...",0.819566,0.754741,0.019718,"[[19691, 4309], [51, 113]]",0.630869,"[0.6924426936845689, 0.6131103241537692, 0.587..."
7,lr3,True,40,"{'C': 1, 'class_weight': {0: 1, 1: 250}, 'dual...",0.819856,0.754887,0.019746,"[[19698, 4302], [51, 113]]",0.62996,"[0.6885252827211545, 0.6143456458455612, 0.587..."
8,lr3,True,45,"{'C': 1, 'class_weight': {0: 1, 1: 250}, 'dual...",0.820063,0.754991,0.019766,"[[19703, 4297], [51, 113]]",0.629903,"[0.6903667980870108, 0.6143679677723899, 0.584..."
9,lr3,True,50,"{'C': 1, 'class_weight': {0: 1, 1: 250}, 'dual...",0.819856,0.754887,0.019746,"[[19698, 4302], [51, 113]]",0.631021,"[0.6896859793187349, 0.6184234679608132, 0.584..."
