In [None]:
import pandas as pd
import numpy as np

In [None]:
# Read the training and validation splits (parquet files in the parent directory).
train_path, val_path = "../train_final_v3.parquet", "../validationA_v3.parquet"
train_df = pd.read_parquet(train_path)
val_df = pd.read_parquet(val_path)

In [None]:
# Show (n_rows, n_columns) for the training split, then for the validation split.
for split_df in (train_df, val_df):
    print(split_df.shape)

In [None]:
# dtypes of the first 25 columns
train_df.dtypes.iloc[:25]

In [None]:
# Separate the `match` target from the predictors for both splits.
y_train, y_val = train_df['match'], val_df['match']
X_train = train_df.drop(columns=['match'])
X_val = val_df.drop(columns=['match'])

In [None]:
# Keep untouched snapshots of the feature frames. `.copy()` is required:
# plain assignment only creates a second name for the same object, so the
# in-place column edits applied to X_train / X_val further down the notebook
# would silently change the "raw" frames as well.
X_train_raw = X_train.copy()
X_val_raw = X_val.copy()

## Columns Type Updated

In [None]:
# Replace the date / duration columns with integer day counts and cast the
# missing-price flag to int. Both feature frames are modified in place.
for df in (X_train, X_val):
    # (max_date - min_date) expressed in whole days; the source columns are then removed
    df['review_span'] = (df['max_date'] - df['min_date']).dt.days
    df.drop(columns=['min_date', 'max_date'], inplace=True)

    df['missing_price'] = df['missing_price'].astype(int)

    # same treatment for the product lifespan duration
    df['product_lifespan_days'] = df['product_lifespan'].dt.days
    df.drop(columns='product_lifespan', inplace=True)

print(f"The shape of X_train: {X_train.shape}, X_val: {X_val.shape}")

## Correlation Analysis

In [None]:
import numpy as np, seaborn as sns, matplotlib.pyplot as plt

# Pairwise correlations between every feature except `category`
corr = X_train.drop(columns=['category']).corr()

# Strict upper triangle only, so each feature pair is listed once
mask = np.triu(np.ones(corr.shape, dtype=bool), k=1)
pair_table = corr.where(mask).stack().reset_index(name='correlation')
high_corr_pairs = (
    pair_table
    .query('abs(correlation) > 0.9')
    .rename(columns={'level_0': 'Feature1', 'level_1': 'Feature2'})
)
print(high_corr_pairs)

# Heatmap of (at most) the 50 pairs with the largest absolute correlation
ranked_pairs = high_corr_pairs.assign(AbsCorr=high_corr_pairs['correlation'].abs())
top = ranked_pairs.nlargest(50, 'AbsCorr').pivot(index='Feature1', columns='Feature2', values='AbsCorr')

plt.figure(figsize=(6, 5))
ax = sns.heatmap(top, annot=True, cmap='coolwarm', fmt=".2f", cbar_kws={'label': 'Abs Pearson ρ'})
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha='right')
ax.set_yticklabels(ax.get_yticklabels(), rotation=0, va='center')
plt.tight_layout()
plt.show()

We will drop the columns `percent_positive`, `percent_negative`, `review_span`, and `unique_reviewer_count` because they are highly correlated with other features, which may introduce multicollinearity into the model.

In [None]:
# Remove the highly correlated columns from both frames (in place).
redundant_cols = ['percent_positive', 'percent_negative', 'unique_reviewer_count', 'review_span']
for df in (X_train, X_val):
    df.drop(columns=redundant_cols, inplace=True)

## Preprocessing, PCA

In [None]:
from typing import List, Tuple
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA



def make_transformer(df,r, s, drop_first=True):
    """Build the preprocessing ColumnTransformer for the feature frame.

    Columns are grouped by name: ``embedding_*`` (branch ``rev``), ``embed_*``
    (branch ``sum``), ``category`` (one-hot encoded, branch ``cat``) and every
    remaining column (standard-scaled, branch ``num``).

    Parameters
    ----------
    df : DataFrame whose column names determine the groups.
    r, s : ``n_components`` passed to the PCA of the ``rev`` / ``sum`` branch;
        a value of 0 drops that branch entirely.
    drop_first : when True the one-hot encoder uses ``drop="first"``.

    Returns
    -------
    An unfitted ColumnTransformer configured to output pandas DataFrames.
    """
    review_embed = [col for col in df if col.startswith("embedding_")]
    summary_embed = [col for col in df if col.startswith("embed_")]
    excluded = set(review_embed) | set(summary_embed) | {"category"}
    plain_numeric = [col for col in df if col not in excluded]

    def _embedding_branch(n_components):
        # scale -> PCA, or the literal "drop" when no components are requested
        if n_components == 0:
            return "drop"
        return Pipeline([
            ("scale", StandardScaler()),
            ("pca", PCA(n_components=n_components, random_state=42)),
        ])

    onehot = OneHotEncoder(
        handle_unknown="ignore",
        drop="first" if drop_first else None,
        sparse_output=False,
    )
    transformer = ColumnTransformer([
        ('num', StandardScaler(), plain_numeric),
        ('cat', onehot, ["category"]),
        ('rev', _embedding_branch(r), review_embed),
        ('sum', _embedding_branch(s), summary_embed),
    ])
    return transformer.set_output(transform="pandas")

In [None]:
# Fractional n_components: each PCA keeps as many components as needed to
# explain 95% of the variance of its embedding block.
preprocessor = make_transformer(X_train, r=0.95, s=0.95)
X_train_proc = preprocessor.fit_transform(X_train)  # fit on train only
X_val_proc = preprocessor.transform(X_val)          # reuse the fitted statistics

In [None]:
# Group the transformed columns by the prefix of the ColumnTransformer branch that produced them
proc_columns = list(X_train_proc.columns)
num_cols = [name for name in proc_columns if name.startswith("num__")]
cat_cols = [name for name in proc_columns if name.startswith("cat__")]
rev_cols = [name for name in proc_columns if name.startswith("rev__")]
sum_cols = [name for name in proc_columns if name.startswith("sum__")]

## Multicollinearity: Variance Inflation Factor (VIF) On Non-Embedding Features:

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

def compute_vif(X_train_proc, numeric_cols, categorical_cols, thresh=10):
    """Compute variance inflation factors (VIF) for the non-embedding features.

    Parameters
    ----------
    X_train_proc : pandas.DataFrame
        Preprocessed training frame containing the listed columns.
    numeric_cols, categorical_cols : sequence of str
        Column names to analyse; they are concatenated in this order.
    thresh : float, default 10
        Features whose VIF exceeds this value are printed.

    Returns
    -------
    pandas.DataFrame
        One row per analysed feature with the columns ``feature`` and ``VIF``.
        Constant / near-constant columns are excluded.
    """
    nonembed_cols = list(numeric_cols) + list(categorical_cols)
    X_vif = X_train_proc[nonembed_cols].copy()

    # drop constant / near-constant columns
    const_cols = X_vif.columns[X_vif.std() < 1e-12]
    if len(const_cols):
        X_vif.drop(columns=const_cols, inplace=True)

    # variance_inflation_factor regresses each column on the remaining columns of
    # the matrix it is given and does not add an intercept itself. The one-hot
    # columns are not centred, so without a constant term their VIFs (and those
    # of the features regressed on them) are distorted. Prepend a column of
    # ones and report the VIF of the real features only (indices 1..n).
    design = np.column_stack([np.ones(len(X_vif)), X_vif.to_numpy(dtype=float)])

    # VIF calculation
    vif_data = pd.DataFrame()
    vif_data["feature"] = X_vif.columns
    vif_data["VIF"] = [variance_inflation_factor(design, i) for i in range(1, design.shape[1])]

    # Print features with VIF above the threshold
    high_vif = vif_data[vif_data["VIF"] > thresh].sort_values("VIF", ascending=False)
    if not high_vif.empty:
        print(f"Features with VIF > {thresh}:")
        print(high_vif.to_string(index=False))
    else:
        print(f"No features with VIF > {thresh}")

    return vif_data

In [None]:
# VIF over the scaled numeric and one-hot category columns (embedding components excluded)
vif_data = compute_vif(
    X_train_proc,
    numeric_cols=num_cols,
    categorical_cols=cat_cols,
    thresh=10,
)

## Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, recall_score

# Baseline: L1-penalised, class-balanced logistic regression on all preprocessed features
log_reg = LogisticRegression(
    penalty='l1',
    solver='liblinear',
    class_weight="balanced",
    max_iter=1000,
    random_state=42,
)
log_reg.fit(X_train_proc, y_train)

# Evaluate on the validation split
y_pred = log_reg.predict(X_val_proc)
recall_macro = recall_score(y_val, y_pred, average='macro')

print("Accuracy:", accuracy_score(y_val, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_val, y_pred))
print("Macro Recall:", recall_macro)

In [None]:
import sys
from pathlib import Path

# Put <parent of the current working directory>/src at the front of the import path
project_root = Path().resolve().parent
src_dir = project_root / "src"
sys.path.insert(0, str(src_dir))

## Custom KFold

In [None]:
import sys
sys.path.insert(0, '../Data/')
from get_cv_split import PredefinedKFold

# Fold assignments come from a parquet file stored next to the splitter module
split_data = pd.read_parquet("../Data/CV_val_split.parquet")

# Sanity check: the fold table must be row-aligned with X_train
indices_match = (split_data.index == X_train.index).all()
assert indices_match

kfold = PredefinedKFold(split_data)

In [None]:
from sklearn.metrics import make_scorer, recall_score
from sklearn.model_selection import cross_val_score

# Cross-validated macro recall of the baseline model on the predefined folds
recall_macro_scorer = make_scorer(recall_score, average='macro')

scores = cross_val_score(
    log_reg,
    X_train_proc,
    y_train,
    scoring=recall_macro_scorer,
    cv=kfold,
)

print("Macro Recall (per fold):", scores)
print("Mean Macro Recall:", scores.mean())


### GridSearchCV on r and s (The PCA dimension of Review and Summary Embeddings)

In [None]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.metrics import make_scorer, average_precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, StratifiedKFold

# Preprocessing and classifier in one pipeline, so the scalers and PCAs are
# re-fitted on the training part of every CV fold.
pipe = Pipeline([
    ("fe", make_transformer(X_train, r=50, s=10)),  # Dummy values, will be overridden by grid search
    ("logreg", LogisticRegression(max_iter=1000, class_weight="balanced", random_state=42))
])

grid_vals = [10, 20, 50, 100]
param_grid = {
    "fe__rev__pca__n_components": grid_vals,  # r
    "fe__sum__pca__n_components": grid_vals   # s
}

# "average_precision" is scikit-learn's built-in PR-AUC scorer. It replaces
# make_scorer(average_precision_score, needs_proba=True): the `needs_proba`
# argument is deprecated and removed in recent scikit-learn releases, whereas
# the string scorer works across versions. Average precision depends only on
# the ranking of the scores, so the reported values are unchanged.
scorers = {
    "pr_auc": "average_precision",
    "recall_macro": make_scorer(recall_score, average="macro"),
    "f1_macro": make_scorer(f1_score, average="macro"),
}

# Run GridSearchCV; the final model is refitted with the best macro-recall setting
gcv = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring=scorers,
    refit="recall_macro",
    cv=kfold,
    n_jobs=20,
    return_train_score=False,
    verbose=1
)

# Fit on the un-transformed features; the pipeline does the preprocessing
gcv.fit(X_train, y_train)

# Extract results: one row per (r, s) combination, best macro recall first
results = (
    pd.DataFrame(gcv.cv_results_)
    .loc[:, [
        "param_fe__rev__pca__n_components",
        "param_fe__sum__pca__n_components",
        "mean_test_pr_auc", "std_test_pr_auc",
        "mean_test_recall_macro", "std_test_recall_macro",
        "mean_test_f1_macro", "std_test_f1_macro"
    ]]
    .rename(columns={
        "param_fe__rev__pca__n_components": "r",
        "param_fe__sum__pca__n_components": "s",
        "mean_test_pr_auc": "pr_auc_mean",
        "std_test_pr_auc": "pr_auc_std",
        "mean_test_recall_macro": "recall_macro_mean",
        "std_test_recall_macro": "recall_macro_std",
        "mean_test_f1_macro": "f1_macro_mean",
        "std_test_f1_macro": "f1_macro_std"
    })
    .sort_values("recall_macro_mean", ascending=False)
    .reset_index(drop=True)
)

print(results.head())

# Best model ready to use
best_lr = gcv.best_estimator_


From the results, we observe that performance differs very little across the tested values of r and s. We will therefore use r = 50 and s = 10, since the model performs best with these values.

## Feature Selection With Logistic Regression

#### Preprocessing with r=50, s=10

In [None]:
# Rebuild the preprocessed frames with the chosen PCA sizes (r=50, s=10)
preprocessor = make_transformer(X_train, r=50, s=10)
X_train_proc = preprocessor.fit_transform(X_train)
X_val_proc = preprocessor.transform(X_val)

# update columns names: regroup the output columns by transformer prefix
proc_columns = list(X_train_proc.columns)
num_cols = [name for name in proc_columns if name.startswith("num__")]
cat_cols = [name for name in proc_columns if name.startswith("cat__")]
rev_cols = [name for name in proc_columns if name.startswith("rev__")]
sum_cols = [name for name in proc_columns if name.startswith("sum__")]

print("After preprocessing X_train_proc:", X_train_proc.shape)
print("After preprocessing X_val_proc:", X_val_proc.shape)

In [None]:
from sklearn.feature_selection import SelectFromModel

# L1-regularised logistic regression used as a feature selector
l1_model = LogisticRegression(
    penalty='l1',
    solver='saga',
    C=0.1,
    class_weight='balanced',
    random_state=42,
)
l1_model.fit(X_train_proc, y_train)

# prefit=True: reuse the model fitted above instead of fitting again
selector = SelectFromModel(l1_model, prefit=True)
kept_mask = selector.get_support()
X_train_sel = selector.transform(X_train_proc)
selected_feats = X_train_proc.columns[kept_mask]

print(f"Selected {len(selected_feats)} features:")
print(selected_feats.tolist())


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Get coefficients and feature names
coefs = l1_model.coef_[0]
feature_names = X_train_proc.columns

# Create a DataFrame
coef_df = pd.DataFrame({
    'feature': feature_names,
    'coefficient': coefs,
    'abs_coefficient': np.abs(coefs)
})

# Get top 20 features by absolute coefficient.
# The frame is sorted in descending order, so the largest coefficients are the
# FIRST rows: use head(20). (tail(20) would pick the 20 smallest ones.)
top20 = coef_df.sort_values(by='abs_coefficient', ascending=False).head(20)

# Plot (reversed so the largest coefficient is drawn at the top of the chart)
plt.figure(figsize=(10, 6))
plt.barh(top20['feature'][::-1], top20['coefficient'][::-1])
plt.title('Top 20 L1 Logistic Regression Coefficients')
plt.xlabel('Coefficient Value')
plt.grid(True, axis='x', linestyle='--', alpha=0.6)
plt.tight_layout()
plt.show()

It looks like the category features are very important. We will drop the category features and regenerate the feature importance.

In [None]:
# Drop category columns (the one-hot `cat__*` columns) from both processed frames
X_train_proc_wo_cat = X_train_proc.drop(columns=cat_cols)
X_val_proc_wo_cat = X_val_proc.drop(columns=cat_cols)

print("After dropping categorical columns from train dataframe:", X_train_proc_wo_cat.shape)
print("After dropping categorical columns from val dataframe:", X_val_proc_wo_cat.shape)

In [None]:
# Refit the same L1 model object, this time without the category columns
l1_model.fit(X_train_proc_wo_cat, y_train)

selector = SelectFromModel(l1_model, prefit=True)
kept_mask = selector.get_support()
X_train_sel = selector.transform(X_train_proc_wo_cat)
selected_feats = X_train_proc_wo_cat.columns[kept_mask]

print(f"Selected {len(selected_feats)} features:")
print(selected_feats.tolist())


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Coefficients of the refitted L1 model, paired with the matching column names
coefs = l1_model.coef_[0]
feature_names = X_val_proc_wo_cat.columns

coef_df = pd.DataFrame({
    'feature': feature_names,
    'coefficient': coefs,
    'abs_coefficient': np.abs(coefs)
})

# 20 largest coefficients by magnitude (first rows of the descending sort)
ranked_coefs = coef_df.sort_values(by='abs_coefficient', ascending=False)
top20 = ranked_coefs.iloc[:20]

# Horizontal bars, reversed so the largest coefficient ends up at the top
bar_labels = top20['feature'].iloc[::-1]
bar_values = top20['coefficient'].iloc[::-1]

plt.figure(figsize=(10, 6))
plt.barh(bar_labels, bar_values)
plt.title('Top 20 L1 Logistic Regression Coefficients')
plt.xlabel('Coefficient Value')
plt.grid(True, axis='x', linestyle='--', alpha=0.6)
plt.tight_layout()
plt.show()

In [None]:
from sklearn.metrics import make_scorer, recall_score
from sklearn.model_selection import cross_val_score

# Cross-validated macro recall of `log_reg` on the features without category columns
recall_macro_scorer = make_scorer(recall_score, average='macro')

scores = cross_val_score(
    log_reg,
    X_train_proc_wo_cat,
    y_train,
    scoring=recall_macro_scorer,
    cv=kfold,
)

print("Macro Recall (per fold):", scores)
print("Mean Macro Recall:", scores.mean())

In [None]:
# Same L1 / liblinear model as the baseline, trained without the category columns
log_reg = LogisticRegression(
    penalty='l1',
    solver='liblinear',
    class_weight="balanced",
    max_iter=1000,
    random_state=42,
)
log_reg.fit(X_train_proc_wo_cat, y_train)

# Validation metrics
y_pred = log_reg.predict(X_val_proc_wo_cat)
recall_macro = recall_score(y_val, y_pred, average='macro')

print("Accuracy:", accuracy_score(y_val, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_val, y_pred))
print("Macro Recall:", recall_macro)

Dropping category improves the results. 

Rebekah Testing

In [None]:
from sklearn.feature_selection import SelectKBest, chi2
import matplotlib.pyplot as plt

# Chi-squared score of every one-hot category column against the target
X_train_cat_chi = X_train_proc.loc[:, cat_cols]
X_val_cat_chi = X_val_proc.loc[:, cat_cols]

chi_selector = SelectKBest(score_func=chi2, k='all').fit(X_train_cat_chi, y_train)

chi_scores = pd.Series(chi_selector.scores_, index=X_train_cat_chi.columns).sort_values(ascending=False)

# k='all' keeps every category column; this only reorders them by score
top_k_cat = list(chi_scores.index)
X_cat_reduced = X_train_cat_chi[top_k_cat]

# Horizontal bar chart, highest score at the top
chi_scores.sort_values(ascending=True).plot(kind='barh', figsize=(8, 6), title='Chi-Squared Scores by Category')
plt.xlabel("Chi-squared Score")
plt.tight_layout()
plt.show()


In [None]:
# Category columns whose chi-squared score is above 10
chi_scores_features = chi_scores.loc[chi_scores > 10].index.tolist()

In [None]:
# Display the category columns selected by the chi-squared threshold (score > 10).
chi_scores_features

In [None]:
from sklearn.feature_selection import mutual_info_classif

# Mark the one-hot category columns as discrete BY NAME. The ColumnTransformer
# emits its blocks in the order num, cat, rev, sum, so the category columns
# are not the trailing columns; a positional mask such as
# [False]*76 + [True]*len(cat_cols) would flag PCA components as discrete and
# treat the dummies as continuous. It would also break whenever the number of
# numeric / PCA columns changes.
discrete_mask = X_train_proc.columns.isin(cat_cols)

# random_state makes the estimate reproducible (the estimator adds a small
# amount of random noise to continuous features).
mi = mutual_info_classif(X_train_proc, y_train, discrete_features=discrete_mask, random_state=42)

mi_scores = pd.Series(mi, index=X_train_proc.columns).sort_values(ascending=False)

# All features, ordered from highest to lowest mutual information
top_k_mi = mi_scores.index.tolist()
X_selected = X_train_proc[top_k_mi]


In [None]:
top_n = 50
# Last `top_n` entries of the descending ranking, i.e. the lowest-scoring features
mi_top = mi_scores.sort_values(ascending=False).iloc[-top_n:]

plt.figure(figsize=(10, 12))  # wider and taller
mi_top.sort_values().plot.barh(color='skyblue')  # horizontal bar chart
plt.title("Bottom 50 Mutual Information Scores")
plt.xlabel("Mutual Information Score")
plt.ylabel('Feature')
plt.tight_layout()
plt.show()

In [None]:
# Features whose mutual information with the target exceeds 0.013
mi_score_features = mi_scores.loc[mi_scores > 0.013].index.tolist()

In [None]:
# Union of the chi-squared and mutual-information selections. Building the list
# from X_train_proc's own column order keeps it deterministic; list(set(...))
# yields an order that can change from one Python session to the next, which
# changes the column order of every frame derived from it.
selected_names = set(chi_scores_features) | set(mi_score_features)
final_features = [col for col in X_train_proc.columns if col in selected_names]
X_fs_train_final = X_train_proc[final_features]


In [None]:
# Same feature subset, taken from the validation frame
X_fs_val_final = X_val_proc.loc[:, final_features]

In [None]:
# Processed frames with the one-hot category columns removed
X_train_wo_cat = X_train_proc.drop(cat_cols, axis=1)
X_val_wo_cat = X_val_proc.drop(cat_cols, axis=1)

In [None]:
# L1 logistic regression (saga) with a manual 1:600 class weighting, no category columns
log_reg_wo_cat = LogisticRegression(
    penalty='l1',
    solver='saga',
    C=0.10,
    class_weight={0:1, 1:600},
    random_state=42,
)
log_reg_wo_cat.fit(X_train_proc_wo_cat, y_train)

# Validation metrics
y_pred = log_reg_wo_cat.predict(X_val_proc_wo_cat)
recall_macro = recall_score(y_val, y_pred, average='macro')

print("Accuracy:", accuracy_score(y_val, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_val, y_pred))
print("Macro Recall:", recall_macro)

In [None]:
# L1 logistic regression (saga) with a manual 1:600 class weighting, all processed columns
log_reg_w_cat = LogisticRegression(
    penalty='l1',
    solver='saga',
    C=10,
    class_weight={0:1, 1:600},
    random_state=42,
)
log_reg_w_cat.fit(X_train_proc, y_train)

# Validation metrics
y_pred = log_reg_w_cat.predict(X_val_proc)
recall_macro = recall_score(y_val, y_pred, average='macro')

print("Accuracy:", accuracy_score(y_val, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_val, y_pred))
print("Macro Recall:", recall_macro)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest without the category columns. random_state is fixed so the
# bootstrap samples and feature sub-sampling, and therefore the metrics below,
# are reproducible between runs (matching the seed used elsewhere in the notebook).
rf_wo_cat = RandomForestClassifier(n_estimators=400,
                                   min_samples_split=5,
                                   min_samples_leaf=3,
                                   max_features='log2',
                                   max_depth=7,
                                   class_weight={0: 1.0, 1: 600.0},
                                   random_state=42)

rf_wo_cat.fit(X_train_proc_wo_cat, y_train)

# Validation metrics
y_pred = rf_wo_cat.predict(X_val_proc_wo_cat)
print("Accuracy:", accuracy_score(y_val, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_val, y_pred))

recall_macro = recall_score(y_val, y_pred, average='macro')
print("Macro Recall:", recall_macro)



In [None]:

# Random forest on all processed columns. random_state is fixed so the
# bootstrap samples and feature sub-sampling, and therefore the metrics below,
# are reproducible between runs (matching the seed used elsewhere in the notebook).
rf_w_cat = RandomForestClassifier(n_estimators=300,
                                   min_samples_split=10,
                                   min_samples_leaf=3,
                                   max_features='log2',
                                   max_depth=7,
                                   class_weight={0: 1.0, 1: 600.0},
                                   random_state=42)

rf_w_cat.fit(X_train_proc, y_train)

# Validation metrics
y_pred = rf_w_cat.predict(X_val_proc)
print("Accuracy:", accuracy_score(y_val, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_val, y_pred))

recall_macro = recall_score(y_val, y_pred, average='macro')
print("Macro Recall:", recall_macro)

In [None]:
from xgboost import XGBClassifier

# XGBoost without the category columns; positives are up-weighted by 600
xgb_wo_cat_params = {
    "max_depth": 3,
    "learning_rate": 0.05,
    "subsample": 0.8,
    "colsample_bytree": 1,
    "reg_alpha": 1,
    "reg_lambda": 1,
    "scale_pos_weight": 600,
}
xgb_wo_cat = XGBClassifier(**xgb_wo_cat_params)
xgb_wo_cat.fit(X_train_proc_wo_cat, y_train)

# Validation metrics
y_pred = xgb_wo_cat.predict(X_val_proc_wo_cat)
recall_macro = recall_score(y_val, y_pred, average='macro')

print("Accuracy:", accuracy_score(y_val, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_val, y_pred))
print("Macro Recall:", recall_macro)

In [None]:
# XGBoost on all processed columns; positives are up-weighted by 600
xgb_w_cat_params = {
    "max_depth": 3,
    "learning_rate": 0.05,
    "subsample": 0.6,
    "colsample_bytree": 0.8,
    "reg_alpha": 0,
    "reg_lambda": 1,
    "scale_pos_weight": 600,
}
xgb_w_cat = XGBClassifier(**xgb_w_cat_params)
xgb_w_cat.fit(X_train_proc, y_train)

# Validation metrics
y_pred = xgb_w_cat.predict(X_val_proc)
recall_macro = recall_score(y_val, y_pred, average='macro')

print("Accuracy:", accuracy_score(y_val, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_val, y_pred))
print("Macro Recall:", recall_macro)

In [None]:
from sklearn.feature_selection import SelectKBest, chi2
import matplotlib.pyplot as plt

# Chi-squared score of every one-hot category column, this time sorted ascending
X_train_cat_chi = X_train_proc.loc[:, cat_cols]
X_val_cat_chi = X_val_proc.loc[:, cat_cols]

chi_selector = SelectKBest(score_func=chi2, k='all').fit(X_train_cat_chi, y_train)

chi_scores = pd.Series(chi_selector.scores_, index=X_train_cat_chi.columns).sort_values(ascending=True)

# k='all' keeps every category column; this only reorders them by score
top_k_cat = list(chi_scores.index)
X_cat_reduced = X_train_cat_chi[top_k_cat]

chi_scores.sort_values(ascending=True).plot(kind='barh', figsize=(8, 6), title='Chi-Squared Scores by Category')
plt.xlabel("Chi-squared Score")
plt.tight_layout()
plt.show()


In [None]:
# Category columns whose chi-squared score is below 10
low_chi_features = chi_scores.loc[chi_scores < 10].index.tolist()

In [None]:
# Display the high chi-squared category columns (score > 10). This list was
# built in an earlier cell and is not recomputed from the re-sorted scores.
chi_scores_features

In [None]:
# Non-category features plus the category columns with a chi-squared score above 10
wo_cat = X_train_proc_wo_cat.columns.tolist()
high_chi_cat = [*wo_cat, *chi_scores_features]
X_train_high_chi = X_train_proc.loc[:, high_chi_cat]
X_train_high_chi

In [None]:
# Non-category features plus the category columns with a chi-squared score below 10
wo_cat = X_train_proc_wo_cat.columns.tolist()
low_chi_cat = [*wo_cat, *low_chi_features]
X_train_low_chi = X_train_proc.loc[:, low_chi_cat]
X_train_low_chi

In [None]:
from sklearn.metrics import average_precision_score
# XGBoost on the non-category features plus the low chi-squared category columns
xgb_low_chi = XGBClassifier(
    max_depth = 3,
    learning_rate = 0.05,
    subsample = 0.8,
    colsample_bytree = 1,
    reg_alpha = 1,
    reg_lambda = 1,
    scale_pos_weight = 600
)

xgb_low_chi.fit(X_train_low_chi, y_train)

X_val_low_chi = X_val_proc[low_chi_cat]
y_pred = xgb_low_chi.predict(X_val_low_chi)
print("Accuracy:", accuracy_score(y_val, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_val, y_pred))

recall_macro = recall_score(y_val, y_pred, average='macro')
print("Macro Recall:", recall_macro)

# PR AUC is a ranking metric: it needs the predicted probability of the
# positive class. Hard 0/1 labels reduce the curve to a single operating point.
y_score = xgb_low_chi.predict_proba(X_val_low_chi)[:, 1]
print(f'PR AUC:{average_precision_score(y_val,y_score)}')


In [None]:
# XGBoost on the non-category features plus the high chi-squared category columns
xgb_high_chi = XGBClassifier(
    max_depth = 3,
    learning_rate = 0.05,
    subsample = 0.8,
    colsample_bytree = 1,
    reg_alpha = 1,
    reg_lambda = 1,
    scale_pos_weight = 600
)

xgb_high_chi.fit(X_train_high_chi, y_train)

X_val_high_chi = X_val_proc[high_chi_cat]
y_pred = xgb_high_chi.predict(X_val_high_chi)
print("Accuracy:", accuracy_score(y_val, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_val, y_pred))

recall_macro = recall_score(y_val, y_pred, average='macro')
print("Macro Recall:", recall_macro)

# PR AUC is a ranking metric: it needs the predicted probability of the
# positive class. Hard 0/1 labels reduce the curve to a single operating point.
y_score = xgb_high_chi.predict_proba(X_val_high_chi)[:, 1]
print(f'PR AUC:{average_precision_score(y_val,y_score)}')


In [None]:
# L1 logistic regression on the non-category features plus the high chi-squared category columns
log_reg_wo_cat = LogisticRegression(penalty='l1', solver='saga', class_weight={0:1, 1:600}, random_state=42, C=0.10)
log_reg_wo_cat.fit(X_train_high_chi, y_train)

X_val_high_chi = X_val_proc[high_chi_cat]
y_pred = log_reg_wo_cat.predict(X_val_high_chi)
print("Accuracy:", accuracy_score(y_val, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_val, y_pred))

recall_macro = recall_score(y_val, y_pred, average='macro')
print("Macro Recall:", recall_macro)

# PR AUC is a ranking metric: it needs the predicted probability of the
# positive class. Hard 0/1 labels reduce the curve to a single operating point.
y_score = log_reg_wo_cat.predict_proba(X_val_high_chi)[:, 1]
print(f'PR AUC:{average_precision_score(y_val,y_score)}')

In [None]:
# L1 logistic regression on the non-category features plus the low chi-squared category columns
log_reg_wo_cat = LogisticRegression(penalty='l1', solver='saga', class_weight={0:1, 1:600}, random_state=42, C=0.10)
log_reg_wo_cat.fit(X_train_low_chi, y_train)

X_val_low_chi = X_val_proc[low_chi_cat]
y_pred = log_reg_wo_cat.predict(X_val_low_chi)
print("Accuracy:", accuracy_score(y_val, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_val, y_pred))

recall_macro = recall_score(y_val, y_pred, average='macro')
print("Macro Recall:", recall_macro)

# PR AUC is a ranking metric: it needs the predicted probability of the
# positive class. Hard 0/1 labels reduce the curve to a single operating point.
y_score = log_reg_wo_cat.predict_proba(X_val_low_chi)[:, 1]
print(f'PR AUC:{average_precision_score(y_val,y_score)}')

In [None]:
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import make_scorer, recall_score
from sklearn.model_selection import cross_val_score

# Hard-voting ensemble (XGBoost + random forest) on all processed columns
xgb_w_cat = XGBClassifier(
    max_depth = 3,
    learning_rate = 0.05,
    subsample = 0.6,
    colsample_bytree = 0.8,
    reg_alpha = 0,
    reg_lambda = 1,
    scale_pos_weight = 600
)

# random_state fixed so the forest, and therefore the ensemble metrics, are reproducible
rf_w_cat = RandomForestClassifier(n_estimators=300,
                                   min_samples_split=10,
                                   min_samples_leaf=3,
                                   max_features='log2',
                                   max_depth=7,
                                   class_weight={0: 1.0, 1: 600.0},
                                   random_state=42)

voting_w_cat_hard = VotingClassifier(estimators=[
    ('xgb', xgb_w_cat), ('rf', rf_w_cat)], voting='hard'
)

# Fit on the full training set; this fitted ensemble is used for the validation predictions below
voting_w_cat_hard.fit(X_train_proc, y_train)

recall_macro_scorer = make_scorer(recall_score, average='macro')

# cross_val_score fits clones per fold and leaves the ensemble fitted above untouched
scores = cross_val_score(voting_w_cat_hard, X_train_proc, y_train, cv=kfold, scoring=recall_macro_scorer)

print("Macro Recall (per fold):", scores)
print("Mean Macro Recall:", scores.mean())

y_pred = voting_w_cat_hard.predict(X_val_proc)
print("Accuracy:", accuracy_score(y_val, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_val, y_pred))

recall_macro = recall_score(y_val, y_pred, average='macro')
print("Macro Recall:", recall_macro)

# NOTE: a hard-voting ensemble exposes no predict_proba, so this value is
# computed from 0/1 labels. It describes a single operating point and is not
# comparable with a PR AUC computed from probabilities.
print(f'PR AUC:{average_precision_score(y_val,y_pred)}')

In [None]:
# Soft-voting ensemble (XGBoost + random forest) on all processed columns
xgb_w_cat = XGBClassifier(
    max_depth = 3,
    learning_rate = 0.05,
    subsample = 0.6,
    colsample_bytree = 0.8,
    reg_alpha = 0,
    reg_lambda = 1,
    scale_pos_weight = 600
)

# random_state fixed so the forest, and therefore the ensemble metrics, are reproducible
rf_w_cat = RandomForestClassifier(n_estimators=300,
                                   min_samples_split=10,
                                   min_samples_leaf=3,
                                   max_features='log2',
                                   max_depth=7,
                                   class_weight={0: 1.0, 1: 600.0},
                                   random_state=42)

voting_w_cat_soft = VotingClassifier(estimators=[
    ('xgb', xgb_w_cat), ('rf', rf_w_cat)], voting='soft'
)

# Fit on the full training set; this fitted ensemble is used for the validation predictions below
voting_w_cat_soft.fit(X_train_proc, y_train)

recall_macro_scorer = make_scorer(recall_score, average='macro')

scores = cross_val_score(voting_w_cat_soft, X_train_proc, y_train, cv=kfold, scoring=recall_macro_scorer)

print("Macro Recall (per fold):", scores)
print("Mean Macro Recall:", scores.mean())

y_pred = voting_w_cat_soft.predict(X_val_proc)
print("Accuracy:", accuracy_score(y_val, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_val, y_pred))

recall_macro = recall_score(y_val, y_pred, average='macro')
print("Macro Recall:", recall_macro)

# PR AUC is a ranking metric: use the averaged positive-class probability of
# the soft-voting ensemble instead of its hard 0/1 predictions.
y_score = voting_w_cat_soft.predict_proba(X_val_proc)[:, 1]
print(f'PR AUC:{average_precision_score(y_val,y_score)}')

In [None]:
# Soft-voting ensemble (XGBoost + random forest) WITHOUT the category columns
xgb_wo_cat = XGBClassifier(
    max_depth = 3,
    learning_rate = 0.05,
    subsample = 0.8,
    colsample_bytree = 1,
    reg_alpha = 1,
    reg_lambda = 1,
    scale_pos_weight = 600
)

# Stand-alone fit kept so `xgb_wo_cat` remains a fitted model after this cell;
# the VotingClassifier below fits its own clones.
xgb_wo_cat.fit(X_train_proc_wo_cat, y_train)

# random_state fixed so the forest, and therefore the ensemble metrics, are reproducible
rf_wo_cat = RandomForestClassifier(n_estimators=400,
                                   min_samples_split=5,
                                   min_samples_leaf=3,
                                   max_features='log2',
                                   max_depth=7,
                                   class_weight={0: 1.0, 1: 600.0},
                                   random_state=42)

rf_wo_cat.fit(X_train_proc_wo_cat, y_train)

voting_wo_cat_soft = VotingClassifier(estimators=[
    ('xgb', xgb_wo_cat), ('rf', rf_wo_cat)], voting='soft'
)

# The "wo_cat" ensemble must be trained, cross-validated and evaluated on the
# frames without category columns. Using X_train_proc / X_val_proc here would
# just repeat the with-category experiment under a different name.
voting_wo_cat_soft.fit(X_train_proc_wo_cat, y_train)

recall_macro_scorer = make_scorer(recall_score, average='macro')

scores = cross_val_score(voting_wo_cat_soft, X_train_proc_wo_cat, y_train, cv=kfold, scoring=recall_macro_scorer)

print("Macro Recall (per fold):", scores)
print("Mean Macro Recall:", scores.mean())

y_pred = voting_wo_cat_soft.predict(X_val_proc_wo_cat)
print("Accuracy:", accuracy_score(y_val, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_val, y_pred))

recall_macro = recall_score(y_val, y_pred, average='macro')
print("Macro Recall:", recall_macro)

# PR AUC is a ranking metric: use the averaged positive-class probability of
# the soft-voting ensemble instead of its hard 0/1 predictions.
y_score = voting_wo_cat_soft.predict_proba(X_val_proc_wo_cat)[:, 1]
print(f'PR AUC:{average_precision_score(y_val,y_score)}')

## Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, recall_score, f1_score, precision_score
import pandas as pd

# Scoring metrics
# Scoring metrics: macro-averaged recall, F1 and precision
_macro_metrics = {
    'recall_macro': recall_score,
    'f1_macro': f1_score,
    'precision_macro': precision_score,
}
scorers = {name: make_scorer(metric, average='macro') for name, metric in _macro_metrics.items()}

# Worker count shared by the forest and by the grid search
n_jobs = 24
rf_defaults = {'n_estimators': 300, 'random_state': 42, 'n_jobs': n_jobs}

# Hyper-parameter grid explored by run_rf_gridsearch
param_grid = {
    'n_estimators': [300, 400],
    'class_weight': ["balanced", {0: 1.0, 1: 600.0}],
    'max_depth': [5, 7, 10],
    'min_samples_split': [5, 10],
    'min_samples_leaf': [3, 5],
    'max_features': ["sqrt", "log2"]
}

def run_rf_gridsearch(X, y, kfold, label):
    """Grid-search a random forest and report the best macro-recall setting.

    Reads the module-level ``rf_defaults``, ``param_grid``, ``scorers`` and
    ``n_jobs``.

    Parameters
    ----------
    X, y : training features and target passed to ``GridSearchCV.fit``.
    kfold : cross-validation splitter passed as ``cv``.
    label : text printed in the progress message.

    Returns
    -------
    (best_estimator, fitted GridSearchCV object)
    """
    print(f"\n Running GridSearchCV for: {label}")

    search = GridSearchCV(
        estimator=RandomForestClassifier(**rf_defaults),
        param_grid=param_grid,
        scoring=scorers,
        refit="recall_macro",  # final refit uses the best macro-recall configuration
        cv=kfold,
        n_jobs=n_jobs,
        verbose=1,
        return_train_score=False,
    )
    search.fit(X, y)

    print(f" Best params: {search.best_params_}")
    print(f" Best recall_macro: {search.best_score_:.4f}")
    return search.best_estimator_, search


In [None]:
# Random-forest grid search on the features without category columns
best_rf_wo, gcv_wo = run_rf_gridsearch(
    X=X_train_proc_wo_cat,
    y=y_train,
    kfold=kfold,
    label="WITHOUT cat_cols",
)

In [None]:
# Random-forest grid search on all processed features
best_rf_with, gcv_with = run_rf_gridsearch(
    X=X_train_proc,
    y=y_train,
    kfold=kfold,
    label="WITH cat_cols",
)

## XGBoost

In [None]:
# Smote
from imblearn.over_sampling import SMOTE

sm = SMOTE(random_state=42)

# Keep one resampled target per feature set. Re-using a single `y_smote` name
# lets the second call overwrite the first, so the "without categorical
# columns" summary would report the target of the other resample.
X_smote_wo_cat, y_smote_wo_cat = sm.fit_resample(X_train_proc_wo_cat, y_train)
X_smote, y_smote = sm.fit_resample(X_train_proc, y_train)

print("After SMOTE without categorical columns:", X_smote_wo_cat.shape, y_smote_wo_cat.value_counts().to_dict())
print("After SMOTE all columns:", X_smote.shape, y_smote.value_counts().to_dict())

In [None]:
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, recall_score, average_precision_score, f1_score
import pandas as pd

def run_xgb_gridsearch(X, y, cv, verbose=2, n_jobs=20):
    """Grid-search an XGBoost classifier and print the ten best configurations.

    Parameters
    ----------
    X, y : training features and target passed to ``GridSearchCV.fit``.
    cv : cross-validation splitter (or any value accepted by ``GridSearchCV``).
    verbose : verbosity level forwarded to ``GridSearchCV``.
    n_jobs : worker count used by both the booster and the grid search.

    Returns
    -------
    (fitted GridSearchCV object, DataFrame of parameters and mean test scores
    sorted by macro recall, best first)
    """
    xgb_base = XGBClassifier(
        n_estimators=300,
        use_label_encoder=False,
        eval_metric="logloss",
        random_state=42,
        n_jobs=n_jobs
    )

    param_grid = {
        'max_depth': [3, 4, 5, 6],
        'learning_rate': [0.01, 0.05, 0.1],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'reg_alpha': [0, 0.5, 1.0],
        'reg_lambda': [0.5, 1.0, 2.0],
        'scale_pos_weight': [1.0, 600.0]
    }

    # 'average_precision' is scikit-learn's built-in PR-AUC scorer. It replaces
    # make_scorer(average_precision_score, needs_proba=True), whose
    # `needs_proba` argument is deprecated and removed in recent releases.
    scorers = {
        'recall_macro': make_scorer(recall_score, average='macro'),
        'f1_macro': make_scorer(f1_score, average='macro'),
        'pr_auc': 'average_precision'
    }

    gcv = GridSearchCV(
        estimator=xgb_base,
        param_grid=param_grid,
        scoring=scorers,
        refit='recall_macro',  # final refit uses the best macro-recall configuration
        cv=cv,
        n_jobs=n_jobs,
        verbose=verbose,
        return_train_score=False
    )

    gcv.fit(X, y)

    results = pd.DataFrame(gcv.cv_results_).sort_values("mean_test_recall_macro", ascending=False)

    top_cols = [
        'param_max_depth', 'param_learning_rate', 'param_subsample', 'param_colsample_bytree',
        'param_reg_alpha', 'param_reg_lambda', 'param_scale_pos_weight',
        'mean_test_recall_macro', 'mean_test_f1_macro', 'mean_test_pr_auc'
    ]

    print(results[top_cols].head(10))
    return gcv, results[top_cols]


In [None]:
# XGBoost grid search on the SMOTE-resampled frame with all features.
# NOTE(review): the frame was resampled before cross-validation, so validation
# folds can contain synthetic rows; also confirm that `kfold` (built from a
# table aligned with X_train) supports the larger resampled frame.
gcv_full, results_full = run_xgb_gridsearch(
    X=X_smote,
    y=y_smote,
    cv=kfold,
)

In [None]:
# XGBoost grid search on the SMOTE-resampled frame without category columns.
# NOTE(review): the frame was resampled before cross-validation, so validation
# folds can contain synthetic rows; also confirm that `kfold` (built from a
# table aligned with X_train) supports the larger resampled frame.
gcv_wo_cat, results_wo_cat = run_xgb_gridsearch(
    X=X_smote_wo_cat,
    y=y_smote,
    cv=kfold,
)