In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for full_path in (os.path.join(dirname, filename) for filename in filenames):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

### Load Python Packages

In [None]:
#basics
import numpy as np
import pandas as pd 
import seaborn as sns
import time
import matplotlib.pyplot as plt
import missingno as msno
from sklearn.base import clone

import warnings
warnings.filterwarnings("ignore")

#preprocessing
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import QuantileTransformer, quantile_transform
from sklearn.preprocessing import StandardScaler

#statistics
from scipy.stats import randint, mode

#feature engineering
from sklearn.feature_selection import mutual_info_classif

#transformers and pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline, FeatureUnion
from sklearn import set_config

#feature engineering
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import RFECV

#algorithms
from xgboost import XGBClassifier
from catboost import CatBoostClassifier, Pool
from lightgbm import LGBMClassifier
from lightgbm.callback import early_stopping, log_evaluation
from sklearn.linear_model import LogisticRegression

#model evaluation
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.model_selection import StratifiedShuffleSplit, StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, log_loss, auc, accuracy_score, balanced_accuracy_score
from sklearn.metrics import make_scorer, RocCurveDisplay, confusion_matrix

#model evaluation
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_curve, confusion_matrix, matthews_corrcoef, make_scorer

# Optuna and visualization tools
import optuna
from optuna.samplers import TPESampler
from optuna.visualization import plot_contour
from optuna.visualization import plot_edf
from optuna.visualization import plot_intermediate_values
from optuna.visualization import plot_optimization_history
from optuna.visualization import plot_parallel_coordinate
from optuna.visualization import plot_param_importances
from optuna.visualization import plot_slice

# Single seed reused below for the EDA sample, the CV splitter and the models.
random_state = 42

### Let's look into the data

In [None]:
# Read the competition data; the first CSV column becomes the index of each frame
train_df = pd.read_csv('/kaggle/input/playground-series-s4e8/train.csv', index_col=[0])
test_df = pd.read_csv('/kaggle/input/playground-series-s4e8/test.csv', index_col=[0])

# Preview the first rows of the training data
train_df.head()

### Check for missing values

In [None]:
%matplotlib inline
# Draw the missingno matrix of the training data to see where values are missing
msno.matrix(train_df)
plt.show()

In [None]:
# Missing values per column, largest first, as a count and as a percentage of the training rows.
missing = (
    train_df.isnull()
    .sum()
    .sort_values(ascending=False)
    .to_frame("missing_count")
)
missing["missing_percent"] = missing["missing_count"] / len(train_df) * 100
missing.style.background_gradient('viridis')

### Descriptive statistics

In [None]:
# Descriptive statistics of the numerical features, transposed so that each
# feature is a row and each statistic a column

train_df.describe().T

In [None]:
# Target frequency: share of each class in the training data, drawn as a pie chart.

class_counts = train_df.groupby('class')['class'].count()
palette_color = sns.color_palette('pastel')
# Same small offset for every wedge so the slices are slightly separated.
explode = [0.02] * train_df['class'].nunique()

pie_style = dict(
    colors=palette_color,
    explode=explode,
    autopct="%1.1f%%",
    shadow=True,        # shadow for better visibility
    startangle=140,     # start angle for better alignment
    textprops={'fontsize': 14},
    wedgeprops={'edgecolor': 'black', 'linewidth': 1.5},
)

plt.figure(figsize=(10, 10))
class_counts.plot.pie(**pie_style)

plt.title('Class Distribution', fontsize=18, weight='bold')

# Equal aspect ratio so the pie is drawn as a circle.
plt.axis('equal')

plt.show()

In [None]:
# Replace the target labels with integer codes in place. The fitted encoder `le`
# is kept so predictions can be mapped back to the original labels with
# le.inverse_transform before the submission is written.
# NOTE(review): re-running this cell re-fits `le` on the already encoded column.
le = LabelEncoder()
train_df['class'] = le.fit_transform(train_df['class'])

### Grouping features for preprocessing purposes

In [None]:
# Number of distinct values per column, lowest first
train_df.nunique().sort_values()

In [None]:
# Column dtypes and non-null counts of the training data
train_df.info()

In [None]:
# Store numerical and categorical features in separate lists for visualization
# and preprocessing purposes.
target = "class"

# Every column except the target, in the order it appears in train_df.
feature_list = [feature for feature in train_df.columns if feature != target]

numerical_features = ['stem-height', 'cap-diameter', 'stem-width']

# Built with a list comprehension rather than a set difference so the column
# order is the same on every run (set ordering of strings is not stable
# across interpreter sessions).
categorical_features = [feature for feature in feature_list if feature not in numerical_features]

# sorted() returns the sorted lists; list.sort() returns None, so comparing the
# results of .sort() would always be True and check nothing.
assert sorted(feature_list) == sorted(numerical_features + categorical_features), \
    "numerical_features + categorical_features must cover exactly the non-target columns"

In [None]:
# Reproducible 10% random sample of the training rows, used by the EDA cells below
eda_df = train_df.sample(frac= 0.1, random_state=random_state)

In [None]:
# One box plot per numerical feature, split by class, drawn from the EDA sample.
fig, ax = plt.subplots(1, 3, figsize=(30, 10))
for subplot, var in zip(ax.ravel(), numerical_features):
    sns.boxplot(data=eda_df, x='class', y=var, palette='Set3', ax=subplot)

In [None]:
# Check cardinality: number of distinct values of each categorical feature, lowest first
train_df[categorical_features].nunique().sort_values()

In [None]:
# Categoricals: only the moderate-cardinality features are visualized.
# Each bar is seaborn's default barplot estimate of the encoded class per category.
moderate_cardinality_features = ['season', 'veil-type', 'has-ring', 'veil-color']

fig, ax = plt.subplots(2, 2, figsize=(30, 30))
for subplot, var in zip(ax.ravel(), moderate_cardinality_features):
    sns.barplot(data=eda_df, x=var, y='class', palette='Set3', ax=subplot)
    tick_labels = subplot.get_xticklabels()
    subplot.set_xticklabels(tick_labels, rotation=45, ha='right')

##### These are very strong features: some categories occur in only one class. For example, if the veil-type is 'l', the mushroom is non-poisonous; if it is 't', the mushroom is poisonous.

In [None]:
# Mutual information between each numerical feature and the target, on the EDA sample.
y_sampled = eda_df['class']
mutual_df = eda_df[numerical_features]

# Missing numerical values are filled with 0 before scoring.
mi_scores = mutual_info_classif(mutual_df.fillna(0), y_sampled, random_state=random_state)

mutual_info = (
    pd.Series(mi_scores, index=mutual_df.columns)
    .sort_values(ascending=False)
    .to_frame("Numerical_Feature_MI")
)
mutual_info.style.background_gradient("cool")

In [None]:
# Mutual information between each categorical feature and the target, on the EDA sample.
# .copy() gives an independent frame, so the integer codes written below do not
# go into a slice of eda_df.
mutual_df_categorical = eda_df[categorical_features].copy()

# Categorical features must be integer-encoded to compute mutual information.
# factorize() maps missing values to -1, so "missing" stays its own level and
# no NaN is left to fill afterwards.
for colname in mutual_df_categorical:
    mutual_df_categorical[colname] = mutual_df_categorical[colname].factorize()[0]

# discrete_features=True: the codes are arbitrary category labels, not magnitudes,
# so they must be scored as discrete variables rather than as continuous ones.
mutual_info = mutual_info_classif(
    mutual_df_categorical,
    y_sampled,
    discrete_features=True,
    random_state=random_state,
)

mutual_info = pd.Series(mutual_info, index=mutual_df_categorical.columns)
pd.DataFrame(mutual_info.sort_values(ascending=False), columns = ["Categorical_Feature_MI"] ).style.background_gradient("cool")

In [None]:
# Pair plot of the numerical features coloured by class (lower triangle only, corner=True)
sns.pairplot(eda_df[numerical_features + ["class"]], hue="class",  corner=True)

### Preprocessing

In [None]:
# Convert the categorical feature columns of both frames to the pandas 'category' dtype.
# Reference for why we would do that: https://catboost.ai/en/docs/concepts/speed-up-training
# NOTE(review): the categories are inferred separately for train and test here.
train_df[categorical_features] = train_df[categorical_features].astype('category')
test_df[categorical_features] = test_df[categorical_features].astype('category')

In [None]:
#Catboost complains about missing value format
def preprocess_catboost(train_df, test_data, cat_features):
    """Return copies of both frames with the categorical columns as plain strings.

    Missing values in the listed columns are replaced by the literal string
    ``'NaN'`` so that CatBoost receives string categories only.

    The inputs are not modified. Earlier, the conversion was done in place and
    ``astype(str)`` ran before ``fillna``, so missing values had already become
    the string ``'nan'`` and the fill never applied.

    Parameters
    ----------
    train_df, test_data : pandas.DataFrame
        Frames that contain every column named in ``cat_features``.
    cat_features : iterable of str or None
        Names of the categorical columns to convert; ``None`` converts nothing.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_copy, test_copy)`` with the converted columns.
    """
    train_out = train_df.copy()
    test_out = test_data.copy()
    for col in cat_features or []:
        for frame in (train_out, test_out):
            # Go through object dtype first: a 'category' column cannot take a
            # fill value that is not already one of its categories.
            as_object = frame[col].astype(object)
            frame[col] = as_object.where(as_object.notna(), 'NaN').astype(str)
    return train_out, test_out

In [None]:
# For XGB: ordinal-encode the categorical columns and pass the remaining columns through.
# Categories unseen during fit are encoded as -1.
# Check this discussion to see how XGB fails to handle cat features:
# https://www.kaggle.com/competitions/playground-series-s4e8/discussion/523781#2945249
ordinal_step = (
    'encoder',
    OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1),
    categorical_features,
)
encoder = ColumnTransformer(transformers=[ordinal_step], remainder='passthrough')

encoder

### Modeling

In [None]:
# Separate the target from the features; train_df holds only features from here on.
# NOTE(review): re-running this cell fails, because 'class' has already been dropped.
y = train_df['class']
train_df = train_df.drop(['class'], axis=1)

In [None]:
# CV strategy: 5 stratified, shuffled folds with a fixed seed, shared by every model below
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=random_state)

In [None]:
def cross_validate_score(model, train_df, y, cv, test_data):
    """Cross-validate ``model`` and collect out-of-fold and averaged test probabilities.

    For every split produced by ``cv`` a fresh clone of ``model`` is fitted on the
    training part, scored on the validation part with the Matthews correlation
    coefficient (probabilities thresholded at 0.5) and used to predict
    ``test_data``. Per-fold scores and their mean / standard deviation are printed.

    Parameters
    ----------
    model : estimator
        Unfitted classifier or pipeline exposing ``fit`` and ``predict_proba``.
        ``LGBMClassifier`` and ``CatBoostClassifier`` instances are fitted with
        early stopping (patience 50) on the validation fold; any other model is
        fitted without an evaluation set.
    train_df : pandas.DataFrame
        Training features (indexed positionally via ``.iloc``).
    y : pandas.Series
        Target, positionally aligned with ``train_df``.
    cv : cross-validation splitter
        Object providing ``split(X, y)`` and ``get_n_splits()``.
    test_data : pandas.DataFrame
        Test features with the same columns as ``train_df``.

    Returns
    -------
    val_scores : list of float
        MCC of each fold.
    test_preds : numpy.ndarray of shape (n_test,)
        Mean over the folds of the second ``predict_proba`` column on ``test_data``.
    oof_preds : numpy.ndarray of shape (n_train,)
        Out-of-fold value of the second ``predict_proba`` column for each training row.
    """
    n_splits = cv.get_n_splits()
    val_scores = []
    test_preds = np.zeros((test_data.shape[0],))
    oof_preds = np.zeros((train_df.shape[0],))

    if isinstance(model, CatBoostClassifier):
        # `or []` also covers a model whose cat_features parameter is None.
        cat_features = model.get_params().get('cat_features') or []
        # Hand over copies: the string conversion must never leak into the
        # caller's frames, otherwise every later cell that uses them would see
        # different dtypes depending on which cells ran before it.
        train_df, test_data = preprocess_catboost(train_df.copy(), test_data.copy(), cat_features)

    for fold, (train_idx, val_idx) in enumerate(cv.split(train_df, y)):
        X_train = train_df.iloc[train_idx].reset_index(drop=True)
        y_train = y.iloc[train_idx].reset_index(drop=True)

        X_val = train_df.iloc[val_idx].reset_index(drop=True)
        y_val = y.iloc[val_idx].reset_index(drop=True)

        # Fresh, unfitted copy for each fold so nothing carries over between folds.
        fold_model = clone(model)

        eval_set = [(X_val, y_val)]

        # NOTE: for LightGBM and CatBoost the validation fold also drives early
        # stopping, so their fold scores are slightly optimistic.
        if isinstance(fold_model, LGBMClassifier):
            fold_model.fit(
                X_train, y_train,
                eval_set=eval_set,
                callbacks=[early_stopping(50)],
            )
        elif isinstance(fold_model, CatBoostClassifier):
            fold_model.fit(
                X_train, y_train,
                eval_set=eval_set,
                early_stopping_rounds=50,
                verbose=False
            )
        else:
            fold_model.fit(X_train, y_train)

        val_probs = fold_model.predict_proba(X_val)[:, 1]  # Get the probabilities
        val_preds = (val_probs > 0.5).astype(int)          # Convert probabilities to class labels for MCC

        val_score = matthews_corrcoef(y_val, val_preds)  # Calculate MCC
        print(f'Fold {fold}: MCC = {val_score:.5f}')

        val_scores.append(val_score)

        oof_preds[val_idx] = val_probs  # Store the probabilities for OOF predictions

        # Each fold contributes 1 / n_splits, so the sum is the mean over folds.
        test_preds += fold_model.predict_proba(test_data)[:, 1] / n_splits

    mean_val_score = np.mean(val_scores)
    std_val_score = np.std(val_scores)
    print(f'Mean Validation MCC: {mean_val_score:.7f}')
    print(f'Std Validation MCC: {std_val_score:.7f}')

    return val_scores, test_preds, oof_preds

In [None]:
# Empty frames that collect one column per model: per-fold scores, test
# probabilities and out-of-fold probabilities.
cv_summary, test_preds, oof_preds = pd.DataFrame(), pd.DataFrame(), pd.DataFrame()

### XGboost

In [None]:
# Hyperparameters for XGBoost (the variable name indicates they come from an Optuna search).
# NOTE(review): 'gpu_hist' needs a GPU and is reportedly deprecated in recent XGBoost
# releases — confirm against the installed version.
xgb_optuna_params = {
 'tree_method': 'gpu_hist',
 'n_estimators': 1696,
 'alpha': 4.956752183261538e-07,
 'subsample': 0.7349948172684168,
 'colsample_bytree': 0.30171411525842506,
 'max_depth': 15, 
 'min_child_weight': 6,
 'learning_rate': 0.013301072238797047,
 'gamma': 5.634602153104516e-08
}


xgb_tuned = XGBClassifier(**xgb_optuna_params, random_state=random_state)


# XGBoost is fed through the ordinal `encoder` defined above, so it sees integer codes
xgb_pipeline = make_pipeline(encoder, xgb_tuned)

In [None]:
%%time
# Cross-validate the XGBoost pipeline; store fold scores, averaged test probabilities and OOF probabilities
cv_summary['xgb'], test_preds['xgb'], oof_preds['xgb'] = cross_validate_score(xgb_pipeline, train_df , y,  cv, test_df)

### Lightgbm

In [None]:
# Hyperparameters for LightGBM (the variable name indicates they come from an Optuna search).
# n_estimators acts as an upper bound: cross_validate_score fits LGBMClassifier
# with an early-stopping callback (patience 50).
lgbm_optuna_params = {
    'n_estimators': 10000,
    'learning_rate': 0.02,
    "categorical_feature" : categorical_features,
    'device': 'gpu',
    'max_depth': 10,
    'min_data_in_leaf': 85,
    'subsample': 0.6720606456166781,
    'max_bin': 240,
    'feature_fraction': 0.6946327643448142,

}



# verbose=-1 is passed to keep LightGBM's own logging quiet
lgbm_tuned = LGBMClassifier(**lgbm_optuna_params, random_state=random_state, verbose=-1)

In [None]:
%%time
# Cross-validate LightGBM; store fold scores, averaged test probabilities and OOF probabilities
cv_summary['lgbm'], test_preds['lgbm'], oof_preds['lgbm'] = cross_validate_score(lgbm_tuned, train_df , y,  cv, test_df)

### Catboost

In [None]:
# Parameters found by the Optuna tuning process.
# n_estimators acts as an upper bound: cross_validate_score fits CatBoost with
# early_stopping_rounds=50. 'cat_features' is also read back by
# cross_validate_score to decide which columns to convert to strings.
catb_params = {    
    "n_estimators" : 10000,
    "learning_rate" : 0.075,
    'cat_features' : categorical_features,
    'task_type': 'GPU',
    'random_strength': 0.3718364180573207,
    'max_bin': 128,
    'depth': 9,
    'l2_leaf_reg': 6,
    'grow_policy': 'SymmetricTree',
    'boosting_type': 'Plain',
    'bootstrap_type': 'Bernoulli',
    'subsample': 0.41936688658110405
}

# CatBoost with the found hyperparameters
catb_tunned = CatBoostClassifier(**catb_params, random_state=random_state)

In [None]:
%%time
# Cross-validate CatBoost; store fold scores, averaged test probabilities and OOF probabilities
cv_summary['catb'], test_preds['catb'], oof_preds['catb'] = cross_validate_score(catb_tunned, train_df , y,  cv, test_df)

### Performance Summary

In [None]:
# Performance summary for base learners: one row per model, one column per fold,
# plus the mean and standard deviation of the fold scores.
transposed_df = cv_summary.transpose()

# Name the fold columns from the actual number of folds instead of assuming five.
fold_columns = [f'fold{i}' for i in range(1, transposed_df.shape[1] + 1)]
transposed_df.columns = fold_columns

# Compute both statistics from the fold columns only, before either is added to
# the frame; otherwise 'Std' would be taken over the folds plus the 'Mean' column.
fold_mean = transposed_df[fold_columns].mean(axis=1)
# ddof=0 matches the np.std value printed by cross_validate_score.
fold_std = transposed_df[fold_columns].std(axis=1, ddof=0)
transposed_df['Mean'] = fold_mean
transposed_df['Std'] = fold_std

transposed_df.sort_values(by = 'Mean', ascending=False).style.background_gradient('Dark2_r')

In [None]:
# Model diversity check: pairwise correlation of the models' out-of-fold probabilities.

sns.set(font_scale=1.1)
correlation_train = oof_preds.corr()

# Explicit boolean mask hiding the upper triangle and the diagonal. The mask
# depends only on the matrix shape, so it is not derived from the correlation
# values themselves.
mask = np.triu(np.ones(correlation_train.shape, dtype=bool))

plt.figure(figsize=(20, 20))
sns.heatmap(correlation_train,
            annot=True,
            fmt='.3f',
            cmap='coolwarm',
            square=True,
            mask=mask,
            linewidths=1,
            cbar=False);

### Voting vs Stacking

#### Voting

In [None]:
#voting hard & soft
def voting_ensemble(oof_preds, y, threshold=0.5, voting_type='soft'):
    """Score a voting ensemble of the base models with the Matthews correlation coefficient.

    Parameters
    ----------
    oof_preds : pandas.DataFrame
        One column of predicted probabilities per model, rows aligned with ``y``.
    y : array-like
        True class labels.
    threshold : float, default 0.5
        Probabilities strictly above this value are turned into class 1.
    voting_type : {'soft', 'hard'}, default 'soft'
        'soft' averages the probabilities across models and thresholds the mean;
        'hard' thresholds each model separately and takes the most frequent label
        per row.

    Returns
    -------
    float
        MCC between ``y`` and the ensemble's class predictions.

    Raises
    ------
    ValueError
        If ``voting_type`` is neither 'soft' nor 'hard'.
    """
    # Reject unknown modes up front instead of failing later on an unassigned local.
    if voting_type not in ('soft', 'hard'):
        raise ValueError(f"voting_type must be 'soft' or 'hard', got {voting_type!r}")

    if voting_type == 'soft':
        ensemble_preds = oof_preds.mean(axis=1)
        ensemble_class_preds = (ensemble_preds > threshold).astype(int)
    else:
        binary_preds = (oof_preds > threshold).astype(int)
        # mode(...)[0] holds the per-row majority label; flatten() gives a 1-D
        # array whether or not scipy keeps the reduced axis.
        ensemble_class_preds = mode(binary_preds, axis=1)[0].flatten()

    mcc_score = matthews_corrcoef(y, ensemble_class_preds)

    return mcc_score

In [None]:
# MCC of the soft-voting ensemble on the out-of-fold probabilities (default 0.5 threshold)
voting_ensemble(oof_preds, y, voting_type='soft')

In [None]:
# MCC of the hard-voting (majority label) ensemble on the out-of-fold probabilities
voting_ensemble(oof_preds, y, voting_type='hard')

#### Stacking

In [None]:
# Parameters for the meta model: an L1-penalised logistic regression fitted
# with the 'saga' solver on the base models' out-of-fold probabilities.
meta_model_params = {
    'C': 0.000237302749626327,
    'max_iter': 2500,
    'tol': 9.996751434702547e-05,
    'solver': 'saga',
    'penalty': 'l1'
}

meta_model = LogisticRegression(**meta_model_params, random_state=random_state)

In [None]:
%%time

# Deciding which models to include in the ensemble: recursive feature elimination
# with cross-validation over the base models' out-of-fold probabilities, where
# each "feature" is one model.

min_features_to_select = 1

# Pipeline: standardise the OOF probabilities, then run RFECV with the meta model
pipeline = Pipeline([
    ('Scaler', StandardScaler()),
    ('rfecv', RFECV(estimator=meta_model,
                    step=1,
                    cv=cv,
                    scoring=make_scorer(matthews_corrcoef),
                    min_features_to_select=min_features_to_select,
                    n_jobs=-1,))
])

# Fit the pipeline on oof_preds
pipeline.fit(oof_preds, y)

rfecv = pipeline.named_steps['rfecv']
selected_models = np.array(oof_preds.columns)[rfecv.support_]

# CV score of the selected subset. With step=1, entry i of cv_results_ belongs to
# (min_features_to_select + i) features, so the offset must be
# min_features_to_select rather than a hard-coded 1.
print("Best CV score: ")
print(rfecv.cv_results_["mean_test_score"][len(selected_models) - min_features_to_select])


# Selected models after RFECV
print('Number of available models:', len(oof_preds.columns))
print('Number of selected models for ensemble:', len(selected_models))
print("Selected models:", selected_models)

In [None]:
# Fit the meta model on the out-of-fold probabilities of the selected base models
# NOTE(review): the inputs are not standardised here, unlike inside the RFECV pipeline — confirm this is intended
meta_model = meta_model.fit(oof_preds[selected_models], y)

In [None]:
# Predict class labels for the test set from the selected models' averaged test probabilities
preds_test =  meta_model.predict(test_preds[selected_models])

In [None]:
# Map the integer predictions back to the original class labels with the fitted LabelEncoder
preds_test = le.inverse_transform(preds_test)

### Submission

In [None]:
# Submission frame: test index as 'id' and the decoded prediction as 'class'
output = pd.DataFrame({'id': test_df.index,
                       'class': preds_test})

In [None]:
# Write the submission file to the working directory, without the frame's own index
output.to_csv('submission.csv', index=False)

In [None]:
# Preview the first rows of the submission
output.head()

In [None]:
# Save the OOF and test predictions (one column per model) for later use.
# index=False: the frames' index is not written to the parquet files.
oof_preds.to_parquet('oof_predictions_v01.parquet', index=False)
test_preds.to_parquet('test_predictions_v01.parquet', index=False)