<a href="https://www.kaggle.com/code/riyosha/mushroom-binary-prediction-eda?scriptVersionId=193049157" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
from matplotlib import rcParams
import seaborn as sns

from sklearn.impute import SimpleImputer

Exploratory Data Analysis

In [None]:
# Read the three competition CSVs from the Kaggle input directory.
data_path = '/kaggle/input/playground-series-s4e8'
train, test, samplesubmission = (
    pd.read_csv(f'{data_path}/{csv_name}')
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# Preview the first rows of the test frame.
test.head()

In [None]:
# Overview of the training frame: dimensions, column names, size of the test
# frame relative to it, and summary statistics for every column.
n_train_rows, n_test_rows = train.shape[0], test.shape[0]
print(train.shape)
print(train.columns)
print('Test to Train ratio: ', n_test_rows / n_train_rows)
print(train.describe(include='all'))

In [None]:
# Preview the first rows of the sample submission frame.
samplesubmission.head()

Missing/Wrong Values

In [None]:
# Count training rows that contain any NaN, then list per-column NaN counts
# for the training and test frames.
rows_with_missing = train.isna().any(axis=1).sum()
print('Rows with at least 1 missing value: ', rows_with_missing)
for frame in (train, test):
    print(frame.isna().sum())
# Author's reading of the output above:
# - almost every row has at least one missing value;
# - stem-root, stem-surface, veil-type, veil-color and spore-print-color are
#   more than 50% missing in both the train and test sets.
# Those five features are dropped, assuming the values are missing completely
# at random (MCAR) or missing at random (MAR).

In [None]:
# Remove the five features the author chose to drop from the training frame.
sparse_columns = ['stem-root', 'stem-surface', 'veil-type', 'veil-color', 'spore-print-color']
train = train.drop(columns=sparse_columns)

In [None]:
# Print the distinct values found in each training column.
for name, values in train.items():
    print(name, ':', values.unique())

# Author's observation: many categorical columns contain entries that do not
# make sense (numbers or phrases); these get replaced with 'missing'.

In [None]:
import re
# function to deal with missing and nonsensical values
def clean_cats(string):
    """Reduce a raw categorical entry to a single-letter code.

    The entry is split on whitespace and every non-letter character is
    removed from each token. A token that is exactly one letter long after
    that stripping is a valid code; if several tokens qualify, the last one
    is returned.

    Args:
        string: The raw cell value. Anything that is not a ``str`` (NaN,
            ``None``, numbers, containers) is treated as missing.

    Returns:
        The single-letter code, or the literal ``'missing'`` when the input
        is not text or contains no token that reduces to one letter.
    """
    # One isinstance check covers NaN/None and stray numeric cells alike, and
    # (unlike an exact type comparison) still accepts str subclasses.
    if not isinstance(string, str):
        return 'missing'

    stripped_tokens = (re.sub(r'[^a-zA-Z]', '', token) for token in string.split())
    codes = [letters for letters in stripped_tokens if len(letters) == 1]

    # The last qualifying token wins when more than one is present.
    return codes[-1] if codes else 'missing'


In [None]:
# Pass every object-dtype column of the training frame through clean_cats.
object_columns = [name for name in train.columns if train[name].dtype == object]
for name in object_columns:
    train[name] = train[name].apply(clean_cats)

In [None]:
# Recount NaNs in the training frame after the categorical cleaning step.
rows_with_missing = train.isna().any(axis=1).sum()
print('Rows with at least 1 missing value: ', rows_with_missing)
print(train.isna().sum())


In [None]:
# Discard rows that still contain NaN, then coerce the id and the three
# measurement columns to numeric dtypes.
train = train.dropna()

for numeric_col in ('id', 'cap-diameter', 'stem-height', 'stem-width'):
    train[numeric_col] = pd.to_numeric(train[numeric_col])

I've chosen not to impute or replace missing values, as my model seemed to perform marginally better when it simply used the 'missing' category instead.

Finding significant features

In [None]:
from scipy.stats import chi2_contingency

# finds if feature is related to class
def significant_features(df, target='class',alpha=0.05):
    """Screen object-dtype columns with a chi-square test against ``target``.

    For every column other than ``target`` whose dtype is ``object``, a
    contingency table of ``target`` versus that column is built and passed to
    ``chi2_contingency``. Columns whose p-value is below ``alpha`` are
    reported on stdout and collected.

    Args:
        df: Frame holding the target column and the candidate features.
        target: Name of the target column.
        alpha: Significance level; a column is kept when ``p < alpha``.

    Returns:
        Dict mapping each retained column name to its p-value, in column
        order.
    """
    p_values = {}
    candidates = [c for c in df.columns if c != target and df[c].dtype == 'object']
    for col in candidates:
        table = pd.crosstab(df[target], df[col])
        # Index 1 of the chi2_contingency result is the p-value.
        p = chi2_contingency(table)[1]
        if not p < alpha:
            continue
        p_values[col] = p
        print(f'{col} has p-value {p}')
    return p_values

In [None]:
# Keep the categorical columns that passed the chi-square screen, plus the
# three numeric measurements and the target.
sig_features = list(significant_features(train).keys()) + ['cap-diameter', 'stem-width', 'stem-height', 'class']
# Take an explicit copy: this frame has columns assigned to it later, and
# assigning into a selection of `train` triggers pandas' SettingWithCopyWarning.
processed_train = train[sig_features].copy()


Rare Categories

In [None]:
# Many categories in the columns barely have any data; these will be treated as noise.
# Plot the frequency of every category, one bar chart per categorical column.

for col in processed_train.columns:
    col_dtype = processed_train[col].dtype
    # The text columns are still object dtype here (they are only converted to
    # 'category' further down), so a check for 'category' alone would skip every
    # column. Accept both so the cell also works when re-run after conversion.
    if col_dtype == object or isinstance(col_dtype, pd.CategoricalDtype):
        counts = processed_train[col].value_counts().sort_values(ascending=False)
        plt.figure(figsize=(6,3))
        sns.barplot(x=counts.index, y=counts.values)
        plt.title(f'{col} Frequencies')
        plt.xlabel(col)
        plt.ylabel('Frequency')
        plt.show()

In [None]:
#this will return the categories that contain 98% of the entire data. least frequent categories will be removed
def main_categories(df,col,threshold=0.98):
    n = len(df[col])
    counts=df[col].value_counts().sort_values()
    counts =pd.DataFrame({'Category':counts.index,'Frequency':counts.values})
    counts['Proportion']=counts['Frequency']/n
    counts = counts.sort_values(by='Proportion', ascending=True).reset_index(drop=True)
    counts['Cumulative_Proportion'] = counts['Proportion'].cumsum()
    
    return counts[counts['Proportion'] > 1-threshold]['Category'].to_list()

In [None]:
# For every object-dtype column: record the labels to keep (the frequent
# categories plus 'missing'), relabel every other value as 'noise', and store
# the column with categorical dtype.
main_categs={}
for col in processed_train.columns:
    if processed_train[col].dtype!='object':
        continue
    main_cats = main_categories(processed_train,col)+['missing']
    main_categs[col]=main_cats
    relabelled = processed_train[col].where(processed_train[col].isin(main_cats), 'noise')
    processed_train[col]=relabelled.astype('category')

In [None]:
# Print the distinct values left in each column after relabelling.
for name, values in processed_train.items():
    print(name, ':', values.unique())

Building the model - XGBoost (to be updated)

In [None]:
import optuna
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from sklearn.metrics import roc_auc_score, roc_curve, auc, classification_report, confusion_matrix, accuracy_score,matthews_corrcoef

In [None]:
# Separate the predictors from the target column and report their shapes.
X, Y = processed_train.drop(columns=['class']), processed_train['class']
print(X.shape,Y.shape)

# Encode the target labels as integers; the fitted encoder is kept in
# `label_encoder`.
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder().fit(Y)
Y = label_encoder.transform(Y)

# Hold out 20% of the rows for validation, with a fixed seed.
split = train_test_split(X, Y, test_size=0.2, random_state=42)
X_train, X_val, Y_train, Y_val = split

In [None]:
def objective(trial):
    """Optuna objective: fit an XGBClassifier and score it with MCC.

    Hyperparameters are drawn from ``trial``, the classifier is fitted on the
    module-level ``X_train``/``Y_train`` and evaluated on ``X_val``/``Y_val``.
    The score is also stored on the trial as the user attribute ``"mcc"``.

    Args:
        trial: The Optuna trial used to sample hyperparameters.

    Returns:
        The Matthews correlation coefficient on the validation split.
    """
    # Note: the regularisation terms are registered with Optuna under the
    # names 'lambda' and 'alpha' but passed to XGBoost as reg_lambda/reg_alpha.
    search_space = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 300),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'grow_policy':'depthwise',
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'gamma': trial.suggest_float('gamma', 1e-8, 1.0, log=True),
        'reg_lambda': trial.suggest_float('lambda', 1e-8, 10.0, log=True),
        'reg_alpha': trial.suggest_float('alpha', 1e-8, 10.0, log=True),
        'scale_pos_weight': trial.suggest_float('scale_pos_weight', 1.0, 10.0),
        'enable_categorical': True
    }

    classifier = XGBClassifier(use_label_encoder=False, eval_metric='logloss', **search_space)
    classifier.fit(X_train, Y_train)

    score = matthews_corrcoef(Y_val, classifier.predict(X_val))
    trial.set_user_attr("mcc", score)
    return score

# Optuna callback that reports each trial's MCC next to the best value so far
def print_mcc_callback(study, trial):
    """Print the trial's stored MCC, the study's best value, and the trial params.

    Args:
        study: Object exposing ``best_value``.
        trial: Object exposing ``number``, ``params`` and a ``user_attrs``
            mapping that contains the key ``"mcc"``.
    """
    trial_mcc = trial.user_attrs["mcc"]
    best_mcc = study.best_value
    print(f"Trial {trial.number}: MCC = {trial_mcc:.5f}, Best MCC = {best_mcc:.5f}")
    print(f"Parameters: {trial.params}")


In [None]:
# The hyperparameter search below is disabled: it is wrapped in a string
# literal, so none of it executes. If enabled, it would run `objective` for
# n_trials Optuna trials with a tqdm progress bar and print the best parameters.
'''n_trials = 100
progress_bar = tqdm(total=n_trials)

# Define a custom callback to update the progress bar
def progress_bar_callback(study, trial):
    progress_bar.update(1)

# Example usage: Running the optimization with 100 trials
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=n_trials, callbacks=[print_mcc_callback, progress_bar_callback])

# Close the progress bar
progress_bar.close()

best_params = study.best_params
print(f"Best parameters: {best_params}")'''

In [None]:
# Hard-coded XGBoost hyperparameters.
# NOTE(review): n_estimators and max_depth lie outside the ranges searched in
# `objective`, so these presumably come from a different search - confirm.
parameters = {
    'n_estimators': 432,
    'max_depth': 18,
    'learning_rate': 0.019177494166556952,
    'subsample': 0.6944494028059239,
    'colsample_bytree': 0.5177980824894136,
    'gamma': 0.0004342336537981622,
    'lambda': 1.3527652792856453e-06,
    'alpha': 4.10797226500692e-08,
    'scale_pos_weight': 1.0121323580230017,
    'enable_categorical': True,
}

In [None]:
# This is a placeholder configuration.
# NOTE: it rebinds `parameters`, so when the cells run top to bottom these
# values replace the dictionary assigned to `parameters` earlier.
parameters = {
    'n_estimators': 100,  # Fewer trees
    'max_depth': 5,       # Simpler model
    'learning_rate': 0.1, 
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'gamma': 0.1,
    'lambda': 1.0,
    'alpha': 1.0,
    'scale_pos_weight': 1.0,
    'enable_categorical': True
}

In [None]:
# Build the classifier from `parameters` and fit it on every labelled row
# (X, Y), i.e. the training and validation splits together.
model = XGBClassifier(**parameters).fit(X, Y)

In [None]:
# `model` was fitted on all of X, which contains the X_val rows, so scoring it
# on X_val would report a leaked, over-optimistic MCC. Fit a separate model on
# the training split only and score that one on the held-out rows instead.
validation_model = XGBClassifier(**parameters).fit(X_train, Y_train)
Y_pred = validation_model.predict(X_val)
mcc = matthews_corrcoef(Y_val, Y_pred)
print(f"Matthews Correlation Coefficient: {mcc}")

In [None]:
# Now preprocess the test frame the same way as the training frame.
for numeric_col in ('cap-diameter', 'stem-height', 'stem-width'):
    test[numeric_col] = pd.to_numeric(test[numeric_col])

# The test frame has no target column. The membership guard keeps this cell
# re-runnable: an unconditional remove() raises ValueError the second time.
if 'class' in sig_features:
    sig_features.remove('class')
# Explicit copy so the column assignments below act on an independent frame
# rather than on a selection of `test`.
processed_test = test[sig_features].copy()


for col in processed_test.columns:
    if processed_test[col].dtype == object:
        cleaned = processed_test[col].apply(clean_cats)
        main_cats = main_categs[col]
        cleaned = cleaned.where(cleaned.isin(main_cats), 'noise')
        # Build the categorical from the TEST values (the original assigned the
        # training column here) and reuse the training column's categories so
        # the category codes line up with what the model was fitted on. A label
        # absent from the training categories becomes NaN.
        processed_test[col] = pd.Categorical(
            cleaned, categories=processed_train[col].cat.categories
        )
     

In [None]:
# Detach the 'id' column from `test` and keep it for the submission file.
# NOTE(review): the name `id` shadows the Python builtin, and pop() removes the
# column in place, so running this cell a second time raises KeyError.
id = test.pop('id')

In [None]:
# Make predictions on the test data.
# predict() returns hard 0/1 labels, on which a 0.502 cut-off has no effect.
# Take the probability of the encoded class 1 so the threshold actually applies.
y_test_pred = model.predict_proba(processed_test)[:, 1]
y_test_pred_binary = (y_test_pred > 0.502).astype(int)  # Convert probabilities to binary outcomes

# Create the submission DataFrame, decoding the integer predictions with the
# encoder fitted on the training target instead of a hard-coded 1 -> 'p',
# 0 -> 'e' mapping.
submission_df = pd.DataFrame({
    'id': id,
    'class': label_encoder.inverse_transform(y_test_pred_binary)
})

# Save the submission DataFrame to a CSV file and report the name actually written.
submission_path = 'XGboost_model5.1_submission.csv'
submission_df.to_csv(submission_path, index=False)
print(f"Submission file created: {submission_path}")


LightGBM Boosting