In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for input_path in (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
):
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the zipped train and test CSV files into DataFrames.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/ghouls-goblins-and-ghosts-boo/train.csv.zip",
        "../input/ghouls-goblins-and-ghosts-boo/test.csv.zip",
    )
)

In [None]:
# Preview the first five training rows.
df_train.head(5)

In [None]:
# Summary statistics of the training frame, transposed so each described column is a row.
df_train.describe().transpose()

In [None]:
# Summary statistics of the test frame, transposed so each described column is a row.
df_test.describe().transpose()

# Short **EDA**

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Explore the continuous features. Each figure row belongs to one feature and shows,
# left to right: feature vs. type as a scatter plot, feature vs. type as a 2-D
# histogram, and the histogram of the feature on its own.
continuous_cols = ["bone_length", "rotting_flesh", "hair_length", "has_soul"]
fig, axes = plt.subplots(nrows=4, ncols=3, figsize=(30, 40))

for col, (ax_scatter, ax_joint, ax_marginal) in zip(continuous_cols, axes):
    sns.scatterplot(data=df_train, x=col, y="type", ax=ax_scatter)
    sns.histplot(data=df_train, x=col, y="type", ax=ax_joint)
    sns.histplot(data=df_train, x=col, ax=ax_marginal)

> From the plots, the classes look even easier to tell apart when features are combined, for example:
> 1. **has_soul * hair_length**
> 2. **has_soul + hair_length**
> 3. **has_soul * bone_length**
> 4. **has_soul + bone_length**
> 5. etc.

In [None]:
# Explore the single categorical feature, 'color':
# left panel is color vs. type as a 2-D histogram (with color bar), right panel is the color counts.
fig, (ax_joint, ax_counts) = plt.subplots(nrows=1, ncols=2, figsize=(20, 10))
sns.histplot(data=df_train, x='color', y="type", cbar=True, ax=ax_joint)
sns.histplot(data=df_train, x='color', ax=ax_counts)

> **color** looks essentially random and noisy with respect to **type**. Maybe it should be dropped later.

# **Preprocessing** and **Feature Engineering**

In [None]:
# Remove 'id' and 'color' from both frames.
# The frames are rebound instead of being mutated with inplace=True, and
# errors="ignore" skips columns that are already gone, so re-running this cell
# is a no-op rather than a KeyError.
unused_columns = ["id", "color"]
df_train = df_train.drop(columns=unused_columns, errors="ignore")
df_test = df_test.drop(columns=unused_columns, errors="ignore")

In [None]:
# Encode the string target 'type' as integer class ids.
# The fitted encoder `le` is kept so predictions can be mapped back to names later.
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
encoded_type = le.fit_transform(df_train['type'])
df_train['type'] = encoded_type
df_train.head()

In [None]:
# Enrich both frames with the sum and the product of every pair of base features.
# combinations_with_replacement also yields each feature paired with itself.
from itertools import combinations_with_replacement

base_features = ["bone_length", "rotting_flesh", "hair_length", "has_soul"]

for frame in (df_train, df_test):
    for left, right in combinations_with_replacement(base_features, 2):
        frame[f"{left} + {right}"] = frame[left] + frame[right]
        frame[f"{left} * {right}"] = frame[left] * frame[right]

df_train.head()

In [None]:
# Cluster the engineered features and add the assigned cluster id as a new column.
from sklearn.cluster import KMeans

# Fit on the feature columns only. Leaving out a previously added 'cluster' column
# keeps the cell idempotent on re-run, and indexing df_test with the same list
# guarantees that train and test are passed with identical column order.
cluster_features = [col for col in df_train.columns if col not in ("type", "cluster")]

# random_state makes the cluster assignment reproducible between runs; n_init is
# spelled out so the result does not depend on a version-specific library default.
clusterizer = KMeans(n_clusters=6, n_init=10, random_state=42)
clusterizer.fit(df_train[cluster_features])

df_train['cluster'] = clusterizer.predict(df_train[cluster_features])
df_test['cluster'] = clusterizer.predict(df_test[cluster_features])

df_train.head()

In [None]:
# Inspect the new 'cluster' feature:
# left panel is cluster vs. type as a 2-D histogram (with color bar), right panel is the cluster sizes.
fig, (ax_joint, ax_counts) = plt.subplots(nrows=1, ncols=2, figsize=(20, 10))
sns.histplot(data=df_train, x='cluster', y="type", cbar=True, ax=ax_joint)
sns.histplot(data=df_train, x='cluster', ax=ax_counts)

> As the plot shows, the "**cluster**" feature is clearly useful.

# Now, let's get to **Modeling**

In [None]:
import warnings

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

from sklearn.metrics import f1_score, roc_auc_score, precision_score, recall_score, accuracy_score
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis, LinearDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import make_scorer
from sklearn.decomposition import PCA
from sklearn.svm import SVC

In [None]:
# Separate features from the target, then hold out 25% of the labelled rows
# for evaluation (fixed seed, so the split is the same on every run).
X = df_train.drop(columns=['type'])
y = df_train['type']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Display the shapes of the four pieces.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

In [None]:
# Compress the hold-out features to their first principal component; the flattened
# 1-D coordinate is used as the x-axis of the diagnostic scatter plots.
# The cell displays the share of variance explained by that single component.
reducer = PCA(n_components=1)
X_truncated = reducer.fit_transform(X_test).ravel()
reducer.explained_variance_ratio_.sum()

In [None]:
def model_metrics(y_true, y_predicted, y_predicted_proba):
    """Print accuracy, macro precision/recall/F1 and one-vs-rest ROC AUC.

    Args:
        y_true: Ground-truth class labels.
        y_predicted: Predicted class labels, compared against ``y_true``.
        y_predicted_proba: Scores handed to ``roc_auc_score`` for the AUC line
            (presumably the output of ``predict_proba`` - verify at the call site).

    Returns:
        None. The five metrics are printed, one per line.
    """
    # Every metric is wrapped in a callable so that it is computed right before
    # its own line is printed, in the listed order.
    report = (
        ("Accuracy", lambda: accuracy_score(y_true, y_predicted)),
        ("Precision", lambda: precision_score(y_true, y_predicted, average='macro')),
        ("Recall", lambda: recall_score(y_true, y_predicted, average='macro')),
        ("F1 Score", lambda: f1_score(y_true, y_predicted, average='macro')),
        ("AUC Score", lambda: roc_auc_score(y_true, y_predicted_proba, multi_class='ovr')),
    )
    for label, compute in report:
        print(f"{label} : {compute()}")

def scatter_predicted_vs_true(x, y_true, y_predicted):
    """Overlay true and predicted labels against ``x`` in one scatter plot.

    Args:
        x: Horizontal coordinate of every sample.
        y_true: Ground-truth labels, drawn as green dots.
        y_predicted: Predicted labels, drawn as red crosses.

    Returns:
        None. The figure is rendered with ``plt.show()``.
    """
    fig, ax = plt.subplots()
    ax.scatter(x, y_true, c='green', label='true')
    # Crosses rather than dots: a correct prediction lands exactly on the true
    # point, and a second dot drawn on top would hide it completely.
    ax.scatter(x, y_predicted, c='red', marker='x', label='predicted')
    # The legend explains what the two colors mean.
    ax.legend()
    plt.show()

def train_model(model_type, **kwargs):
    """Build, fit and evaluate a single classifier.

    The estimator is fitted on the notebook-level ``X_train``/``y_train`` and its
    metrics on ``X_test``/``y_test`` are printed through ``model_metrics``.

    Args:
        model_type: Callable (typically an estimator class) that builds the model.
        **kwargs: Forwarded unchanged to ``model_type``.

    Returns:
        The fitted estimator.
    """
    estimator = model_type(**kwargs)
    estimator.fit(X_train, y_train)

    # The text before the first "(" of the estimator's string form is used as its display name.
    display_name = str(estimator).split('(')[0]
    print(display_name)

    hard_predictions = estimator.predict(X_test)
    class_scores = estimator.predict_proba(X_test)
    model_metrics(y_test, hard_predictions, class_scores)
    return estimator

In [None]:
# Baseline: SGDClassifier with the logistic loss, then true vs. predicted on the hold-out set.
# NOTE(review): newer scikit-learn releases spell this loss "log_loss" - confirm against the installed version.
sgd_model = train_model(SGDClassifier, loss="log")
scatter_predicted_vs_true(x=X_truncated, y_true=y_test, y_predicted=sgd_model.predict(X_test))

In [None]:
# Baseline: RandomForestClassifier with default settings, then true vs. predicted on the hold-out set.
rrf_model = train_model(RandomForestClassifier)
scatter_predicted_vs_true(x=X_truncated, y_true=y_test, y_predicted=rrf_model.predict(X_test))

In [None]:
# Baseline: LogisticRegression with default settings, then true vs. predicted on the hold-out set.
log_model = train_model(LogisticRegression)
scatter_predicted_vs_true(x=X_truncated, y_true=y_test, y_predicted=log_model.predict(X_test))

In [None]:
# Baseline: SVC; probability=True is required because train_model calls predict_proba.
svm_model = train_model(SVC, probability=True)
scatter_predicted_vs_true(x=X_truncated, y_true=y_test, y_predicted=svm_model.predict(X_test))

In [None]:
# Baseline: LinearDiscriminantAnalysis with default settings, then true vs. predicted on the hold-out set.
lindisc_model = train_model(LinearDiscriminantAnalysis)
scatter_predicted_vs_true(x=X_truncated, y_true=y_test, y_predicted=lindisc_model.predict(X_test))

In [None]:
# Baseline: QuadraticDiscriminantAnalysis with default settings, then true vs. predicted on the hold-out set.
quaddisc_model = train_model(QuadraticDiscriminantAnalysis)
scatter_predicted_vs_true(x=X_truncated, y_true=y_test, y_predicted=quaddisc_model.predict(X_test))

In [None]:
# Baseline: XGBClassifier evaluated with log loss, then true vs. predicted on the hold-out set.
xgb_model = train_model(XGBClassifier, eval_metric="logloss")
scatter_predicted_vs_true(x=X_truncated, y_true=y_test, y_predicted=xgb_model.predict(X_test))

In [None]:
# Baseline: LGBMClassifier with default settings, then true vs. predicted on the hold-out set.
lgbm_model = train_model(LGBMClassifier)
scatter_predicted_vs_true(x=X_truncated, y_true=y_test, y_predicted=lgbm_model.predict(X_test))

> Now I will try to tune the hyperparameters of **XGBClassifier**, **LGBMClassifier**, **RandomForestClassifier**, **LogisticRegression** and **LinearDiscriminantAnalysis**, and then combine the tuned models with a **VotingClassifier**.

In [None]:
# Hyperparameter search spaces for GridSearchCV, one dict per estimator.

rrf_param_grid = {
    'max_depth': [5, 10, 15],
    # 'auto' is left out on purpose: for RandomForestClassifier it is an alias of
    # 'sqrt' (every combination was fitted twice) and newer scikit-learn releases
    # reject it. 'log2' is searched instead as a genuinely different option.
    'max_features': ['sqrt', 'log2'],
    'min_samples_leaf': [4, 8, 12],
    'min_samples_split': [5, 10, 15],
    'n_estimators': [600, 800, 1000],
}

lgbm_param_grid = {
    "n_estimators": [100, 200, 400, 600, 1000],
    "learning_rate": [0.01, 0.1, 0.5, 1],
    "num_leaves": [5, 10, 15, 20],
    "max_depth": [3, 6, 9, 12],
    "reg_lambda": [0.5, 1, 3, 5],
    "min_split_gain": [3, 6, 9],
}

xgb_param_grid = {
    "n_estimators": [100, 250, 500, 1000],
    'gamma': [0.5, 1, 2, 5],
    'max_depth': [3, 6, 12],
    "reg_lambda": [1, 3, 5],
    "reg_alpha": [1, 3, 5],
}

log_param_grid = {
    # LogisticRegression's default 'lbfgs' solver cannot fit an 'l1' penalty, so
    # every 'l1' candidate failed to fit. 'saga' supports both penalties.
    "solver": ["saga"],
    "penalty": ["l1", "l2"],
    "tol": [0.0001, 0.0002, 0.0003],
    "max_iter": [100, 300, 500],
    "C": [0.01, 0.1, 1, 10, 100],
}

lindisc_param_grid = {
    "solver": ['svd', 'lsqr', 'eigen'],
    "tol": [0.0001, 0.0002, 0.0003],
}

In [None]:
# Hard-coded hyperparameters for the final estimators (presumably copied from an
# earlier grid-search run - verify). While a dict here is non-empty, the matching
# grid-search cell below is skipped.
optimal_lgbm_params = dict(
    learning_rate=1, max_depth=3, min_split_gain=3, n_estimators=100, num_leaves=5, reg_lambda=5,
)
optimal_rrf_params = dict(
    max_depth=10, max_features='sqrt', min_samples_leaf=8, min_samples_split=5, n_estimators=800,
)
optimal_lindisc_params = dict(solver='svd', tol=0.0001)
optimal_xgb_params = dict(gamma=1, max_depth=12, n_estimators=100, reg_alpha=3, reg_lambda=3)
optimal_log_params = dict(C=1, max_iter=150, penalty='l2', tol=0.0001)

In [None]:
# Grid search for RandomForestClassifier; runs only while optimal_rrf_params is empty.
if not optimal_rrf_params:
    rrf_grid = GridSearchCV(
        estimator=RandomForestClassifier(),
        param_grid=rrf_param_grid,
        cv=5,
        verbose=1,
    )
    rrf_grid.fit(X_train, y_train)

    # Hold-out metrics of the tuned search, followed by those of the earlier rrf_model.
    print("RandomForestClassifier with Grid")
    model_metrics(
        y_true=y_test,
        y_predicted=rrf_grid.predict(X_test),
        y_predicted_proba=rrf_grid.predict_proba(X_test),
    )
    print()
    print("RandomForestClassifier with defaults")
    model_metrics(
        y_true=y_test,
        y_predicted=rrf_model.predict(X_test),
        y_predicted_proba=rrf_model.predict_proba(X_test),
    )
    scatter_predicted_vs_true(x=X_truncated, y_true=y_test, y_predicted=rrf_grid.predict(X_test))
    print(rrf_grid.best_params_)

In [None]:
# Grid search for LGBMClassifier; runs only while optimal_lgbm_params is empty.
if not optimal_lgbm_params:
    lgbm_grid = GridSearchCV(
        estimator=LGBMClassifier(),
        param_grid=lgbm_param_grid,
        cv=5,
        verbose=1,
    )
    lgbm_grid.fit(X_train, y_train)

    # Hold-out metrics of the tuned search, followed by those of the earlier lgbm_model.
    print("LGBMClassifier with Grid")
    model_metrics(
        y_true=y_test,
        y_predicted=lgbm_grid.predict(X_test),
        y_predicted_proba=lgbm_grid.predict_proba(X_test),
    )
    print()
    print("LGBMClassifier with defaults")
    model_metrics(
        y_true=y_test,
        y_predicted=lgbm_model.predict(X_test),
        y_predicted_proba=lgbm_model.predict_proba(X_test),
    )
    scatter_predicted_vs_true(x=X_truncated, y_true=y_test, y_predicted=lgbm_grid.predict(X_test))
    print(lgbm_grid.best_params_)

In [None]:
# Grid search for XGBClassifier; runs only while optimal_xgb_params is empty.
# NOTE(review): tree_method='gpu_hist' presumably needs a GPU runtime - confirm before enabling.
if not optimal_xgb_params:
    xgb_grid = GridSearchCV(
        estimator=XGBClassifier(tree_method='gpu_hist', eval_metric='mlogloss'),
        param_grid=xgb_param_grid,
        cv=5,
        verbose=1,
    )
    # Warnings raised while fitting are silenced for this call only.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        xgb_grid.fit(X_train, y_train)

    # Hold-out metrics of the tuned search, followed by those of the earlier xgb_model.
    print("XGBClassifier with Grid")
    model_metrics(
        y_true=y_test,
        y_predicted=xgb_grid.predict(X_test),
        y_predicted_proba=xgb_grid.predict_proba(X_test),
    )
    print()
    print("XGBClassifier with defaults")
    model_metrics(
        y_true=y_test,
        y_predicted=xgb_model.predict(X_test),
        y_predicted_proba=xgb_model.predict_proba(X_test),
    )
    scatter_predicted_vs_true(x=X_truncated, y_true=y_test, y_predicted=xgb_grid.predict(X_test))
    print(xgb_grid.best_params_)

In [None]:
# Grid search for LogisticRegression; runs only while optimal_log_params is empty.
if not optimal_log_params:
    log_grid = GridSearchCV(
        estimator=LogisticRegression(),
        param_grid=log_param_grid,
        cv=5,
        verbose=1,
    )
    # Warnings raised while fitting are silenced for this call only.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        log_grid.fit(X_train, y_train)

    # Hold-out metrics of the tuned search, followed by those of the earlier log_model.
    print("LogisticRegression with Grid")
    model_metrics(
        y_true=y_test,
        y_predicted=log_grid.predict(X_test),
        y_predicted_proba=log_grid.predict_proba(X_test),
    )
    print()
    print("LogisticRegression with defaults")
    model_metrics(
        y_true=y_test,
        y_predicted=log_model.predict(X_test),
        y_predicted_proba=log_model.predict_proba(X_test),
    )
    scatter_predicted_vs_true(x=X_truncated, y_true=y_test, y_predicted=log_grid.predict(X_test))
    print(log_grid.best_params_)

In [None]:
# Grid search for LinearDiscriminantAnalysis; runs only while optimal_lindisc_params is empty.
if not optimal_lindisc_params:
    lindisc_grid = GridSearchCV(LinearDiscriminantAnalysis(), param_grid=lindisc_param_grid, cv=5, verbose=1)
    # Warnings raised while fitting are silenced for this call only.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        lindisc_grid.fit(X_train, y_train)

    # The labels now name the estimator that is actually evaluated here; they
    # previously said "LogisticRegression", a leftover from the copied cell.
    print("LinearDiscriminantAnalysis with Grid")
    model_metrics(y_test, lindisc_grid.predict(X_test), lindisc_grid.predict_proba(X_test))
    print()
    print("LinearDiscriminantAnalysis with defaults")
    model_metrics(y_test, lindisc_model.predict(X_test), lindisc_model.predict_proba(X_test))
    scatter_predicted_vs_true(X_truncated, y_test, lindisc_grid.predict(X_test))
    print(lindisc_grid.best_params_)

In [None]:
# Build (still unfitted) estimators from the chosen hyperparameters; they are the
# members of the voting ensemble.
# The randomized estimators get a fixed seed so the ensemble, its hold-out score
# and the submission are reproducible from run to run. 42 matches the seed
# already used for train_test_split.
rrf_model = RandomForestClassifier(**optimal_rrf_params, random_state=42)
lda_model = LinearDiscriminantAnalysis(**optimal_lindisc_params)
lgbm_model = LGBMClassifier(**optimal_lgbm_params, random_state=42)
xgb_model = XGBClassifier(**optimal_xgb_params, eval_metric="mlogloss", random_state=42)
log_model = LogisticRegression(**optimal_log_params)

In [None]:
# Hard-voting ensemble over the five tuned estimators, fitted on the training
# split; the cell displays its score on the hold-out split.
ensemble_members = [
    ('log', log_model),
    ('rrf', rrf_model),
    ('lda', lda_model),
    ('lgbm', lgbm_model),
    ('xgb', xgb_model),
]
voting_classifier = VotingClassifier(estimators=ensemble_members, voting='hard')
voting_classifier.fit(X_train, y_train)
voting_classifier.score(X_test, y_test)

# **Submissions**

In [None]:
# Predict on the competition test set and map the encoded classes back to their names.
preds = le.inverse_transform(voting_classifier.predict(df_test))

# Take the ids from the same file the test features were loaded from, so each
# prediction is paired with its own row by construction rather than relying on
# a separate file listing the ids in the same order.
ids = pd.read_csv("../input/ghouls-goblins-and-ghosts-boo/test.csv.zip", usecols=["id"])["id"]
assert len(ids) == len(preds), "number of ids and number of predictions differ"

pd.DataFrame({"id": ids, "type": preds}).set_index("id").to_csv("predictions.csv")

# A **DNN** approach should be tried
# Other combinations of classifiers should be tried as well