In [None]:
!pip install fasteda
!pip install opentsne

In [None]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from lightgbm import LGBMRegressor, LGBMClassifier, log_evaluation, early_stopping
from catboost import CatBoostRegressor, CatBoostClassifier
from xgboost import XGBRegressor, XGBClassifier
from sklearn.model_selection import cross_val_score, KFold, StratifiedKFold, GroupKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR, SVC
from sklearn.ensemble import AdaBoostRegressor, VotingClassifier, ExtraTreesClassifier, HistGradientBoostingClassifier, GradientBoostingClassifier, RandomForestClassifier
from sklearn.neural_network import MLPRegressor, MLPClassifier
from tqdm import tqdm
from colorama import Fore, Back, Style

from sklearn.metrics import log_loss
pd.set_option('display.max_columns', 500)

from fasteda import fast_eda
from openTSNE import TSNE

In [None]:
# Read every input table in one pass. The unpacking order matches the path
# order: competition train, competition test, the original cirrhosis dataset,
# and the sample submission.
train, test, original, sub = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/playground-series-s3e26/train.csv",
        "/kaggle/input/playground-series-s3e26/test.csv",
        "/kaggle/input/cirrhosis-patient-survival-prediction/cirrhosis.csv",
        "/kaggle/input/playground-series-s3e26/sample_submission.csv",
    )
)

In [None]:
# Tag each frame with its provenance: 1 for the competition (generated) data,
# 0 for the original dataset.
for frame, generated_flag in ((train, 1), (test, 1), (original, 0)):
    frame["is_generated"] = generated_flag

In [None]:
# ANSI escape sequences (from colorama) used to highlight fold scores in the
# training log.
RESET_TXT = Style.RESET_ALL
BOLD_TXT = Style.BRIGHT
GREEN_TXT = f"{BOLD_TXT}{Fore.GREEN}"

In [None]:
def preprocess(df):
    """Encode the categorical clinical columns of ``df`` as integers, in place.

    Encodings applied:
      * ``Drug``: Placebo -> 0, D-penicillamine -> 1
      * ``Sex``: M -> 0, F -> 1
      * ``Ascites`` / ``Hepatomegaly`` / ``Spiders``: N -> 0, Y -> 1
      * ``Edema``: N -> 0, S -> 1, Y -> 1 (``S`` and ``Y`` share one code)

    Values that are not listed in a mapping (including missing values) become
    NaN, as with a plain ``Series.map``.

    The function is idempotent: a column that is already numeric is left
    untouched, so re-running the cell does not turn the encoded values into
    NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the six columns listed above. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned for convenience.
    """
    yes_no = {"N": 0, "Y": 1}
    encodings = {
        "Drug": {"Placebo": 0, "D-penicillamine": 1},
        "Sex": {"M": 0, "F": 1},
        "Ascites": yes_no,
        "Hepatomegaly": yes_no,
        "Spiders": yes_no,
        "Edema": {"N": 0, "S": 1, "Y": 1},
    }

    for col, mapping in encodings.items():
        # Already encoded (or numeric to begin with): mapping again would
        # replace every value with NaN, so skip the column.
        if pd.api.types.is_numeric_dtype(df[col]):
            continue
        df[col] = df[col].map(mapping)

    return df

def scale_data(df):
    """Standardise the ``FEATURES`` columns of ``df`` in place and return it.

    A fresh ``StandardScaler`` is fitted on the frame that is passed in, so
    each call scales with that frame's own statistics. ``FEATURES`` is read
    from the notebook's global namespace at call time.
    """
    df[FEATURES] = StandardScaler().fit_transform(df[FEATURES])
    return df

In [None]:
# `preprocess` encodes the columns in place, so its return value is not needed.
# Keep the loop variable named `df`: it stays defined after the loop, and
# renaming it could break any later cell that reads that name.
# Feature scaling via `scale_data` is currently not applied.
for df in (train, test, original):
    preprocess(df)

In [None]:
# Name of the label column, and the model inputs: every train column except
# the row id and the label itself.
TARGET = "Status"
FEATURES = train.columns.drop(["id", TARGET], errors = "ignore").tolist()

FEATURES

In [None]:
# Encode the outcome labels as integers (D -> 0, CL -> 1, C -> 2) in both
# labelled frames; the test frame is not touched here.
STATUS_CODES = {"D": 0, "CL": 1, "C": 2}
for labelled in (train, original):
    labelled[TARGET] = labelled[TARGET].map(STATUS_CODES)

In [None]:
# Columns fed to t-SNE: every test column except the row id and the
# provenance flag.
tsne_frame = test.drop(columns = ["id", "is_generated"])
TSNE_FEATURES = tsne_frame.columns.tolist()

# Plain NumPy arrays: the t-SNE input matrix and the labels used to colour it.
y_tsne = train["Status"].values
train_tsne = train[TSNE_FEATURES].values

In [None]:
# Run fasteda's exploratory-data-analysis report on the training frame,
# passing the label column as the target.
fast_eda(train, target = TARGET)

In [None]:
%%time
plot_tsne = TSNE().fit(train_tsne)

In [None]:
plt.style.use("dark_background")
fig, ax = plt.subplots(figsize = (12, 10))

# Draw the points labelled 1 larger than the rest so they stand out.
marker_sizes = [15 if label == 1 else 3 for label in y_tsne]

sns.scatterplot(
    x = plot_tsne[:, 0],
    y = plot_tsne[:, 1],
    hue = y_tsne,
    s = marker_sizes,
    linewidth = 0,
    palette = ["#ff3300", "#ffffff", "#33cc33"],
    ax = ax,
)
ax.grid(False)
ax.set_title(f"TSNE plot w/ target = {TARGET} | train data")
plt.show()

In [None]:
def train_model(train_data, model, features, n_splits, kfold_seed, include_orig):
    """Cross-validate a classifier and average its test-set probabilities.

    For each stratified fold a fresh copy of ``model`` is fitted on the
    training part (optionally extended with the global ``original`` frame),
    out-of-fold probabilities are stored, and the probabilities predicted for
    the global ``test`` frame are accumulated as a mean over folds.

    Note: for the LightGBM / XGBoost / CatBoost branches the validation fold is
    also passed as ``eval_set``, so with early stopping the reported fold
    scores are measured on the same data that chose the stopping point.

    Parameters
    ----------
    train_data : pandas.DataFrame
        Labelled data containing ``features`` and the global ``TARGET`` column.
    model : estimator
        Unfitted classifier exposing ``fit`` and ``predict_proba``. It is used
        as a template; each fold trains its own copy, so the object passed in
        is not fitted by this function.
    features : list of str
        Column names used as model inputs.
    n_splits : int
        Number of stratified folds.
    kfold_seed : int
        Seed for the fold shuffling.
    include_orig : bool
        When true, the rows of ``original`` are appended to every training fold
        (never to the validation fold).

    Returns
    -------
    oof_full : numpy.ndarray, shape (len(train_data), n_classes)
        Out-of-fold class probabilities.
    test_preds : numpy.ndarray, shape (len(test), n_classes)
        Fold-averaged class probabilities for ``test``.
    models : list
        The fitted estimator of each fold (distinct objects).
    """
    # Function-scope imports keep this cell self-contained.
    from copy import deepcopy
    from sklearn.base import clone

    # The class name is more reliable than parsing str(model), whose format is
    # not guaranteed to start with the class name for every library.
    model_name = type(model).__name__
    n_classes = train_data[TARGET].nunique()
    test_preds = np.zeros((len(test), n_classes))
    oof_full = np.zeros((len(train_data), n_classes))
    val_scores, models = [], []

    print(model_name)

    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=kfold_seed)
    folds = skf.split(train_data[features], train_data[TARGET])

    for i, (train_idx, val_idx) in enumerate(tqdm(folds, total=n_splits)):

        # `split` yields positional indices, so select rows with .iloc; .loc
        # would only be correct when the index happens to be 0..n-1.
        X_train, X_val = train_data[features].iloc[train_idx], train_data[features].iloc[val_idx]
        y_train, y_val = train_data[TARGET].iloc[train_idx], train_data[TARGET].iloc[val_idx]

        if include_orig:
            X_train = pd.concat([X_train, original[features]], ignore_index = True)
            y_train = pd.concat([y_train, original[TARGET]], ignore_index = True)

        # Train a separate estimator per fold so that `models` ends up holding
        # n_splits different fitted models instead of n_splits references to
        # the one fitted last.
        try:
            fold_model = clone(model)
        except TypeError:
            # Not a scikit-learn compatible estimator; fall back to a deep copy.
            fold_model = deepcopy(model)

        if model_name in ["LGBMRegressor", "LGBMClassifier"]:
            callbacks = [early_stopping(stopping_rounds=50)]
            fold_model.fit(X_train, y_train, eval_set=[(X_val, y_val)], callbacks=callbacks)
        elif model_name in ["XGBClassifier", "CatBoostClassifier"]:
            fold_model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=10000)
        else:
            fold_model.fit(X_train, y_train)

        oof_preds = fold_model.predict_proba(X_val)
        test_preds += fold_model.predict_proba(test[features]) / n_splits

        oof_full[val_idx] = oof_preds
        score = log_loss(y_val, oof_preds)

        models.append(fold_model)
        val_scores.append(score)

        print(f"{GREEN_TXT}FOLD {i + 1} log_loss: {round(score, 4)}{RESET_TXT}")

    print(f'{GREEN_TXT}mean log_loss across all folds: {np.mean(val_scores):.5f}{RESET_TXT}')
    print(f'{GREEN_TXT}std of log_loss across all folds: {np.std(val_scores):.5f}{RESET_TXT}')

    return oof_full, test_preds, models

In [None]:
# Hyper-parameters for the two gradient-boosting models.
#
# The objectives use each library's own multiclass name. 'multi_logloss' is a
# metric name rather than an objective; NOTE(review): it presumably only worked
# because the scikit-learn wrappers substitute a multiclass objective when the
# target has more than two classes -- confirm against the installed versions.
xgb_params = {
    'objective': 'multi:softprob',
    'max_depth': 9,
    'learning_rate': 0.034869481921747415,
    # Upper bound only; training is expected to stop early.
    'n_estimators': 10000,
    'early_stopping_rounds': 50,
    'min_child_weight': 9,
    'colsample_bytree': 0.2,
    'reg_alpha': 0.10626128775335533,
    'reg_lambda': 0.624196407787772,
    'random_state': 42
}

lgb_params = {
    'objective': 'multiclass',
    'max_depth': 9,
    'min_child_samples': 14,
    'learning_rate': 0.034869481921747415,
    # Upper bound only; `train_model` adds an early-stopping callback for LightGBM.
    'n_estimators': 100000,
    'min_child_weight': 9,
    'colsample_bytree': 0.1702910221565107,
    'reg_alpha': 0.10626128775335533,
    'reg_lambda': 0.624196407787772,
    'random_state': 42
}

In [None]:
# 10-fold stratified CV for XGBoost, with the original dataset appended to
# every training fold.
xgb_estimator = XGBClassifier(**xgb_params)
xgb_oof_preds, xgb_test_preds, xgb_models = train_model(
    train_data = train, model = xgb_estimator, features = FEATURES,
    n_splits = 10, kfold_seed = 0, include_orig = True,
)

In [None]:
# 10-fold stratified CV for LightGBM, with the original dataset appended to
# every training fold.
lgb_estimator = LGBMClassifier(**lgb_params)
lgb_oof_preds, lgb_test_preds, lgb_models = train_model(
    train_data = train, model = lgb_estimator, features = FEATURES,
    n_splits = 10, kfold_seed = 0, include_orig = True,
)

In [None]:
# Equal-weight blend of the two models. Probability column k corresponds to
# the label encoded as k (D = 0, CL = 1, C = 2).
for class_idx, class_label in enumerate(["D", "CL", "C"]):
    sub[f"{TARGET}_{class_label}"] = (
        xgb_test_preds[:, class_idx] * 0.5 + lgb_test_preds[:, class_idx] * 0.5
    )
sub.head(3)

In [None]:
plt.style.use("default")
plt.figure(figsize = (12, 10))

# Overlay the distribution of the blended probability of each class.
# The series are passed directly as `x`; no `data` frame is given because the
# values come from `sub`, not from whatever frame an earlier loop left behind
# in a stray variable.
for class_col in ["Status_C", "Status_CL", "Status_D"]:
    sns.histplot(x = sub[class_col], kde = True, label = class_col)

plt.xlabel("Predictions")
plt.legend()
plt.show()

In [None]:
# Write the blended predictions to the submission file, without the
# DataFrame index column.
sub.to_csv("submission.csv", index = False)