In [None]:
import pandas as pd
from pathlib import Path
import numpy as np
from sklearn.model_selection import train_test_split
from scipy import stats
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_validate
import warnings 
from xgboost import XGBClassifier

In [None]:
# Read the training set, using the `id` column as the row index.
df_train = pd.read_csv(
    "../input/tabular-playground-series-sep-2021/train.csv", index_col="id"
)

# The final column is taken as the target; every column before it is a feature.
FEATURES, TARGET = df_train.columns[:-1], df_train.columns[-1]

# Preview the first rows of the training frame.
df_train.head()

In [None]:
# Read the test set, using the `id` column as the row index.
X_test = pd.read_csv(
    "../input/tabular-playground-series-sep-2021/test.csv",
    index_col="id",
)

In [None]:
# Compare the absolute skewness of one column before any processing and after
# a log / sqrt transform.
colname = 'f110'
col_raw = df_train[colname]
col_abs = np.abs(col_raw)
col_sign = np.sign(col_raw)

# Baseline: measured on the untouched column (not on a log of it), so the
# three numbers below are directly comparable.
print("Skewness before processing:", abs(col_raw.skew()))

# Both transforms are applied to |x| and then multiplied by sign(x).
# NOTE(review): log(|x|) is negative for |x| < 1 and undefined at 0, so the
# log variant flips the sign of small values and yields NaN for exact zeros.
print("Skewness after processing with log:", abs((np.log(col_abs) * col_sign).skew()))
print("Skewness after processing with sqrt:", abs((np.sqrt(col_abs) * col_sign).skew()))

In [None]:
# Reduce skewness: for every feature whose raw |skewness| exceeds the
# threshold, replace it (in both df_train and X_test) with whichever of the
# log / sqrt transforms gives the lowest |skewness| on the training data,
# provided that transform actually improves on the raw column.
#
# NOTE: this cell mutates df_train and X_test in place, so running it twice
# transforms the already-transformed columns again. Reload the data first.
SKEW_THRESHOLD = 2.5


def _signed_log(col):
    """Return log(|col|) * sign(col).

    NOTE(review): the result is NaN where col == 0 and has the opposite sign
    of the input where |col| < 1.
    """
    return np.log(np.abs(col)) * np.sign(col)


def _signed_sqrt(col):
    """Return sqrt(|col|) * sign(col)."""
    return np.sqrt(np.abs(col)) * np.sign(col)


for colname in FEATURES:
    col_raw = df_train[colname]
    # Baseline skewness is measured on the raw column itself.
    skew_val = abs(col_raw.skew())
    # Skip columns that are not skewed enough (also skips a NaN skewness)
    # before paying for the two transforms.
    if not skew_val > SKEW_THRESHOLD:
        continue

    col_log = _signed_log(col_raw)
    col_sqrt = _signed_sqrt(col_raw)
    log_skew_val = abs(col_log.skew())
    sqrt_skew_val = abs(col_sqrt.skew())

    # The choice is made on training data only; the same transform is then
    # applied to the test column so both frames stay consistent.
    if log_skew_val < sqrt_skew_val and log_skew_val < skew_val:
        df_train[colname] = col_log
        X_test[colname] = _signed_log(X_test[colname])
    elif sqrt_skew_val < log_skew_val and sqrt_skew_val < skew_val:
        df_train[colname] = col_sqrt
        X_test[colname] = _signed_sqrt(X_test[colname])

X = df_train[FEATURES]
y = df_train[TARGET]

In [None]:
# Configure the gradient-boosted classifier used for both CV and the final fit.
model = XGBClassifier(
    # Boosting schedule
    n_estimators=1600,
    learning_rate=0.03,
    # Tree depth and row / column subsampling
    max_depth=5,
    subsample=0.5,
    colsample_bytree=0.5,
    # Execution: the 'gpu_hist' tree method is active here; comment this line
    # out to fall back to the library default.
    tree_method='gpu_hist',
    n_jobs=-1,
    # Evaluation metric and seed
    eval_metric='auc',
    random_state=0,
)

In [None]:
# Check model performance with cross validation
# Silences every warning for the rest of the session (process-wide filter).
warnings.filterwarnings('ignore')

def score(X, y, model, cv):
    """Cross-validate `model` on ROC AUC and summarise the per-fold results.

    Args:
        X: Feature matrix passed to `cross_validate`.
        y: Target vector passed to `cross_validate`.
        model: Estimator passed to `cross_validate`.
        cv: Cross-validation strategy / number of folds passed to `cross_validate`.

    Returns:
        A DataFrame with one row per key returned by `cross_validate`
        and one column per fold, plus two summary columns:
        `mean` and `std`, both computed across the fold columns only.
    """
    scoring = ["roc_auc"]
    cv_results = cross_validate(
        model, X, y, scoring=scoring, cv=cv, return_train_score=True
    )
    per_fold = pd.DataFrame(cv_results).T
    # Compute both statistics from the per-fold frame *before* assigning.
    # `assign` evaluates callables in order, so a lazy `std` would see the
    # freshly added `mean` column and fold it into the standard deviation.
    return per_fold.assign(
        mean=per_fold.mean(axis=1),
        std=per_fold.std(axis=1),
    )

# Run a 2-fold cross-validation of the configured model on the full training data.
scores = score(X=X, y=y, model=model, cv=2)

# Render the per-fold results together with their summary columns.
display(scores)

In [None]:
# Train on every available training row.
model.fit(X, y)

# Probability of the positive class (second column of predict_proba) for each
# test row, labelled with the test ids and named after the target column.
positive_proba = model.predict_proba(X_test)[:, 1]
y_pred = pd.Series(positive_proba, index=X_test.index, name=TARGET)

# Write the predictions out as the submission file.
y_pred.to_csv("submission.csv")