In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib.pyplot as plt
plt.rcParams["figure.figsize"] = (20,10) # default (width, height) for every figure created below; makes plots a bit bigger

# Load Data

In [None]:
# Read both competition splits; the `id` column becomes the row index.
train_df = pd.read_csv(
    '../input/tabular-playground-series-may-2022/train.csv',
    index_col='id',
)
test_df = pd.read_csv(
    '../input/tabular-playground-series-may-2022/test.csv',
    index_col='id',
)

# Preprocess

## Add Features

In [None]:
import string
characters = list(string.ascii_uppercase)
def engineer_features(df):
    """Add engineered columns to ``df`` in place (nothing is returned).

    Columns added, in this order:

    * one column per uppercase letter ``A`` .. ``Z`` holding how many times
      that letter occurs in the string column ``f_27``;
    * ``unique_text_len``: the number of distinct characters in ``f_27``;
    * ``i_02_21``, ``i_05_22``, ``i_00_01_26``: three-valued flags that are
      +1 when a sum of features exceeds an upper cut-off, -1 when it falls
      below a lower cut-off, and 0 otherwise.

    The ``f_27`` column itself is removed from ``df``.
    """
    def band_flag(total, upper, lower):
        # +1 above `upper`, -1 below `lower`, 0 in between.
        above = (total > upper).astype(int)
        below = (total < lower).astype(int)
        return above - below

    text = df['f_27']

    # Decode f_27: one occurrence-count feature per letter.
    for letter in characters:
        df[letter] = text.str.count(letter)

    # Number of distinct letters, after
    # https://www.kaggle.com/code/slythe/relative-features-w-lightgbm
    df["unique_text_len"] = text.apply(lambda value: len(set(value)))

    # The raw string is no longer needed once it has been decoded.
    df.drop(columns='f_27', inplace=True)

    # Interaction flags, after
    # https://www.kaggle.com/competitions/tabular-playground-series-may-2022/discussion/323892
    sum_02_21 = df['f_21'] + df['f_02']
    df['i_02_21'] = band_flag(sum_02_21, 5.2, -5.3)

    sum_05_22 = df['f_22'] + df['f_05']
    df['i_05_22'] = band_flag(sum_05_22, 5.1, -5.4)

    sum_00_01_26 = df['f_00'] + df['f_01'] + df['f_26']
    df['i_00_01_26'] = band_flag(sum_00_01_26, 5.0, -5.0)
    

In [None]:
# Separate the features from the label.
X_train = train_df.drop(columns=['target'])
y_train = train_df['target']
# Work on a copy: engineer_features mutates its argument in place (it drops
# `f_27`), so aliasing test_df here would silently alter the raw frame and
# make this cell fail with a KeyError when it is run a second time.
X_test = test_df.copy()

engineer_features(X_train)
engineer_features(X_test)

submission = pd.DataFrame(index = X_test.index)  # prepare df for submission

display(X_train,y_train,X_test)

## Feature Selection

https://www.kaggle.com/code/prashant111/comprehensive-guide-on-feature-selection/notebook

### Remove Constant Features

In [None]:
# Detect constant (zero-variance) columns in the training features.
from sklearn.feature_selection import VarianceThreshold
sel = VarianceThreshold(threshold=0.00)  # threshold 0: only features with zero variance are rejected
sel.fit(X_train)  # fit finds the features with zero variance

In [None]:
# Names of the columns the fitted selector does NOT keep, i.e. the constant
# features (the support mask is inverted with `~`).
to_drop = X_train.columns[~sel.get_support()]
to_drop  # last expression of the cell, so the list is shown as output

In [None]:
# Remove the constant columns from both feature frames, modifying them in place.
X_train.drop(to_drop,axis=1,inplace=True)
X_test.drop(to_drop,axis=1,inplace=True)

In [None]:
# # drop constant columns
# X_train = sel.transform(X_train)
# X_test = sel.transform(X_test)

In [None]:
# Display the training feature frame as it stands at this point.
X_train

# Model

In [None]:
from xgboost import XGBClassifier

# Fixed hyperparameter values, collected in one place and unpacked into the
# classifier constructor below.
xgb_params = {
    "n_estimators": 600,
    "learning_rate": 0.2106438439544827,
    "gamma": 0.27461206167034957,
    "max_depth": 9,
    "subsample": 0.9974663532226815,
    "reg_lambda": 0.27324005545671887,
    "reg_alpha": 4.80290864576712,
}
model_xgb = XGBClassifier(**xgb_params)

# https://www.kaggle.com/code/davidhammond/may-tp-xgboost-parameter-search

### Validation

To save time, validation uses only a subset of the data (the first 30,000 rows).

In [None]:
%%time
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import RocCurveDisplay

# Cross-validate on the first 30000 rows only, to keep the run time short.
X_subset = X_train.head(30000)
y_subset = y_train.head(30000)

fig, ax = plt.subplots()
skf = StratifiedKFold(n_splits = 3)
for fold, (train, test) in enumerate(skf.split(X_subset, y_subset), start=1):
    # Refit on this fold's training rows, then overlay its hold-out ROC curve.
    model_xgb.fit(X_subset.iloc[train], y_subset.iloc[train])
    # Name each curve after its fold; with the default name all three legend
    # entries would carry the same estimator name.
    RocCurveDisplay.from_estimator(
        model_xgb,
        X_subset.iloc[test],
        y_subset.iloc[test],
        name=f"Fold {fold}",
        ax=ax,
    )

# Diagonal reference line: the ROC curve of a random classifier.
ax.plot([0, 1], [0, 1], linestyle="--", lw=2, color="r", label="Chance", alpha=0.8)
# The legend was last built while plotting the fold curves, before the chance
# line existed; rebuild it so the "Chance" label is actually listed.
ax.legend(loc="lower right")
ax.set_title("ROC curves, 3-fold stratified CV on the first 30000 rows")
plt.show()

## Train and Predict

Train on the whole training set, then predict on the test set.

In [None]:
%%time

# Refit the classifier on all training rows (replaces the fit from the validation folds).
model_xgb.fit(X_train, y_train)

In [None]:
# Predicted class probabilities for the test rows.
y_xgb = model_xgb.predict_proba(X_test)
y_xgb  # NOTE(review): not the last expression of the cell, so with default notebook settings nothing is displayed here

# Keep only column 1 (the probability of class 1) for the submission.
submission['xgb'] = y_xgb[:,1] # Metric is AUC -> the probabilities of 1 will yield better results

In [None]:
# Histogram (100 bins) of the values stored in the submission frame.
submission.plot.hist(bins = 100)

## Feature Importance

In [None]:
# Show the fitted model's per-feature importance scores as the cell output.
model_xgb.feature_importances_

In [None]:
# Plot feature importance for the fitted model, limited to at most 30 features.
from xgboost import plot_importance
plot_importance(model_xgb, max_num_features = 30)
plt.show()

# Submit

In [None]:
# Write only the `xgb` column, under the header `target`, and include the
# frame's index as a column in submission.csv.
submission.to_csv('submission.csv',columns=['xgb'], header=['target'],index=True)