In [None]:
import math
import numpy as np

import pandas as pd

from sklearn.base import clone
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, roc_auc_score

import matplotlib.pyplot as plt
import seaborn as sns

from xgboost import XGBClassifier


# Default size (width, height) for every matplotlib figure created below;
# the multi-panel grids drawn by the plotting helpers rely on the large canvas.
plt.rcParams["figure.figsize"] = (20,20)

# Gathering the data

In [None]:
# Read test.csv into a DataFrame and display it.
test_df = pd.read_csv('../input/tabular-playground-series-mar-2021/test.csv')
test_df

In [None]:
# Read train.csv into a DataFrame and display it.
train_df = pd.read_csv('../input/tabular-playground-series-mar-2021/train.csv')
train_df

In [None]:
# Stack both splits so that feature engineering is applied to them together,
# ordered by id. Each CSV is read with its own 0..n-1 index, so a plain
# concat would leave duplicate index labels; the index is rebuilt so that
# every row has a unique label for later alignment (column assignment,
# axis=1 concat, seaborn x/hue vectors).
df = (
    pd.concat([test_df, train_df], ignore_index=True)
    .sort_values(by='id')
    .reset_index(drop=True)
)
df

In [None]:
# Column dtypes and non-null counts of the combined frame.
df.info()

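No missing data in the features (`target` is missing only for the test rows, which are selected later via `df['target'].isna()`).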

# Target value

In [None]:
# Absolute class counts of the target.
df['target'].value_counts()

In [None]:
# The same counts expressed as fractions.
df['target'].value_counts(normalize=True)

In [None]:
# Bar chart of the class counts.
sns.countplot(x=df['target'])

In [None]:
# Attempt an integer cast of the target.
# NOTE(review): with errors='ignore' a failing cast returns the column
# unchanged. If target holds NaN for the test rows (presumably, since they
# are later selected via isna()), this is effectively a no-op and target
# stays float — confirm. The numeric-column selection further down drops
# 'target' from the float columns, so it relies on target remaining float.
df['target'] = df['target'].astype('int', errors='ignore')

In [None]:
def plot_multiple_cols(df, plot_method, cols=5):
    """Draw one subplot per column of ``df`` on a grid ``cols`` panels wide.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are plotted, one column per axes, in column order.
    plot_method : callable
        Called as ``plot_method(series, ax=axes)`` for every column
        (for example ``sns.histplot``).
    cols : int, default 5
        Number of panels per row; the number of rows is derived from it.

    Returns
    -------
    None
        The figure is created as a side effect.
    """
    n_plots = df.shape[1]
    if n_plots == 0:
        # Nothing to draw; plt.subplots cannot build a grid with zero rows.
        return

    rows = math.ceil(n_plots / cols)

    # squeeze=False keeps `axs` two-dimensional even when the grid has a
    # single row or column; otherwise plt.subplots returns a 1-D array (or a
    # bare Axes) and two-level indexing fails.
    _, axs = plt.subplots(rows, cols, squeeze=False)
    flat_axes = axs.ravel()  # row-major: fill left to right, top to bottom

    for position in range(n_plots):
        # Positional access also works if column labels are not unique.
        plot_method(df.iloc[:, position], ax=flat_axes[position])

    # Hide the trailing grid cells that received no column.
    for unused_ax in flat_axes[n_plots:]:
        unused_ax.set_visible(False)

In [None]:
def plot_multiple_cols_vs_target(df, target, plot_method, cols=5):
    """Draw one subplot per column of ``df``, coloured by ``target``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are plotted, one column per axes, in column order.
    target : array-like
        Passed unchanged as ``hue`` to every plot call.
    plot_method : callable
        Called as ``plot_method(x=series, hue=target, ax=axes,
        multiple='layer')`` for every column (for example ``sns.histplot``).
    cols : int, default 5
        Number of panels per row; the number of rows is derived from it.

    Returns
    -------
    None
        The figure is created as a side effect.
    """
    n_plots = df.shape[1]
    if n_plots == 0:
        # Nothing to draw; plt.subplots cannot build a grid with zero rows.
        return

    rows = math.ceil(n_plots / cols)

    # squeeze=False keeps `axs` two-dimensional even when the grid has a
    # single row or column; otherwise plt.subplots returns a 1-D array (or a
    # bare Axes) and two-level indexing fails.
    _, axs = plt.subplots(rows, cols, squeeze=False)
    flat_axes = axs.ravel()  # row-major: fill left to right, top to bottom

    for position in range(n_plots):
        # Positional access also works if column labels are not unique.
        plot_method(
            x=df.iloc[:, position],
            hue=target,
            ax=flat_axes[position],
            multiple='layer',
        )

    # Hide the trailing grid cells that received no column.
    for unused_ax in flat_axes[n_plots:]:
        unused_ax.set_visible(False)

In [None]:
def value_counts_feat_vs_target(df, cols, target_col):
    """Print the normalised distribution of ``target_col`` per feature level.

    For every column name in ``cols`` the frame is grouped by that column and
    the relative frequencies of ``target_col`` inside each group are printed.
    """
    for feature in cols:
        target_by_level = df.groupby(by=feature)[target_col]
        shares = target_by_level.value_counts(normalize=True)
        print(shares)

# Numeric

In [None]:
# Float-typed columns except the target are treated as the numeric features.
numeric_cols = df.select_dtypes('float').columns.drop('target')
numeric_cols

In [None]:
# Histogram of every numeric feature on a 4-wide grid.
plot_multiple_cols(df[numeric_cols], sns.histplot, cols=4)

In [None]:
# Same histograms, coloured by target value (passed as hue).
plot_multiple_cols_vs_target(df[numeric_cols], df['target'], sns.histplot, cols=4)

We can see that the distributions of the continuous features remain almost the same across target values, but there are some differences.
Let's take them into account and highlight these differences with threshold features.

In [None]:
# Threshold indicators for selected continuous features. Each new column is
# 'B' where the condition holds and 'A' where it does not.
threshold_masks = {
    'cont1_1': df['cont1'] > 0.9,
    'cont2_1': df['cont2'] > 0.9,
    'cont3_1': df['cont3'] < 0.2,
    'cont6_1': df['cont6'] > 0.7,
    'cont8_1': df['cont8'] > 0.5,
}

for indicator_name, condition in threshold_masks.items():
    df[indicator_name] = condition.astype('int').replace({0: 'A', 1: 'B'})

In [None]:
# Names of the threshold indicator columns derived from the cont features.
new_cols = ['cont1_1', 'cont2_1', 'cont3_1', 'cont6_1', 'cont8_1']

In [None]:
# Distribution of each indicator, coloured by target value.
plot_multiple_cols_vs_target(df[new_cols], df['target'], sns.histplot, cols=4)

As we can see, when these indicators are true ('B'), there is a high probability that the target is true as well (except for cont8).

# Categorical

In [None]:
# Object-typed columns are treated as categorical features and converted to
# pandas' category dtype (this also picks up the 'A'/'B' indicator columns,
# which hold strings).
cat_cols = df.select_dtypes('object').columns
df[cat_cols] = df[cat_cols].astype('category')

In [None]:
# Number of distinct values per categorical column.
df[cat_cols].nunique()

In [None]:
# Level frequencies of every categorical column.
plot_multiple_cols(df[cat_cols], sns.histplot)

let's separate binary and non-binary cols

## Non-binary

In [None]:
# Categorical columns with more than two distinct values.
non_bin_cols = cat_cols[df[cat_cols].nunique() > 2]

In [None]:
# Level frequencies of the non-binary categorical columns.
plot_multiple_cols(df[non_bin_cols], sns.histplot)

Let's separate the categorical features with many levels (large) from those with few levels (small).

In [None]:
### Large

In [None]:
# Non-binary categorical columns with more than 20 distinct values.
large_cat_cols = non_bin_cols[df[non_bin_cols].nunique() > 20]


In [None]:
# Level frequencies of the large categorical columns, two panels per row.
plot_multiple_cols(df[large_cat_cols], sns.histplot, cols=2)

In [None]:
# Same plots, coloured by target value.
plot_multiple_cols_vs_target(df[large_cat_cols], df['target'], sns.histplot, cols=2)

In [None]:
# One-hot encode cat7 and keep only the levels that occur in more than
# 15000 rows.
cat_dummy = (
    pd.get_dummies(df['cat7'], prefix='cat7')
    .loc[:, lambda dummies: dummies.sum() > 15000]
)

In [None]:
# Distribution of each retained cat7 dummy, coloured by target value.
plot_multiple_cols_vs_target(cat_dummy, df['target'], sns.histplot, cols=5)

In [None]:
# Append the retained dummies as new columns.
# NOTE: re-running this cell appends the same columns a second time.
df = pd.concat([df, cat_dummy], axis=1)

In [None]:
# One-hot encode cat8 and keep only the levels that occur in more than
# 15000 rows.
cat_dummy = (
    pd.get_dummies(df['cat8'], prefix='cat8')
    .loc[:, lambda dummies: dummies.sum() > 15000]
)

In [None]:
# Distribution of each retained cat8 dummy, coloured by target value.
plot_multiple_cols_vs_target(cat_dummy, df['target'], sns.histplot, cols=5)

In [None]:
# Append the retained dummies as new columns.
# NOTE: re-running this cell appends the same columns a second time.
df = pd.concat([df, cat_dummy], axis=1)

In [None]:
# One-hot encode cat10 and keep only the levels that occur in more than
# 15000 rows.
cat_dummy = (
    pd.get_dummies(df['cat10'], prefix='cat10')
    .loc[:, lambda dummies: dummies.sum() > 15000]
)

In [None]:
# Distribution of each retained cat10 dummy, coloured by target value.
plot_multiple_cols_vs_target(cat_dummy, df['target'], sns.histplot, cols=5)

In [None]:
# Append the retained dummies as new columns.
# NOTE: re-running this cell appends the same columns a second time.
df = pd.concat([df, cat_dummy], axis=1)

### Small

In [None]:
# Non-binary categorical columns with at most 20 distinct values.
little_cat_cols = non_bin_cols[df[non_bin_cols].nunique() <= 20]

In [None]:
# Level frequencies of the small categorical columns.
plot_multiple_cols(df[little_cat_cols], sns.histplot)

In [None]:
# Same plots, coloured by target value.
plot_multiple_cols_vs_target(df[little_cat_cols], df['target'], sns.histplot, cols=4)

In [None]:
# One-hot encode every small categorical column, keeping only the levels
# that occur in more than 15000 rows. The dummy frames are collected first
# and appended with a single concat, because concatenating inside the loop
# copies the whole (growing) frame on every iteration.
little_cat_dummies = []
for col in little_cat_cols:
    cat_dummy = pd.get_dummies(df[col], prefix=col)
    cat_dummy = cat_dummy[cat_dummy.columns[cat_dummy.sum() > 15000]]
    little_cat_dummies.append(cat_dummy)

df = pd.concat([df, *little_cat_dummies], axis=1)

## Binary

In [None]:
# Every column with exactly two distinct values, across the whole frame (not
# only the categorical columns). Later cells call .drop('target') on this
# selection, so 'target' is expected to be part of it.
bin_cols = df.columns[df.nunique() == 2]

In [None]:
# Value frequencies of every two-valued column.
plot_multiple_cols(df[bin_cols], sns.histplot, cols=5)

In [None]:
# Same plots, coloured by target value.
plot_multiple_cols_vs_target(df[bin_cols], df['target'], sns.histplot, cols=5)

In [None]:
# Print the target distribution within each level of every two-valued column.
value_counts_feat_vs_target(df, bin_cols, 'target')

In [None]:
# Replace every categorical two-valued column by its integer category codes.
# Columns of any other dtype have no `.cat` accessor and are left untouched.
# The dtype is checked explicitly instead of wrapping the assignment in a
# bare `except`, which would also hide unrelated errors.
for col in bin_cols:
    if isinstance(df[col].dtype, pd.CategoricalDtype):
        df[col] = df[col].cat.codes

# Modeling

In [None]:
# Training rows are exactly the rows that have a target. Restricting dropna
# to the target column mirrors how the test rows are selected
# (df['target'].isna()) and avoids silently discarding labelled rows that
# happen to contain a NaN in some other column.
train_df = df.dropna(subset=['target'])

In [None]:
# Rows without a target are the ones to predict.
test_df = df.loc[df['target'].isna(), :]

In [None]:
# Features: the two-valued columns minus the target itself; label: the target.
X = train_df[bin_cols].drop('target', axis=1)
y = train_df['target']

## XGBoost

In [None]:
# Gradient-boosted tree classifier used as the template estimator for
# cross-validation. The hyperparameters are hand-set; no tuning is visible
# in this notebook.
model = XGBClassifier(random_state=42,  # fixed seed
                      use_label_encoder=False,
                      eval_metric='error',

                      n_estimators=100,
                      learning_rate=0.1,
                      max_depth=10,
                      subsample=0.8,
                      colsample_bytree=0.8,
                      gamma=5,
                     )

In [None]:
# 5-fold stratified cross-validation; one fitted estimator is kept per fold.
models = []

kf = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)

for train_index, test_index in kf.split(X, y):
    X_train, X_test = X.iloc[train_index, :], X.iloc[test_index, :]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Fit an unfitted copy of the template for this fold. Appending `model`
    # itself would store the same object five times, so every entry of
    # `models` would end up being the estimator fitted on the last fold.
    fold_model = clone(model)
    fold_model.fit(X_train, y_train)
    models.append(fold_model)

    # Fold number, then ROC AUC on the training part and on the held-out part.
    print(len(models))
    print(roc_auc_score(y_train, fold_model.predict_proba(X_train)[:, 1]))
    print(roc_auc_score(y_test, fold_model.predict_proba(X_test)[:, 1]))
    print()
    

# Submission

In [None]:
# Take the estimator stored at index 2 of the cross-validation list.
model1 = models[2]

In [None]:
# Column 1 of predict_proba (probability of the second class) for every
# test row, using the same feature columns as in training.
result1 = model1.predict_proba(test_df[bin_cols].drop('target', axis=1))[:, 1]
                

In [None]:
# Take the estimator stored at index 4 of the cross-validation list.
model2 = models[4]

In [None]:
# Column 1 of predict_proba for every test row, from the second selected
# estimator (previously this reused model1, making result2 equal to result1).
result2 = model2.predict_proba(test_df[bin_cols].drop('target', axis=1))[:, 1]
                

In [None]:
# Weighted blend of the two prediction vectors. The weights already sum to
# 1, so no further division is needed; dividing by 2 would halve every
# probability and cap the blend at 0.5.
result = result1 * 0.1 + result2 * 0.9

In [None]:
# Display the blended predictions.
result

In [None]:
# Compare the two prediction vectors point by point. x and y are passed by
# keyword because current seaborn releases no longer accept them positionally.
sns.scatterplot(x=result1, y=result2)

In [None]:
# Distribution of the blended predictions.
sns.histplot(result)

In [None]:
# Pair every test id with its blended prediction.
my_submission = pd.DataFrame({'id': test_df['id'], 'target': result})

# Write the submission file without the DataFrame index column.
my_submission.to_csv('submission_new.csv', index=False)