In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
from tqdm import tqdm
tqdm.pandas()
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Load the competition files: sample submission template, training set and test set.
sample_submission, train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/tabular-playground-series-mar-2021/sample_submission.csv',
        '/kaggle/input/tabular-playground-series-mar-2021/train.csv',
        '/kaggle/input/tabular-playground-series-mar-2021/test.csv',
    )
)

In [None]:
# Preview the first few rows of the training data.
train_df.head()

As per the data description, cat0-cat18 are categorical columns and cont0-cont10 are numerical columns.

In [None]:
# (rows, columns) of the training set.
train_df.shape

In [None]:
# (rows, columns) of the test set.
test_df.shape

### Check for Null values.

In [None]:
# Number of missing values in each training column.
train_df.isna().sum()

In [None]:
# Group the feature columns by name: continuous ones contain 'cont', categorical ones contain 'cat'.
cont_cols = list(filter(lambda name: 'cont' in name, train_df.columns))
cat_cols = list(filter(lambda name: 'cat' in name, train_df.columns))

In [None]:
# Histogram (25 bins) of every continuous training feature to inspect their distributions.
train_df[cont_cols].hist(color='k', alpha=0.5, bins=25, figsize=(20, 10))

1. For some columns, such as cont3 and cont4, it looks like we could convert the values to categorical ones by bucketing.  
2. We also need to take a look at the right "tails" of cont0 and cont10.


In [None]:
# Compare the density of every continuous feature in train (split by target) and in test.
#
# `sns.displot` is a figure-level function: it always creates its own figure and
# cannot draw onto existing axes. To lay the plots out on a grid we create the
# axes explicitly and use the axes-level `sns.kdeplot`, which accepts `ax=`.
N_PLOT_COLS = 3
n_plot_rows = max(1, -(-len(cont_cols) // N_PLOT_COLS))  # ceiling division

# squeeze=False keeps the axes array 2-D even when there is a single row.
fig_train, axs_train = plt.subplots(
    n_plot_rows, N_PLOT_COLS, figsize=(20, 4 * n_plot_rows), squeeze=False
)
fig_test, axs_test = plt.subplots(
    n_plot_rows, N_PLOT_COLS, figsize=(20, 4 * n_plot_rows), squeeze=False
)

for i, var in enumerate(cont_cols):
    r, c = divmod(i, N_PLOT_COLS)
    sns.kdeplot(data=train_df, x=var, hue="target", ax=axs_train[r, c])
    sns.kdeplot(data=test_df, x=var, ax=axs_test[r, c])

# Hide the grid cells left over when the feature count is not a multiple of N_PLOT_COLS.
for unused_ax in list(axs_train.flat)[len(cont_cols):] + list(axs_test.flat)[len(cont_cols):]:
    unused_ax.set_visible(False)

fig_train.suptitle("Train: continuous features by target")
fig_test.suptitle("Test: continuous features")
fig_train.tight_layout()
fig_test.tight_layout()

The distributions of the train and test sets seem to match.

In [None]:
# Summary statistics for the categorical training columns.
train_df[cat_cols].describe()

Some columns have a large number of unique values; maybe we can try some bucketing later on.

In [None]:
# How many training rows fall into each target class.
train_df['target'].value_counts()

So, this is an imbalanced classification problem.

### Let us try some modelling.

Preprocessing first

In [None]:
# Label-encode every categorical column.
# The encoder is fitted on the union of train and test values so that a category
# appearing in only one of the two sets still receives a code and `transform`
# does not fail on unseen labels.
for col in cat_cols:
    le = preprocessing.LabelEncoder()
    # `Series.append` was removed in pandas 2.0; `pd.concat` is the supported way
    # to stack the two columns.
    full = pd.concat([train_df[col], test_df[col]])
    le.fit(full)
    train_df[col] = le.transform(train_df[col])
    test_df[col] = le.transform(test_df[col])

In [None]:
# Preview the training data after the categorical columns were label-encoded.
train_df.head()

In [None]:
# Preview the test data after the categorical columns were label-encoded.
test_df.head()

In [None]:
# Separate the label from the features; the row identifier is not a feature,
# so it is removed from both the train and test matrices.
y = train_df['target']
X = train_df.drop(columns=['id', 'target'])
X_test = test_df.drop(columns=['id'])

In [None]:
# K-fold cross-validation with a random forest.
# For every fold: fit on the training part, score the held-out part with ROC AUC,
# and predict the test set. Test predictions are kept per fold (`pred`) and as a
# running average (`test_preds`); out-of-fold predictions go to `train_score_dict`.
NUM_FOLDS = 5
SEED = 137
kf = KFold(n_splits=NUM_FOLDS, shuffle=True, random_state=SEED)
train_score_dict = {}
test_preds = 0

pred = []

# `total` lets tqdm show real progress; enumerate() has no length of its own.
for i, (train_ind, val_ind) in tqdm(enumerate(kf.split(X, y)), total=NUM_FOLDS):
    print(f'Index {i}')
    # Fold-local names: do NOT reuse `train_df` here, otherwise the global training
    # frame is replaced by a fold slice and earlier cells break when re-run.
    X_fold_train, X_fold_val = X.iloc[train_ind], X.iloc[val_ind]
    y_fold_train, y_fold_val = y.iloc[train_ind], y.iloc[val_ind]

    # Seed the forest so the scores and the submission are reproducible;
    # n_jobs=-1 trains the trees on all available cores.
    clf = RandomForestClassifier(random_state=SEED, n_jobs=-1)
    clf = clf.fit(X_fold_train, y_fold_train)
    temp_oof = clf.predict_proba(X_fold_val)[:, 1]   # P(target == 1) on the held-out fold
    temp_test = clf.predict_proba(X_test)[:, 1]      # P(target == 1) on the test set

    train_score_dict[i] = temp_oof
    test_preds += temp_test / NUM_FOLDS
    pred.append(temp_test)

    print(roc_auc_score(y_fold_val, temp_oof))

In [None]:
# Average the per-fold test predictions element-wise.
# `pred` is a list with one prediction array per fold; taking the mean over
# axis 0 gives one averaged probability per test row. (The original called an
# undefined `mean` with five positional scalars, which raises NameError and
# also hard-codes the number of folds.)
final_pred = np.mean(pred, axis=0)

In [None]:
# Fill the submission template with the averaged predictions and save it
# to the working directory without the index column.
sample_submission['target'] = final_pred
sample_submission.to_csv(path_or_buf='submission_as.csv', index=False)