## **Import Library**

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import math
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

import tqdm

from sklearn.metrics import mean_squared_error, roc_auc_score, roc_curve, accuracy_score
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectFromModel
from catboost import CatBoostRegressor

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Collect (and echo) the full path of every file found under the Kaggle input directory.
files = []
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        path = os.path.join(root, name)  # build the path once, reuse for both list and log
        files.append(path)
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## **Load Files**

In [None]:
def _pick_file(paths, basename, fallback_index):
    """Return the entry of `paths` whose file name equals `basename`.

    `os.walk` gives no ordering guarantee, so positional access into the
    collected list is fragile.  If no entry matches, the original positional
    index is used so behaviour is unchanged for unexpected file names.
    """
    for path in paths:
        if os.path.basename(path) == basename:
            return path
    return paths[fallback_index]


# NOTE(review): the base names below assume the usual Kaggle competition layout
# (train.csv / test.csv / sample_submission.csv) — confirm against the dataset.
train = pd.read_csv(_pick_file(files, 'train.csv', 1))
sub = pd.read_csv(_pick_file(files, 'test.csv', 2))
sample = pd.read_csv(_pick_file(files, 'sample_submission.csv', 0))

## **Files Info**

In [None]:
# Report (rows, columns) of the training frame and of the submission/test frame.
train_shape = train.shape
sub_shape = sub.shape
print("train shape :", train_shape, "\nsub shape :", sub_shape)

In [None]:
# Bare last expression: the notebook renders the training DataFrame as the cell output.
train

In [None]:
# DataFrame.info() writes its summary to stdout itself and returns None,
# so it is called directly instead of being wrapped in print() (which would
# emit a stray "None" line after each summary).
print("\nTrain Info")
train.info()
print("\nSub Info")
sub.info()

In [None]:
# List the training columns that have no counterpart in the submission/test frame,
# keeping the training column order.
print("columns which are not in TEST :")
sub_column_names = set(sub.columns.tolist())
[name for name in train.columns.tolist() if name not in sub_column_names]

In [None]:
# Number of rows per distinct value of the `target` column.
train['target'].value_counts()

In [None]:
# Remove the `id` column from the training frame.
# Rebinding (instead of inplace=True) with errors='ignore' keeps this cell
# idempotent: re-running it after `id` is already gone no longer raises KeyError.
train = train.drop(columns=['id'], errors='ignore')

In [None]:
# Fixed seed so the sub-samples (and everything computed from them) are
# reproducible across kernel restarts.
SEED = 42

# Split the training rows by target value.
train_0 = train[train['target'] == 0]
train_1 = train[train['target'] == 1]

# Small per-class samples for plotting; capped at the class size so that
# .sample() does not raise when a class has fewer than 500 rows.
eda_0 = train_0.sample(n=min(500, len(train_0)), random_state=SEED)
eda_1 = train_1.sample(n=min(500, len(train_1)), random_state=SEED)

# Keep 25% of each class for the rest of the notebook.
train_0 = train_0.sample(n=int(len(train_0)*0.25), random_state=SEED)
train_1 = train_1.sample(n=int(len(train_1)*0.25), random_state=SEED)

In [None]:
# Recombine the per-class samples, restore the original row order via the
# old index, then renumber the rows from 0.
train = pd.concat([train_0, train_1]).sort_index().reset_index(drop=True)

# Same treatment for the small plotting sample.
eda = pd.concat([eda_0, eda_1]).sort_index().reset_index(drop=True)

In [None]:
print("\nTrain describe :")
display(train.iloc[:, :-1].describe().T.style.bar(subset=['mean'])\
                            .background_gradient(subset=['std'])\
                            .background_gradient(subset=['50%']))
print("\nSub describe :")
display(sub.describe().T.style.bar(subset=['mean'])\
                            .background_gradient(subset=['std'])\
                            .background_gradient(subset=['50%']))

**Split data into train / hold-out sets (first 80% / last 20% of rows)**

In [None]:
# Positional 80/20 split: the first 80% of rows are used for fitting,
# the remaining rows are held out.  The last column is the label.
train_size = int(train.shape[0] * 0.8)

all_features = train.iloc[:, :-1]
all_labels = train.iloc[:, -1]

train_X, test_X = all_features.iloc[:train_size], all_features.iloc[train_size:]
train_y, test_y = all_labels.iloc[:train_size], all_labels.iloc[train_size:]

In [None]:
# 11 x 11 grid of per-feature density plots; each panel overlays the
# target == 0 and target == 1 subsets for column f<panel index>.
fig, axes = plt.subplots(11, 11, figsize=(13, 13))
axes = axes.flatten()

for n, ax in enumerate(axes):
    feature = f'f{n}'
    for class_frame in (train_0, train_1):
        sns.kdeplot(data=class_frame, x=feature, ax=ax, fill=True)
    # Strip ticks, y label and frame so the panels stay readable at this size.
    ax.set(xticks=[], yticks=[], ylabel='')
    ax.spines[:].set_visible(False)

fig.tight_layout()
plt.show()

In [None]:
# Same 11 x 11 density grid as for the training subsets, drawn from the
# small per-class plotting samples (eda_0 / eda_1).
fig, axes = plt.subplots(11, 11, figsize=(13, 13))
axes = axes.flatten()

for n in range(len(axes)):
    ax = axes[n]
    column_name = f'f{n}'
    sns.kdeplot(data=eda_0, x=column_name, ax=ax, fill=True)
    sns.kdeplot(data=eda_1, x=column_name, ax=ax, fill=True)
    # Remove ticks, y label and spines from every panel.
    ax.set(xticks=[], yticks=[], ylabel='')
    ax.spines[:].set_visible(False)

fig.tight_layout()
plt.show()

In [None]:
# Collect every column of `train` that holds at most two distinct values
# and print its name, the number of distinct values and the values themselves.
cat_columns = []
print('column, len, values')
for i in train.columns.values:
    distinct_values = set(train[i].values)  # built once, reused for test and report
    if len(distinct_values) > 2:
        continue
    cat_columns.append(i)
    print(i, len(distinct_values), distinct_values)

In [None]:
# Grid of count plots, one panel per column in `cat_columns`, five panels per row.
# The row count is rounded UP so no column is silently dropped when the number
# of columns is not a multiple of five, and is at least 1 so plt.subplots()
# never receives zero rows.
n_cols = 5
n_rows = max(1, math.ceil(len(cat_columns) / n_cols))
fig, axes = plt.subplots(n_rows, n_cols, figsize=(12, 12), squeeze=False)
axes = axes.flatten()

for ax, column in zip(axes, cat_columns):
    # Both class subsets are drawn onto the same axes.
    sns.countplot(data=train_0, x=column, ax=ax)
    sns.countplot(data=train_1, x=column, ax=ax)
    ax.set_yticks([])
    ax.set_ylabel('')
    ax.spines[:].set_visible(False)

# Hide the panels left over in the last row.
for ax in axes[len(cat_columns):]:
    ax.set_visible(False)

fig.tight_layout()
plt.show()

## **Simple Random Forest**

In [None]:
# Baseline random forest.
# * max_features='sqrt' replaces 'auto': for classifiers 'auto' meant sqrt(n_features),
#   and the 'auto' spelling was deprecated and later removed from scikit-learn.
# * random_state is fixed so fitted trees, importances and predictions are reproducible.
model = RandomForestClassifier(n_estimators=100, criterion='gini', max_depth=6,
                               min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0,
                               max_features='sqrt', max_leaf_nodes=None, min_impurity_decrease=0.0,
                               bootstrap=True, oob_score=False, n_jobs=None,
                               random_state=42, verbose=0, warm_start=False,
                               class_weight=None, ccp_alpha=0.0, max_samples=None)

In [None]:
# Fit the baseline forest on the first 80% of rows (train_X / train_y).
model.fit(train_X, train_y)

In [None]:
# Hyper-parameter grid explored by the 2-fold cross-validated search below.
params = {
    'n_estimators': [50, 100],
    'max_depth': [3, 6, 12],
    'min_samples_leaf': [9, 12, 16],
    'min_samples_split': [8, 16, 24],
}

# Exhaustive search over `params`, using all available cores.
grid_cv = GridSearchCV(estimator=model, param_grid=params, cv=2, n_jobs=-1)
grid_cv.fit(train_X, train_y)

In [None]:
# Summarise the grid search: best CV score, the parameters that achieved it,
# and the corresponding refitted estimator.
grid_report = (
    ('Best Score : ', grid_cv.best_score_),
    ('Best Parameter : ', grid_cv.best_params_),
    ('Best Estimator : ', grid_cv.best_estimator_),
)
for label, value in grid_report:
    print(label, value)

In [None]:
# Feature importances of the fitted baseline `model`, plus the column
# positions ordered from most to least important.
importances = model.feature_importances_
ascending_order = np.argsort(importances)
indices = ascending_order[::-1]

In [None]:
# Print every feature with its rank, name (left-aligned, width 30) and importance,
# most important first.
for rank, column_pos in enumerate(indices[:train_X.shape[1]], start=1):
    feature_name = train_X.columns[column_pos]
    print("%2d) %-*s %f" % (rank, 30, feature_name, importances[column_pos]))

In [None]:
# Keep only the features whose importance in the already-fitted `model`
# reaches the 0.001 threshold (prefit=True: the selector does not refit).
sfm = SelectFromModel(model, threshold=0.001, prefit=True)
X_selected = sfm.transform(train_X)
# shape[1] is the number of retained columns, i.e. FEATURES (the previous
# message mislabelled this count as "Samples").
print("Features over the threshold:", X_selected.shape[1])

In [None]:
# Print rank, name and importance for as many top-ranked features as
# there are columns in X_selected.
selected_count = X_selected.shape[1]
for rank, column_pos in enumerate(indices[:selected_count], start=1):
    print("%2d) %-*s %f" % (rank, 30, train_X.columns[column_pos], importances[column_pos]))

In [None]:
# Bar chart of the importances of the selected features, most important first.
selected_count = X_selected.shape[1]
top_indices = indices[:selected_count]
bar_positions = range(selected_count)

plt.title('Feature Importance')
plt.bar(bar_positions, importances[top_indices], align='center')
plt.xticks(bar_positions, train_X.columns[top_indices], rotation=90)
plt.xlim([-1, 35])  # x-axis window is fixed to positions -1..35
plt.tight_layout()
plt.show()

In [None]:
# Predict labels for the held-out rows with the baseline `model`
# (not with the grid-search estimator).
pred_y = model.predict(test_X)

In [None]:
# Accuracy of the baseline predictions on the held-out rows.
print('First Accuracy :', accuracy_score(test_y, pred_y))

In [None]:
# Predict on the submission frame using exactly the columns (and column order)
# the model was fitted on.  Selecting by name instead of `iloc[:, 1:]` does not
# depend on `id` being the first column and fails loudly (KeyError) if a
# training feature is missing from `sub`.
pred_y2 = model.predict(sub[train_X.columns])

In [None]:
# Two-column submission table: the ids from `sub` next to the predicted targets.
submission_columns = {"id": sub["id"], "target": pred_y2}
submission = pd.DataFrame(submission_columns)

In [None]:
# Write the submission table to submission.csv in the working directory, without the row index.
submission.to_csv("submission.csv", index=False)