## Importing Libraries

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
pd.set_option('display.max_columns',50)

import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

from sklearn import metrics



# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Reading and Inspecting Data

In [None]:
# Load the training file from the competition's input directory.
# header=0 takes the first row of the file as the column names.
df_train = pd.read_csv(
    "/kaggle/input/tabular-playground-series-mar-2021/train.csv",
    sep=",",
    encoding="ISO-8859-1",
    header=0,
)
df_train.head()

In [None]:
# Load the competition test file with the same parsing options as the training file.
df_test = pd.read_csv(
    "/kaggle/input/tabular-playground-series-mar-2021/test.csv",
    sep=",",
    encoding="ISO-8859-1",
    header=0,
)
df_test.head()

In [None]:
# Print the training frame's column names, dtypes and non-null counts.
df_train.info()

In [None]:
# Descriptive statistics of the training frame.
df_train.describe()

## EDA and Data Preparation

In [None]:
# Group the feature columns by name prefix: "cont*" and "cat*".
contVars = [name for name in df_train.columns if name.startswith('cont')]
catVars = [name for name in df_train.columns if name.startswith('cat')]

# Each heading is followed by its list on the next line.
print("contVars : ", contVars, sep="\n")
print("------------------\ncatVars : ", catVars, sep="\n")

In [None]:
# Box plots of all "cont*" columns on one shared axis.
fig, ax = plt.subplots(figsize=(15, 7))
df_train[contVars].boxplot(ax=ax)
plt.show()

In [None]:
# One horizontal bar chart per "cat*" column, showing the relative frequency of each level.
for cat_col in catVars:
    level_share = df_train[cat_col].value_counts(normalize=True)
    level_share.plot.barh(figsize=(15, 5), title=cat_col)
    plt.show()
    

In [None]:
# Relative frequency of each target class, drawn as a bar chart.
target_share = df_train["target"].value_counts(normalize=True)
target_share.plot.bar(figsize=(10, 5), title="Class Imbalance")

### Data Encoding

In [None]:
# Label-encode every categorical column in both frames.
# The single encoder is re-fitted for each column on the combined train and test
# values, so a level that occurs in only one of the two files still gets a code
# and `transform` does not fail on an unseen label.
le = LabelEncoder()

for col in catVars:
    # pd.concat replaces Series.append, which was deprecated in pandas 1.4 and
    # removed in pandas 2.0 (calling it there raises AttributeError).
    le.fit(pd.concat([df_train[col], df_test[col]], ignore_index=True))
    df_train[col] = le.transform(df_train[col])
    df_test[col] = le.transform(df_test[col])

In [None]:
# Preview the first rows of the training frame.
df_train.head()

### Train-Test Split

In [None]:
# Feature matrix: every column except the "id" and "target" columns.
X = df_train.drop(columns=["id", "target"])
# Label vector.
y = df_train["target"]

In [None]:
# 70/30 split of the labelled rows; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.7,
    test_size=0.3,
    random_state=42,
)

### Feature Scaling

In [None]:
# Fit the min-max scaler on the training split only; the other frames are later
# transformed with these fitted parameters.
scaler = MinMaxScaler()

cols = X_train.columns

# Build a fresh frame from the scaled array instead of assigning columns into
# the object returned by train_test_split. That object is a row selection of
# `X`, and column assignment on it can raise pandas' SettingWithCopyWarning.
# Index and column labels are carried over unchanged.
X_train = pd.DataFrame(
    scaler.fit_transform(X_train[cols]),
    columns=cols,
    index=X_train.index,
)

X_train.head()

In [None]:
# Scale the hold-out split with the scaler fitted earlier (transform only, no re-fit).
# A new frame is built rather than assigning into the train_test_split result,
# which is a row selection of `X` and can raise SettingWithCopyWarning on
# column assignment. Index and column labels are carried over unchanged.
X_test = pd.DataFrame(
    scaler.transform(X_test[cols]),
    columns=cols,
    index=X_test.index,
)
X_test.head()

Applying the already-fitted scaler to the competition test dataset (the scaler is only used to transform here; it is not re-fitted)

In [None]:
# Transform the feature columns of the competition test frame with the existing
# scaler; columns not listed in `cols` are left untouched.
scaled_test_values = scaler.transform(df_test[cols])
df_test[cols] = scaled_test_values
df_test.head()

## Create Model using Random Forest

In [None]:
# Hyper-parameters below were found with GridSearchCV on a local machine.
rf_params = dict(
    class_weight='balanced',
    criterion='entropy',
    max_depth=25,
    min_samples_leaf=75,
    n_estimators=150,
    random_state=42,
)
rf = RandomForestClassifier(**rf_params)

# Train on the scaled training split.
rf.fit(X_train, y_train)

In [None]:
# Checking ROC AUC Score for X_test.
# ROC AUC ranks samples by a continuous score, so it is given the predicted
# probability of the positive class (column 1 of predict_proba, i.e.
# rf.classes_[1]) rather than the hard labels from rf.predict. Hard labels
# reduce the ROC curve to a single threshold and misstate the AUC. This also
# matches the submission, which is built from predict_proba[:, 1].
metrics.roc_auc_score(y_true = y_test, y_score = rf.predict_proba(X_test)[:, 1])

### Predicting Test Dataset

In [None]:
# Class probabilities for the competition test frame; the "id" column is dropped
# before the frame is passed to the model.
test_features = df_test.drop(columns="id")
pred_proba = rf.predict_proba(test_features)

# Overwrite the sample submission's "target" column with the second probability column.
submission = pd.read_csv("/kaggle/input/tabular-playground-series-mar-2021/sample_submission.csv")
submission["target"] = pred_proba[:, 1]
submission.head()

In [None]:
# Write the submission frame to "submission.csv" without the index column.
submission.to_csv("submission.csv", index=False)