In [None]:
import pandas as pd
import numpy as np 
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
# Suppress every warning raised from this point on.
# NOTE(review): this also hides deprecation and input-validation warnings from
# pandas / seaborn / scikit-learn — consider narrowing the filter.
warnings.filterwarnings('ignore')

In [None]:
# Read the training and test CSV files into DataFrames in one pass.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/tabular-playground-series-mar-2021/train.csv',
        '../input/tabular-playground-series-mar-2021/test.csv',
    )
)

### **Data Analysis**

In [None]:
# Show (rows, columns) for the training and test frames side by side.
train_df.shape, test_df.shape

In [None]:
# Preview the first rows of the training frame.
train_df.head()

In [None]:
# Count training rows that exactly repeat an earlier row.
train_df.duplicated().sum()

In [None]:
# Print column names, dtypes and non-null counts for the training frame.
train_df.info()

In [None]:
# Preview the first rows of the test frame.
test_df.head()

In [None]:
# Count test rows that exactly repeat an earlier row.
test_df.duplicated().sum()

In [None]:
# Print column names, dtypes and non-null counts for the test frame.
test_df.info()

### **Exploratory Data Analysis**

In [None]:
# Bar chart of how many training rows fall into each value of `target`.
fig, ax = plt.subplots(figsize=(10, 5))
sns.countplot(data=train_df, x="target", palette="magma", ax=ax)
plt.show()

**Let's collate the numerical columns and look at their distributions**

In [None]:
# Select the eleven continuous columns, cont0 through cont10.
num_features = train_df[[f'cont{idx}' for idx in range(11)]]

In [None]:
# Preview the first rows of the continuous-feature subset.
num_features.head()

In [None]:
# One histogram per continuous feature, drawn on a single 11x11-inch figure.
num_features.hist(figsize=(11, 11), color="#008080")
# Render explicitly, like the other plotting cells, so the cell output is the
# figure only and not the repr of the returned array of Axes.
plt.show()

In [None]:
# Annotated heatmap of the pairwise correlations between the continuous features.
corr_matrix = num_features.corr()
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap='magma', ax=ax)
ax.set_title('Numerical Features Correlation')
plt.show()

**Let's collate the categorical columns and look at their distributions**

In [None]:
# Select the nineteen categorical columns, cat0 through cat18.
cat_features = train_df[[f'cat{idx}' for idx in range(19)]]

In [None]:
# Preview the first rows of the categorical-feature subset.
cat_features.head()

### **Model Building**

In [None]:
# Draw one bar chart per categorical column showing how often each level occurs.
for col_name in cat_features.columns:
    fig, ax = plt.subplots(figsize=(14, 4))
    train_df[col_name].value_counts().plot.bar(color="#008080", ax=ax)
    ax.set_title(col_name)
    plt.show()

In [None]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode every object-dtype column of the training frame, in place.
# Each encoder is fitted on the train and test values together so that both
# frames share one mapping and a level seen only in the test file still gets
# a code.
object_columns = [name for name in train_df.columns if train_df[name].dtype == 'object']
for name in object_columns:
    encoder = LabelEncoder()
    combined_values = list(train_df[name].values) + list(test_df[name].values)
    encoder.fit(combined_values)
    train_df[name] = encoder.transform(train_df[name].values)
    test_df[name] = encoder.transform(test_df[name].values)

In [None]:
# Separate the label column from the predictors.
# NOTE(review): every non-target column stays in `x`, including any
# row-identifier column — confirm that is intended.
y = train_df['target']
x = train_df.drop(columns=['target'])

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise the predictors. `sc` keeps the fitted per-column statistics so
# the same transformation can be applied to other data later.
sc = StandardScaler().fit(x)
x_scaled = sc.transform(x)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the scaled training rows for validation.
# The split is seeded (matching the model's random_state) so the validation
# score is identical on every Restart & Run All instead of drifting per run.
x_train, x_test, y_train, y_test = train_test_split(
    x_scaled, y, test_size=0.30, random_state=1
)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Fit a 200-tree forest with depth capped at 9 on the training split, then
# predict hard class labels for the held-out rows.
rf = RandomForestClassifier(
    n_estimators=200,
    max_depth=9,
    random_state=1,
).fit(x_train, y_train)
predictions = rf.predict(x_test)

In [None]:
from sklearn.metrics import roc_auc_score

# ROC AUC measures how well the model *ranks* samples, so it must be given a
# continuous score. Feeding it the thresholded 0/1 labels from `predict`
# collapses the curve to a single operating point and misreports the metric.
# Column 1 of `predict_proba` is the probability of the positive class.
val_proba = rf.predict_proba(x_test)[:, 1]
score = roc_auc_score(y_test, val_proba)
score

In [None]:
# Refit on the full scaled training set to produce the final model.
# NOTE(review): max_depth=7 here differs from the max_depth=9 model that was
# validated earlier — confirm which setting is intended for the submission.
rf1 = RandomForestClassifier(n_estimators=200, max_depth=7, random_state=1)
rf1.fit(x_scaled, y)
# The forest was trained on StandardScaler output, so the test features have
# to go through the same fitted scaler; predicting on the raw frame would
# compare unscaled values against split thresholds learned on scaled ones.
predictions1 = rf1.predict(sc.transform(test_df))

In [None]:
# Load the sample submission file, using its 'id' column as the index.
submission = pd.read_csv('../input/tabular-playground-series-mar-2021/sample_submission.csv', index_col='id')
# Preview the first rows of the submission frame.
submission.head()

In [None]:
# Score the test set with the positive-class probability. The features are
# passed through the fitted scaler first because `rf1` was trained on scaled
# inputs; predicting on the raw frame would silently produce wrong scores.
# NOTE(review): the assignment is positional — it assumes the rows of
# `test_df` are in the same order as the ids in the sample submission.
submission['target'] = rf1.predict_proba(sc.transform(test_df))[:, 1]
submission.to_csv('random_forest.csv')