In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file below the Kaggle input directory,
# then print one path per line.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Loading Libraries

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 100)
import scipy as sp
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
color = sns.color_palette()
import datetime

import plotly.express as px
import plotly.graph_objs as go

from sklearn.metrics import log_loss
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler, RobustScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

# Sneak Peek into the Dataset

In [None]:
# Read the competition's train and test CSV files from the Kaggle input directory.
train = pd.read_csv('/kaggle/input/tabular-playground-series-jun-2021/train.csv')
test = pd.read_csv('/kaggle/input/tabular-playground-series-jun-2021/test.csv')

In [None]:
# Display the first rows of the training frame.
train.head()

## Handy Functions

In [None]:
def resumetable(df):
    """Build a one-row-per-column summary of a DataFrame.

    Prints the shape of ``df`` and returns a new DataFrame with these columns:

    - ``Name``: column name
    - ``dtypes``: column dtype
    - ``Missing``: number of null entries
    - ``Uniques``: number of distinct non-null values
    - ``First Value`` / ``Second Value`` / ``Third Value``: the entries found
      in the first three rows of ``df``

    The sample rows are selected by position, so the function works for any
    index (filtered, shuffled, non-integer). When ``df`` has fewer than three
    rows, the sample columns that cannot be filled contain NaN.
    """
    print(f"Dataset Shape: {df.shape}")

    summary = pd.DataFrame({
        'Name': list(df.columns),
        'dtypes': df.dtypes.values,
        'Missing': df.isnull().sum().values,
        'Uniques': df.nunique().values,
    })

    for position, label in enumerate(['First Value', 'Second Value', 'Third Value']):
        # Positional lookup (iloc): the index labels 0/1/2 are not guaranteed to exist.
        summary[label] = df.iloc[position].values if position < len(df) else np.nan

    return summary


## Function to compute outliers for normally distributed column
def CalcOutliers(df_num):
    """Print outlier counts for a numeric column using the 3-sigma rule.

    A value is an outlier when it lies more than three standard deviations
    below or above the mean of ``df_num``.

    Parameters
    ----------
    df_num : array-like of numbers
        For example a pandas Series or a list.

    Prints the number of lower outliers, upper outliers, total outliers,
    non-outliers, and the share of outliers among all counted observations
    (in percent, rounded to 4 decimals). Returns ``None``.
    """
    # Mean and standard deviation of the input.
    data_mean, data_std = np.mean(df_num), np.std(df_num)

    # Distance from the mean beyond which a value counts as an outlier.
    # Change the multiplier to make the rule stricter or looser.
    cut = data_std * 3
    lower, upper = data_mean - cut, data_mean + cut

    values = np.asarray(df_num)
    n_lower = int(np.count_nonzero(values < lower))
    n_upper = int(np.count_nonzero(values > upper))
    n_outliers = n_lower + n_upper

    # Inclusive bounds: a value sitting exactly on a cut line is a regular
    # observation, so outliers + non-outliers add up to all comparable values.
    n_kept = int(np.count_nonzero((values >= lower) & (values <= upper)))

    # Share of outliers among ALL counted observations; guarded so empty
    # input reports 0.0 instead of dividing by zero.
    n_total = n_outliers + n_kept
    outlier_pct = round(100.0 * n_outliers / n_total, 4) if n_total else 0.0

    print('Identified lowest outliers: %d' % n_lower)
    print('Identified upper outliers: %d' % n_upper)
    print('Total outlier observations: %d' % n_outliers)
    print('Non-outlier observations: %d' % n_kept)
    print("Total percentual of Outliers: ", outlier_pct)

    return

In [None]:
# Per-column summary (dtype, missing count, unique count, sample values) of the training data.
resumetable(train)

In [None]:
# Per-column summary (dtype, missing count, unique count, sample values) of the test data.
resumetable(test)

## Unique Values Counts

In [None]:
# Column summaries for both splits (each call also prints the frame's shape).
t1 = resumetable(train)
t2 = resumetable(test)

# Feature columns are the ones whose name starts with 'feature'.
feature_cols = [f for f in train.columns if f.startswith('feature')]
t1_ = t1.loc[t1['Name'].isin(feature_cols), ['Name', 'Uniques']].rename({'Uniques': 'Train_Uniques'}, axis=1)
t2_ = t2.loc[t2['Name'].isin(feature_cols), ['Name', 'Uniques']].rename({'Uniques': 'Test_Uniques'}, axis=1)

# Join on the feature name instead of on row position, so the counts stay
# paired correctly even if train and test list their columns in a different order.
t_concat = t1_.merge(t2_, on='Name', how='left')

# Grouped bars: unique-value count of each feature in train vs. test.
fig = go.Figure()
fig.add_trace(go.Bar(x=t_concat['Name'], y=t_concat['Train_Uniques'],
                marker_color='crimson',
                name='Train Unique Count'))
fig.add_trace(go.Bar(x=t_concat['Name'], y=t_concat['Test_Uniques'],
                marker_color='lightslategrey',
                name='Test Unique Count'
                ))

fig.update_layout(title='Unique Count across Dataset')
fig.show()

Key Observation:
- A few features have a different number of unique values in the train and test datasets. Let's take a closer look at those features:

In [None]:
# Train-minus-test unique count for every feature.
t_concat['Difference'] = t_concat['Train_Uniques'].sub(t_concat['Test_Uniques'])

# Keep only the features whose counts disagree, ordered by the size of the gap.
mismatched = t_concat.loc[t_concat['Difference'].ne(0)].sort_values(by='Difference')
mismatched.plot(x='Name', y='Difference', kind='bar')

plt.title('Count Difference between Train and Test Dataset', fontsize='16')
plt.xlabel('Feature Names')
plt.ylabel('Count')
plt.tight_layout()

Key Observation:
- Six features have a different number of unique values in the train and test datasets.

In [None]:
# Features whose unique-value counts differ between train and test.
diff_features = ['feature_15', 'feature_46', 'feature_73', 'feature_28', 'feature_59', 'feature_60']

# For each of them: the values that occur in train but never in test.
train_only = {
    col: np.setdiff1d(train[col].values, test[col].values)
    for col in diff_features
}
f15, f46, f73, f28, f59, f60 = (train_only[col] for col in diff_features)

for col, values in train_only.items():
    print(f"{col} : There are {len(values)} different values in Train dataset and these values are : {values}")

### Most Common Unique Values 

In [None]:
# One array per feature column, for each split.
arr_train = [train[name].values for name in feature_cols]
arr_test = [test[name].values for name in feature_cols]

# Pool all feature values of a split into a single sorted 1-D array.
train_array = np.concatenate(arr_train, axis=0)
train_array = np.sort(train_array)
test_array = np.concatenate(arr_test, axis=0)
test_array = np.sort(test_array)

# Distinct pooled values and how often each one occurs.
unique1, counts1 = np.unique(train_array, return_counts=True)
unique2, counts2 = np.unique(test_array, return_counts=True)

# One row per distinct value; the four arrays become the four columns.
df_ = pd.DataFrame([unique1, counts1, unique2, counts2]).T
df_.columns = ["Train Unique Value", "Train Unique Count", "Test Unique Value", "Test Unique Count"]

In [None]:
# Number of rows of df_ to plot. df_ is ordered by value (np.unique returns
# sorted values), so these are the smallest values, not the most frequent ones.
N_VALUES_SHOWN = 20
df_graph = df_.head(N_VALUES_SHOWN)

fig = go.Figure()
# Create and style traces; each trace is plotted against its own split's values
# so the counts stay aligned even when train and test contain different values.
fig.add_trace(go.Scatter(x=df_graph['Train Unique Value'], y=df_graph['Train Unique Count'], name='Train Unique Count',
                         line=dict(color='royalblue', width=4, dash='dashdot')))
fig.add_trace(go.Scatter(x=df_graph['Test Unique Value'], y=df_graph['Test Unique Count'], name='Test Unique Count',
                         line=dict(color='firebrick', width=4,
                              dash='dash')))

# The title states the number of values that are actually plotted.
fig.update_layout(title=f'Unique Count of the {N_VALUES_SHOWN} Smallest Train and Test Dataset Values',
                   xaxis_title='Unique Values in Dataset',
                   yaxis_title='Unique Count')

fig.show()

### Descriptive Analysis

In [None]:
# Transposed summary statistics of the training data (id excluded), with a bar
# on the mean column and colour gradients on std, median and max.
(
    train.drop('id', axis=1)
    .describe()
    .T
    .style
    .bar(subset=['mean'], color='#205ff2')
    .background_gradient(subset=['std'], cmap='Reds')
    .background_gradient(subset=['50%'], cmap='PuRd')
    .background_gradient(subset=['max'], cmap='Greens')
)

In [None]:
# Transposed summary statistics of the test data (id excluded), with a bar
# on the mean column and colour gradients on std, median and max.
(
    test.drop('id', axis=1)
    .describe()
    .T
    .style
    .bar(subset=['mean'], color='#205ff2')
    .background_gradient(subset=['std'], cmap='Reds')
    .background_gradient(subset=['50%'], cmap='PuRd')
    .background_gradient(subset=['max'], cmap='Greens')
)

### Numeric Features Distribution

In [None]:
# Names of all int64 columns of the training data except the row identifier.
int_columns = train.select_dtypes('int64').columns
numeric_features = int_columns.drop('id').to_list()

In [None]:
# Grid layout derived from the number of features instead of a fixed 25x3,
# so the cell keeps working when the feature list changes.
N_PLOT_COLS = 3
n_plot_rows = max(1, int(np.ceil(len(numeric_features) / N_PLOT_COLS)))

# 2.4 inches of height per row reproduces the former 25x60 figure for 25 rows.
fig, axes = plt.subplots(n_plot_rows, N_PLOT_COLS,
                         figsize=(25, 2.4 * n_plot_rows), squeeze=False)
fig.suptitle("Distribution of Numeric Features in Train Dataset", fontsize=20, y=1)

flat_axes = axes.ravel()
for ax, feature in zip(flat_axes, numeric_features):
    # log1p compresses the long right tail before the histogram is drawn.
    sns.histplot(np.log1p(train[feature]), kde=True, ax=ax)

    ax.set_xlabel('Values', size=12, labelpad=15)
    ax.set_ylabel('Count', size=12, labelpad=15)
    ax.set_title(f"{feature.capitalize()}", size=16)

# Hide the cells of the last row that have no feature to show.
for ax in flat_axes[len(numeric_features):]:
    ax.set_visible(False)

fig.tight_layout(h_pad=2)
plt.show()

### Target Variable Distribution

In [None]:
# Share of each class in percent, most frequent class first. The column is
# named explicitly: pandas 2.0 calls the value_counts(normalize=True) result
# 'proportion', so relying on it being called 'target' no longer selects the
# percentage column.
target_count = (
    (train['target'].value_counts(normalize=True) * 100)
    .round(1)
    .rename('percent')
    .to_frame()
)

fig, g = plt.subplots(1, 1, figsize=(15, 7))
fig.suptitle('Target Variable/Classes Distribution', size=22)

# Bars: absolute count per class (left axis), ordered by frequency.
sns.countplot(x='target', data=train, order=target_count.index, ax=g)

# Points: percentage per class on a secondary (right) axis. The axis is passed
# explicitly instead of relying on it being matplotlib's current axes.
gt = g.twinx()
sns.pointplot(x=target_count.index, y='percent', data=target_count,
              order=target_count.index, color='black', ax=gt)
gt.set_ylabel("% of Class Distribution", fontsize=16)

g.set_xlabel('Classes', size=20, labelpad=15)
g.set_ylabel('Count', size=20, labelpad=15)

total = len(train)

# Write each bar's share of all rows just above the bar, centred on it.
for p in g.patches:
    height = p.get_height()
    if not height > 0:
        # Nothing to annotate for an empty (or undefined-height) patch.
        continue
    g.text(p.get_x() + p.get_width() / 2.,
            height + 3,
            '{:1.2f}%'.format((height / total) * 100),
            ha="center", fontsize=14)

fig.tight_layout()
plt.show()

### Bi-Variate Analysis

In [None]:
# Find correlations among feature columns 
corr = train[feature_cols].corr()

# Heatmap of correlations
fig, ax = plt.subplots(figsize=(35,15))
mask = np.zeros_like(corr, dtype=np.bool)
mask[np.triu_indices_from(mask)] = True

sns.heatmap(corr,
         square=True, center=0, linewidth=0.2,
         cmap=sns.diverging_palette(240, 10, as_cmap=True),
         mask=mask, ax=ax) 

ax.set_title('Feature Correlation', loc='left', fontweight='bold')
plt.show()

In [None]:
# Mean of every feature column within each target class.
feature_mask = train.columns.str.startswith("feature")
train_agg = train.groupby('target')[train.columns[feature_mask].tolist()].mean()
train_agg

In [None]:
# For each class: the feature with the highest class mean ...
train_idmax = train_agg.idxmax(axis=1).rename('Feature').reset_index()
# ... and the value of that highest mean (positional index, to line up with train_idmax).
train_max = train_agg.max(axis=1).rename('Mean Value').reset_index(drop=True).to_frame()

# Side-by-side table: target, Feature, Mean Value.
train_combined = pd.concat([train_idmax, train_max], axis=1)
train_combined

In [None]:
# One bar per class showing its highest feature mean, coloured by which feature that is.
# (plotly.express is already imported as px in the notebook's import cell.)
fig = px.bar(
    train_combined,
    x='target',
    y='Mean Value',
    color='Feature',
    hover_data=['Feature', 'Mean Value'],
    height=400,
    title="Top Feature affecting the Target Variable/Classes based on Mean",
)
fig.show()

Let's take a deeper look at the `feature_19` and `feature_54` variables.

## Data Pre-processing

#### Train Dataset

In [None]:
# Work on a copy so the raw training frame stays untouched.
train_model = train.copy()

# Replace the class names by integer codes; the fitted encoder is kept so the
# codes can be mapped back to class names later.
label_encoder = LabelEncoder()
train_model['target'] = label_encoder.fit_transform(train_model['target'])

y = train_model['target']

In [None]:
# Standardize every feature column (zero mean, unit variance) and show the resulting array.
X = StandardScaler().fit_transform(train_model[feature_cols])
X

### PCA

In [None]:
from sklearn.decomposition import PCA

# Fit a 30-component PCA on the standardized features, then project them onto it.
pca = PCA(n_components=30).fit(X)
pca_samples = pca.transform(X)

In [None]:
# Wrap the PCA projection in a DataFrame and preview its first rows.
ps = pd.DataFrame(pca_samples)
ps.head()

## Model Development

In [None]:
# Hold out 20% of the rows for validation. The split is seeded so the reported
# scores are reproducible, and stratified so every class keeps the same share
# in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"Train and Test Shape: \n {[x.shape for x in [X_train, X_test, y_train, y_test]]}")

### Model 1 : Logistic Regression

In [None]:
# Baseline model: L2-regularized logistic regression fitted on the training part.
lr = LogisticRegression(penalty='l2', random_state=42).fit(X_train, y_train)

# Class probabilities on the held-out part, scored with log-loss.
y_pred = lr.predict_proba(X_test)
log_loss_score_lr = log_loss(y_test, y_pred)
print(f"Logistic Regression : {log_loss_score_lr}")

### Model 2 : Random Forest

In [None]:
# Hyperparameter grid
param_grid = {
    "criterion" : ["gini", "entropy"], 
    "min_samples_leaf" : [1, 5, 10], 
    "min_samples_split" : [2, 4, 10, 12, 16], 
    "n_estimators": [50, 100, 150, 200],
    'max_depth': list(np.linspace(5, 30).astype(int)),
    'max_leaf_nodes': list(np.linspace(10, 50, 500).astype(int)),
    'bootstrap': [True, False]
}

# Estimator for use in random search
rf = RandomForestClassifier(max_features='auto', oob_score=True, random_state=1, n_jobs=-1)

In [None]:
# Create the random search model
rs = RandomizedSearchCV(rf, param_grid, n_jobs = -1, 
                        cv = 3, verbose = 1, random_state=1)

In [None]:
# Run the randomized search on the training part, then get class probabilities
# for the held-out part from the fitted search object.
y_pred = rs.fit(X_train, y_train).predict_proba(X_test)

# Hold-out log-loss of the random forest.
log_loss_score_rs = log_loss(y_test, y_pred)
print(f"RandomForestClassifier : {log_loss_score_rs}")

## Predict Test Dataset

In [None]:
test_model = test.drop('id', axis=1)

# Scale the test features with the TRAINING statistics. The model was trained on
# train_model[feature_cols] standardized by its own mean/std, so the test rows
# must go through the same transformation; fitting a fresh scaler on the test
# data would shift and rescale them differently.
feature_scaler = StandardScaler().fit(train_model[feature_cols])

# NOTE: this reuses the name X_test, replacing the validation split created earlier.
X_test = feature_scaler.transform(test_model[feature_cols])
X_test

In [None]:
# Class probabilities for the test rows from the fitted random search.
y_pred_test_dataset = rs.predict_proba(X_test)
y_pred_test_dataset

In [None]:
# Wrap the probability array in a DataFrame (columns are positional at this point).
test_pred=pd.DataFrame(y_pred_test_dataset)
test_pred

In [None]:
# Name each probability column after its class. The encoded labels come from the
# fitted model (rs.classes_ gives the column order of predict_proba) rather than
# from the frame's current column labels, so re-running this cell is harmless.
test_pred.columns = label_encoder.inverse_transform(rs.classes_)
test_pred

In [None]:
# Submission table: the test ids followed by one probability column per class
# (rows are matched on the shared index).
final_submiss = test[['id']].join(test_pred)
final_submiss

### Final Submission

In [None]:
# Write the submission table to result.csv in the working directory, without the index column.
final_submiss.to_csv("result.csv",index=False)

### Please Upvote if you like it !