In [None]:
import numpy as np 
import pandas as pd 

import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(root, name)
    for root, _dirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import missingno as msno
from scipy.stats import skew
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score,roc_curve
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from functools import partial
import optuna

In [None]:
# Read the training split into `df` and preview the first rows.
train_csv='../input/tabular-playground-series-mar-2021/train.csv'
df=pd.read_csv(train_csv)
df.head()

### Checking for null values
* missingno is a library which helps in visualizing missing values in our dataset

In [None]:
# Draw missingno's nullity matrix for the training frame to spot missing values.
msno.matrix(df)

### From the matrix above we infer that no column has any missing value

## **Exploring Categorical Columns**

### Checking for number of unique values per categorical column

In [None]:
# Sub-frame holding only the object-dtype (categorical) columns.
# Iterating over it yields the column names.
cat_cols=df.select_dtypes(include='object')

print('Number of unique values for each categorical feature \n')
for name,n_unique in cat_cols.nunique().items():
    print(f'{name} : {n_unique}')

### Cat10 has the maximum number of unique values followed by cat5. Let's plot a pie chart illustrating the top 3 maximum value counts in these categorical features

In [None]:
# Shared styling for both pies: three slightly separated wedges coloured from
# evenly spaced points of the 'Paired' colormap.
explode=[0.1]*3
cmap=plt.get_cmap('Paired')
colors=list(map(cmap,np.linspace(0,1,3)))

cols=['cat5','cat10']
fig=plt.figure(figsize=(15,15))
fig.set_size_inches(12,11)
for position,col in enumerate(cols,start=1):
    ax=fig.add_subplot(1,2,position)
    # Only the three most frequent categories of the column are drawn.
    top_three=df[col].value_counts().head(3)
    pie=top_three.plot.pie(ax=ax,
                           shadow=True,
                           autopct='%1.1f%%',
                           explode=explode,
                           pctdistance=0.5,
                           colors=colors,
                           textprops={'fontsize':14})
    plt.tight_layout()

### From above plots we infer that :
*  For cat5, BI has the highest value count.
* For cat10, however, the top three values have almost equal value counts.

## **Exploring Numerical Columns**

### Histogram and box plot below show distributions of all numerical columns.

In [None]:
# Sub-frame of the float64 (continuous) columns; iterating it yields column names.
num_cols=df.select_dtypes(['float64'])

# Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*' and
# later releases dropped the old names, so prefer the new name when it exists
# and fall back to the legacy one on older installations.
whitegrid='seaborn-v0_8-whitegrid'
plt.style.use(whitegrid if whitegrid in plt.style.available else 'seaborn-whitegrid')

# One figure per numeric column: a slim box plot stacked on top of a histogram,
# both sharing the same x axis.
for col in num_cols:
    fig,ax=plt.subplots(2,1,sharex=True,
                       gridspec_kw={'height_ratios':(0.25,0.75)})
    fig.set_size_inches(7,6)
    sns.boxplot(x=col,data=df,ax=ax[0])
    sns.histplot(x=col,data=df,ax=ax[1])
    ax[0].set_xlabel(col,fontsize=14)
    ax[1].set_xlabel(col,fontsize=14)
    ax[1].set_ylabel('Count',fontsize=14)
    # The box plot has no meaningful y axis, so hide its ticks and left spine.
    ax[0].set_yticks([])
    sns.despine(ax=ax[1])
    sns.despine(ax=ax[0],left=True)

### The heatmap below shows the pairwise correlation between the features

In [None]:
# Drop the id column since it won't be of much help in finding the correlation,
# and keep only numeric columns: at this point the categorical columns still
# hold strings, which recent pandas versions refuse to correlate.
corr=df.drop('id',axis=1).select_dtypes('number').corr(method='pearson')

fig,ax=plt.subplots(figsize=(10,8))
sns.heatmap(corr,annot=True,fmt='0.1f',
            robust=True,cmap='coolwarm',ax=ax)

# Style the title and tick labels after the heatmap has created its ticks so
# the settings apply to the labels that are actually drawn.
ax.set_title('Correlation Analysis',fontsize=16)
ax.tick_params(axis='x',labelrotation=90,labelsize=14)
ax.tick_params(axis='y',labelsize=14)

### cont7 has a strong positive correlation with both cont10 and cont0. In addition, cont1 and cont2 have a correlation of 0.86. Below are the scatterplots for these feature pairs.

In [None]:
# Feature pairs to plot: col1[i] on the x axis against col2[i] on the y axis.
col1=['cont0','cont7','cont10','cont1']
col2=['cont10','cont0','cont7','cont2']

# Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*' and
# later releases dropped the old names; use whichever name is available.
# The style is applied before the figure is created so it affects the whole figure.
darkgrid='seaborn-v0_8-darkgrid'
plt.style.use(darkgrid if darkgrid in plt.style.available else 'seaborn-darkgrid')

fig=plt.figure(figsize=(10,12))
for i,(x_col,y_col) in enumerate(zip(col1,col2)):
    # Four plots fill a 2x2 grid exactly (a 4x2 grid would leave half the figure blank).
    ax=fig.add_subplot(2,2,i+1)
    sns.scatterplot(x=x_col,y=y_col,data=df,
                    alpha=0.1,edgecolor='none',ax=ax)
fig.tight_layout()

### Checking skewness of numerical columns

In [None]:
# Report the sample skewness (rounded to 3 decimals) of each numeric column.
for name in num_cols.columns:
    skewness=np.round(skew(df[name]),3)
    print(name)
    print('Skewness :',skewness)

In [None]:
# Replace the selected columns with their natural logarithm, in place on `df`.
# NOTE(review): np.log yields NaN for negative inputs and -inf for zero -
# confirm these columns are strictly positive. Re-running this cell applies
# the logarithm a second time.
cols=['cont8','cont9','cont10']
df[cols]=np.log(df[cols])

### Below we check the target feature and plot a count plot. We observe that class 0 has a greater count than that of class 1

In [None]:
# Bar chart of how many rows fall into each target class.
fig,ax=plt.subplots(figsize=(6,6))
sns.countplot(data=df,x='target',ax=ax)
ax.set_xlabel('target',fontsize=14)
ax.set_ylabel('Count',fontsize=14)

In [None]:
# Integer-encode every categorical column of `df` in place. The single encoder
# is refit for each column, so afterwards it only remembers the last column.
le=LabelEncoder()
for name in cat_cols.columns:
    encoded=le.fit_transform(df[name])
    df[name]=encoded

In [None]:
# Feature matrix (everything except the id and the label) and label vector,
# both as plain numpy arrays.
X=df.drop(columns=['id','target']).to_numpy()
y=df['target'].to_numpy()

### Hyperparameter Optimization using Optuna
* Below we train a LGBMClassifier on each split of a 5-fold StratifiedKFold and score it on the held-out fold.
* The objective returns the negated mean of the 5 fold AUC scores, so that minimizing the objective maximizes the AUC.

In [None]:
def optimize(trial,x,y):
    """Optuna objective for tuning an LGBMClassifier with 5-fold stratified CV.

    Parameters
    ----------
    trial : optuna trial object used to sample the hyperparameters.
    x : feature array, indexed by row with integer index arrays.
    y : label array aligned with ``x``.

    Returns
    -------
    float
        The negated mean ROC AUC over the 5 folds; it is negated so that a
        study created with ``direction='minimize'`` maximizes the AUC.
    """
    # `suggest_float` replaces the deprecated `suggest_uniform`; it samples
    # uniformly from the same [low, high] range.
    params={
        'num_iterations':trial.suggest_int('num_iterations',100,1500),
        'max_depth':trial.suggest_int('max_depth',3,15),
        'num_leaves':trial.suggest_int('num_leaves',10,100),
        'min_data_in_leaf':trial.suggest_int('min_data_in_leaf',1,100),
        'min_sum_hessian_in_leaf':trial.suggest_int('min_sum_hessian_in_leaf',1,200),
        'feature_fraction':trial.suggest_float('feature_fraction',1e-5,1.0),
        'bagging_fraction':trial.suggest_float('bagging_fraction',1e-5,1.0),
        'bagging_freq':trial.suggest_int('bagging_freq',1,10),
        'lambda_l1':trial.suggest_float('lambda_l1',1e-5,5.0),
        'lambda_l2':trial.suggest_float('lambda_l2',1e-5,10),
    }
    model=LGBMClassifier(**params)

    kf=StratifiedKFold(n_splits=5)
    fold_aucs=[]
    for train_idx,test_idx in kf.split(X=x,y=y):
        # Each call to fit retrains the model from scratch on this fold's training rows.
        model.fit(x[train_idx],y[train_idx])
        # Score with the probability of the positive class.
        preds=model.predict_proba(x[test_idx])[:,1]
        fold_aucs.append(roc_auc_score(y[test_idx],preds))

    return -1*np.mean(fold_aucs)

In [None]:
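# The Optuna search is left disabled; uncomment the lines below to run it.
# NOTE(review): the fixed hyperparameters in the next cell presumably come from
# an earlier run of this study - confirm.
#optimization_function=partial(optimize,x=X,y=y)
#study=optuna.create_study(direction='minimize')

#study.optimize(optimization_function,n_trials=10)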

In [None]:
# Fixed seed so the hold-out split and the bagged model are reproducible
# across kernel restarts.
SEED=42

# Final model with fixed hyperparameters.
lgbm=LGBMClassifier(
     num_iterations=1091,
     max_depth=13,
     num_leaves=22,
     min_data_in_leaf=82,
     min_sum_hessian_in_leaf=42,
     feature_fraction=0.1631559284100434,
     bagging_fraction=0.38583663547224584,
     bagging_freq=7,
     lambda_l1=0.054607760008535275,
     lambda_l2=0.4441933265076425,
     random_state=SEED,
     )

# Hold out 30% of the rows for validation, keeping the class ratio in both parts.
X_train,X_val,y_train,y_val=train_test_split(X,y,test_size=0.3,stratify=y,
                                             random_state=SEED)
lgbm.fit(X_train,y_train)


In [None]:
# Predicted probability of the positive class on the validation split.
lgbm_probs=lgbm.predict_proba(X_val)[:,1]
# "No skill" baseline: the same constant score for every validation row.
n_probs=[0]*len(y_val)

ns_auc=roc_auc_score(y_val,n_probs)
lgbm_auc=roc_auc_score(y_val,lgbm_probs)
print('ROC AUC:%.3f' %(lgbm_auc))

ns_fpr,ns_tpr,_=roc_curve(y_val,n_probs)
lgbm_fpr,lgbm_tpr,_=roc_curve(y_val,lgbm_probs)

# Plotting the roc curve for probability prediction.
# Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*' and
# later releases dropped the old names; use whichever name is available.
whitegrid='seaborn-v0_8-whitegrid'
plt.style.use(whitegrid if whitegrid in plt.style.available else 'seaborn-whitegrid')
plt.figure(figsize=(9,7))
plt.plot(ns_fpr,ns_tpr,linestyle='--',label='No skill')
plt.plot(lgbm_fpr,lgbm_tpr,'g-',linewidth=2.3,label='positive outcome')
plt.xlabel('False Positive Rate',fontsize=14)
plt.ylabel('True positive rate',fontsize=14)
plt.legend()

In [None]:
# Feature-only view of the training frame (id and label removed); `drop`
# returns a new frame, so `df` itself is left untouched.
df_=df.drop(columns=['id','target'])

### Feature Importance Analysis

In [None]:
# One row per feature, labelled with the feature name and sorted so the most
# important feature comes first.
x=pd.DataFrame({'Feature Importance':lgbm.feature_importances_},
               index=df_.columns)
x=x.sort_values(by='Feature Importance',ascending=False)

plt.style.use('default')
plt.figure(figsize=(7,5))
sns.barplot(x='Feature Importance',y=x.index,data=x)
plt.xlabel('Feature Importance',fontsize=14)
plt.ylabel('Feature',fontsize=14)
# Title spelling corrected (was 'Imporatnce').
plt.title('Feature Importance Analysis',fontweight='bold',fontsize=10)
# Assigning the return values keeps the tick objects out of the cell output.
yticks=plt.yticks(fontsize=8)
xticks=plt.xticks(fontsize=8)

In [None]:
# Read the test split into `X_test` and preview the first rows.
test_csv='../input/tabular-playground-series-mar-2021/test.csv'
X_test=pd.read_csv(test_csv)
X_test.head()

In [None]:
# Encode the test categoricals with the SAME category -> integer mapping that
# the training data received. `cat_cols` still holds the raw training strings,
# and LabelEncoder numbers categories in sorted order, so the sorted unique
# training values reproduce its mapping. (`factorize` numbers categories by
# order of appearance in the test file, which does not match the training codes.)
for col in cat_cols.columns:
    train_categories=np.sort(cat_cols[col].unique())
    # Categories never seen in training get code -1.
    X_test[col]=pd.Categorical(X_test[col],categories=train_categories).codes

# The model was trained on log-transformed versions of these columns, so the
# test features need the identical transform before scoring.
# Like the training-side transform, this is not safe to run twice.
for col in ['cont8','cont9','cont10']:
    X_test[col]=np.log(X_test[col])

In [None]:
# Build the feature matrix from every column except the identifier. 'target'
# is excluded as well (when present) so that re-running this cell does not feed
# the previous predictions back in as an extra feature.
test_features=X_test.drop(columns=['id','target'],errors='ignore')

# The model was fitted on a plain numpy array, so predict on one too.
X_test['target']=lgbm.predict_proba(test_features.to_numpy())[:,1]

submission=pd.DataFrame({'id':X_test['id'],'target':X_test['target']})
submission.to_csv('my_submission.csv',index=False)