**Basic imports**

In [None]:
import pandas as pd
import numpy as np
import gc
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# All competition files live in one read-only Kaggle input folder; name the
# folder once so the location can be changed in a single place.
DATA_DIR = "/kaggle/input/tabular-playground-series-may-2022"

dataset = pd.read_csv(os.path.join(DATA_DIR, "train.csv"))  # training rows (includes 'target')
test = pd.read_csv(os.path.join(DATA_DIR, "test.csv"))  # rows to predict
submission = pd.read_csv(os.path.join(DATA_DIR, "sample_submission.csv"))  # submission template

In [None]:
dataset.head()

In [None]:
dataset.describe()

In [None]:
dataset.info()

In [None]:
dataset.columns[dataset.isnull().sum()!=0]

# Target Variable(category): balanced or imbalanced data?

In [None]:
# Class balance of the target: absolute count and percentage share per class.
column_name = 'target'
class_counts = dataset[column_name].value_counts(sort=True)
# Build the table directly from the counts Series instead of renaming the
# output of reset_index(): the column names reset_index() produces for a
# value_counts() result differ between pandas 1.x and pandas 2.x, which made
# the original rename + set_index('target') chain version-dependent.
value_count = (
    pd.DataFrame({
        'Value Count': class_counts,
        'percentage': class_counts / class_counts.sum() * 100,
    })
    .rename_axis(column_name)
    .reset_index()
)

In [None]:
value_count.set_index('target').plot.pie(y='Value Count',figsize=(10,7),legend=False,ylabel="")
plt.show()

In [None]:
# Names of all float64 columns, in their original column order.
float_features = dataset.dtypes[dataset.dtypes == 'float64'].index.tolist()

# Data is distributed Normally/Gaussian/Bell-shape or skewed

In [None]:
# Density-normalised histogram of every float feature, one panel per feature
# (author's observation: all float features look normally distributed).
plt.rcParams['axes.facecolor'] = '#000000'   # dark panel background; rcParams is global, so later figures inherit it
fig,axs = plt.subplots(4,4,figsize=(25,25))
for panel,feature in zip(axs.ravel(),float_features):
    values = dataset[feature]
    panel.hist(values,bins=100,density=True)
    # The title carries the standard deviation so the spread can be compared across panels.
    panel.set_title(f'Train:{feature},std:{values.std():.1f}',fontsize=20)
plt.show()

## Mutual Information between floats and target: -

**Why mutual information (MI)?** Correlation only captures linear relationships, while MI captures any kind of relationship.

In [None]:
def plot_mutual_information_curve(df,features,ncols=4,nrows=4,quantile=True,MI=True,window=10000):
    """Plot, per feature, the rolling mean of ``target`` over rows sorted by that feature.

    For every feature the rows are sorted by the feature value and a centred
    rolling mean of ``df['target']`` is scattered in its own panel.  When both
    ``MI`` and ``quantile`` are true, the x-label additionally reports
    ``H(mean(target)) - mean(H(rolling mean))`` (in nats), which this notebook
    uses as a mutual-information estimate between the feature and the target.

    Parameters
    ----------
    df : DataFrame holding every column named in ``features`` plus ``'target'``
        (the entropy helper treats the target as a binary variable).
    features : iterable of column names; panels are filled in order and any
        feature beyond ``nrows * ncols`` is not drawn.
    ncols, nrows : shape of the panel grid.
    quantile : if True the x-axis is the row rank after sorting, otherwise it
        is the feature value itself.
    MI : if True (and ``quantile`` is True) show the MI estimate in the x-label.
    window : number of rows in the rolling-mean window.

    Returns
    -------
    None.  The figure is displayed with ``plt.show()``.
    """
    def H(p):
        "Entropy of a binary random variable in nats"
        # Keep p strictly inside (0, 1): log(0) * 0 would otherwise yield NaN
        # and poison the averaged entropy for windows with a pure target.
        p = np.clip(p,1e-12,1-1e-12)
        return -np.log(p)*p - np.log(1-p)*(1-p)

    features = list(features)
    # squeeze=False always returns a 2-D array of axes, so ravel() also works for a 1x1 grid.
    fig,axs = plt.subplots(nrows,ncols,figsize=(25,25),sharey=True,squeeze=False)
    panels = axs.ravel()
    for col,ax in zip(features,panels):
        temp_df = pd.DataFrame({col:df[col].values,'target':df['target'].values})
        # After sorting, the fresh 0..n-1 index is the rank of each row along the feature.
        temp_df = temp_df.sort_values(col).reset_index(drop=True)
        rolling_mean = temp_df['target'].rolling(window,center=True,min_periods=1).mean()
        x_values = temp_df.index if quantile else temp_df[col]
        ax.scatter(x_values,rolling_mean)
        ax.grid()
        if MI and quantile:
            mi_estimate = H(temp_df['target'].mean()) - H(rolling_mean.dropna().values).mean()
            ax.set_xlabel(f"{col} MI = {mi_estimate:.5f}",fontsize=20)
        else:
            # set_xlabel (not set_label) is the call that accepts text properties such as fontsize.
            ax.set_xlabel(f'{col}',fontsize=20)
    # Hide panels that received no feature instead of showing empty axes.
    for ax in panels[len(features):]:
        ax.set_visible(False)
    plt.suptitle('MI relation between target and other independent variables',y=1.0,fontsize=25)
    fig.tight_layout(h_pad=1.0)
    plt.show()
    

In [None]:
plot_mutual_information_curve(dataset,float_features)

**Insight:** The plots show that many variables have a non-linear relationship with the target, so a linear model is not a suitable choice for training.

In [None]:
# Integer-typed predictors: every int64 column except the row identifier and the label.
int_features = [
    name for name in dataset.columns
    if name not in ('id','target') and dataset[name].dtypes == 'int64'
]

## The visualizations below show that many integer features take values in the range 0-14, but only 7-8 of those values are commonly seen in the dataset; the remaining ones are rare. We can treat this as a rare-label problem and replace the rare values with the next number in the sequence so that the feature makes more sense.

`plot_mutual_information_curve(dataset, int_features)`

## The visualizations below show that many integer features take values in the range 0-14, but only 7-8 of those values are commonly seen in the dataset; the remaining ones are rare. We can treat this as a rare-label problem and replace the rare values with the next number in the sequence so that the feature makes more sense.

**So, if we treat these integer features as categorical features and replace the rare labels, the features make more sense and the analysis improves.**


In [None]:
def plot_categories(df,features,i=0,threshold=0.015):
    """Draw label-frequency bar charts for ``features``, two features per figure.

    Each panel shows, for one feature, the share of rows taken by every
    distinct value (value count divided by ``len(df)``), sorted from most to
    least frequent, with a red horizontal line at ``threshold``.

    Parameters
    ----------
    df : DataFrame containing the columns named in ``features``.
    features : sequence of column names.
    i : index of the first feature to plot; features are consumed in pairs from there.
    threshold : height of the red reference line (the rare-label cut-off used
        elsewhere in the notebook is 0.015).

    Returns
    -------
    None.  One figure per pair of features is displayed with ``plt.show()``.
    """
    df_counts = len(df)
    while i<len(features):
        # Slicing (rather than indexing i+1) keeps an odd-length feature list
        # from raising IndexError; the last pair then holds a single feature.
        pair = features[i:i+2]
        fig,axs = plt.subplots(1,2,figsize=(15,5))
        for ax,feature in zip(axs,pair):
            # Sort first, then take labels and heights from the same Series so
            # every bar stays attached to its own label.
            frequencies = (df[feature].value_counts()/df_counts).sort_values(ascending=False)
            ax.bar(frequencies.index,frequencies.values)
            ax.set_xlabel(feature)
            ax.axhline(y=threshold,color='red',linewidth=3)
            ax.set_ylabel('percentage of total count')
            ax.set_title('different label\'s frequency ')
        # Hide the second panel when the final pair has only one feature.
        for ax in axs[len(pair):]:
            ax.set_visible(False)
        plt.show()
        i+=2


In [None]:
plot_categories(dataset,int_features)

In [None]:
#before grouping rare labels
def categories_target_count_before_grouping(df,col):
    """Return how often each ``target`` value occurs within every category of ``col``.

    The result is the Series produced by ``value_counts()`` on the grouped
    ``target`` column, indexed by (category, target value).
    """
    per_category = df.groupby([col])
    target_by_category = per_category['target']
    return target_by_category.value_counts()

In [None]:
def group_rare_labels(df,col,threshold=0.015):
    """Merge the rare labels of ``df[col]`` into a single representative label.

    A label is rare when its share of rows (value count divided by
    ``len(df)``) is less than or equal to ``threshold``.  Every rare label is
    replaced by the most frequent rare label (the first rare entry in
    ``value_counts()`` order); all other labels are kept unchanged.

    Parameters
    ----------
    df : DataFrame containing ``col``.
    col : name of the column whose labels are grouped.
    threshold : maximum row share for a label to count as rare.

    Returns
    -------
    Series aligned with ``df`` holding the grouped labels.  ``df`` itself is
    not modified.
    """
    frequencies = df[col].value_counts()/len(df)
    # Compute the rare set once; the original rebuilt both boolean masks for every label.
    rare_index = frequencies[frequencies<=threshold].index
    rare_labels = set(rare_index)
    # Only meaningful when at least one rare label exists; never looked up otherwise.
    representative = rare_index[0] if len(rare_index) else None
    grouping_dict = {k:(representative if k in rare_labels else k) for k in frequencies.index}
    return df[col].map(grouping_dict)

In [None]:
def categories_target_count_after_grouping(df,col):
    """Return how often each ``target`` value occurs within every category of ``col``.

    Same computation as the "before grouping" helper; the separate name lets
    the two calls be told apart once rare labels have been merged.
    """
    grouped_target = df.groupby([col])['target']
    counts = grouped_target.value_counts()
    return counts

In [None]:
for col in int_features:
    # Learn the rare-label grouping on the training frame only, then apply the
    # very same raw-label -> grouped-label mapping to the test frame.  Grouping
    # the test frame from its own frequencies could pick a different
    # representative label, so one raw value would be encoded differently in
    # train and test.
    train_grouped = group_rare_labels(dataset,col)
    label_map = pd.Series(train_grouped.to_numpy(),index=dataset[col].to_numpy())
    label_map = label_map[~label_map.index.duplicated()]  # one entry per raw label
    dataset[col] = train_grouped
    # Labels that never occur in the training frame have no mapping; keep them as they are.
    test[col] = test[col].map(label_map).fillna(test[col]).astype(test[col].dtype)

In [None]:
plot_categories(dataset,int_features)

## Column f_27: split the string feature into a list of individual characters, since it is composed only of a fixed set of characters

In [None]:
print(dataset['f_27'].str.len().min(),dataset['f_27'].str.len().max())

In [None]:
dataset['f_27'].value_counts()

In [None]:
uniq_chars = dataset['f_27'].apply(lambda s:len(set(s))).rename('uniq_chars')

In [None]:
# From https://www.kaggle.com/ambrosm/tpsmay22-eda-which-makes-sense
# Expand the f_27 string into one integer column per character position
# (offset from 'A'), plus the number of distinct characters in the string.
for frame in (dataset,test):
    letters = frame['f_27']
    for position in range(10):
        codes = letters.str.get(position).apply(ord)
        frame[f'ch{position}'] = codes-ord('A')
    frame['uniq_chars'] = letters.apply(lambda text: len(set(text)))

In [None]:
plot_mutual_information_curve(dataset,[col for col in dataset.columns if col.startswith('ch') ]+['uniq_chars'],4,3)

In [None]:
# One bar chart per integer feature showing how often each value occurs in the
# training frame.
grid_cols = 4
# Enough rows for every feature (ceil division); the original fixed 4x4 grid
# fails as soon as there are more than 16 integer features.
grid_rows = max(1,-(-len(int_features)//grid_cols))
figure,axes = plt.subplots(grid_rows,grid_cols,figsize=(25,25),squeeze=False)
panels = axes.ravel()
for ax,col in zip(panels,int_features):
    # Use a dedicated name: `value_count` already holds the target summary
    # table built earlier, and overwriting it breaks a re-run of that pie chart.
    label_counts = dataset[col].value_counts()
    ax.bar(label_counts.index,label_counts.values)
    ax.set_xlabel(f'Train:{col}',fontsize=20)
# Hide panels that have no feature to show.
for ax in panels[len(int_features):]:
    ax.set_visible(False)
plt.suptitle('Integer Features Distribution: Normal or Not?',y=1.0,fontsize=25)
figure.tight_layout(h_pad=1.0)
plt.show()
    

In [None]:
# Correlation heatmap of the numeric columns.  The frame still contains the
# string column f_27 at this point; select the numeric columns explicitly
# because DataFrame.corr() no longer drops non-numeric columns silently in
# pandas 2.x and raises instead.
plt.subplots(figsize=(25,25))
numeric_columns = dataset.select_dtypes(include='number')
sns.heatmap(numeric_columns.corr(),annot=True,fmt='0.2f',cmap='RdYlGn',vmin=-1,vmax=1,cbar=False)
plt.show()

In [None]:
dataset.columns,test.columns

In [None]:
dataset.describe()

# Column f_28

**This is the only column whose values range from about -1200 to 1200. To keep it from dominating the other features, let's scale this column.**

In [None]:
# Standardise f_28 with a StandardScaler.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
# Only this column is on a different scale from the rest, so it could dominate.
# The scaler is fitted on the training frame and then reused unchanged for the
# test frame, so both frames get the same transformation.
scaled_f_28 = scaler.fit_transform(dataset[['f_28']])
test_scaled_f_28 = scaler.transform(test[['f_28']])

In [None]:
# Store the scaled values (rounded to 5 decimals) as a new column.  The scaler
# output is a single-column 2-D array, so flatten it and assign it
# positionally; wrapping it in a fresh pd.Series would instead align on index
# labels and yield NaN whenever the frame's index is not 0..n-1.
dataset['f_28_scaled'] = scaled_f_28.round(5).ravel()
test['f_28_scaled'] = test_scaled_f_28.round(5).ravel()

In [None]:
%%time
fig,ax = plt.subplots(1,2,figsize=(15,5))
sns.histplot(dataset['f_28'],ax=ax[0],kde=True,legend=False,color='w')
ax[0].set_title('Original Data')
sns.histplot(dataset['f_28_scaled'],ax=ax[1],kde=True,legend=False,color='y')
ax[1].set_title('Scaled Data')
#ax[1].set_xlabel('f_08')
plt.show()

In [None]:
# Remove the unscaled f_28 column from both frames; its scaled copy lives in 'f_28_scaled'.
dataset = dataset.drop(columns='f_28')
test = test.drop(columns='f_28')

In [None]:
dataset.rename(columns={'f_28_scaled':'f_28'},inplace=True)
test.rename(columns={'f_28_scaled':'f_28'},inplace=True)

In [None]:
dataset.columns

## Train and Test Dataset

In [None]:
dataset.columns,test.columns

In [None]:
# Label vector and feature matrix: the raw string column, the row id and the
# label itself are not model inputs.  The test frame loses the same
# non-feature columns (it is dropped without naming 'target').
Y = dataset['target']
X = dataset.drop(columns=['f_27','id','target'])
test = test.drop(columns=['f_27','id'])

In [None]:
X.shape,test.shape,Y.shape

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
X_train,X_val,Y_train,Y_val = train_test_split(X,Y,test_size=0.2,shuffle=True,random_state=42)

In [None]:
gc.collect()
X_train.shape,X_val.shape,Y_train.shape,Y_val.shape

In [None]:
from lightgbm import LGBMClassifier

# LGBM Hyperparameter Tuning

**For faster training on a GPU, you can add the parameters `device_type='gpu'`, `gpu_device_id=0` and `gpu_platform_id=1`. The other parameters can be tuned and changed according to the fit results.**
                                   

In [None]:
# LightGBM classifier configuration.  n_estimators is a large upper bound; the
# fit cell that follows passes an early-stopping setting of 100 rounds, so
# training is expected to stop well before 5000 trees.
# NOTE(review): subsample presumably has no effect here -- LightGBM only bags
# when subsample_freq is non-zero, and bagging is documented as unavailable
# with GOSS in some versions -- confirm against the installed LightGBM.
# NOTE(review): feature_fraction looks like LightGBM's alias for the sklearn
# parameter colsample_bytree -- confirm.
classifier = LGBMClassifier(boosting_type='goss',learning_rate=0.05,
                            n_estimators=5000,min_child_samples=80,
                            metric='AUC',  # the metric recorded for each eval set is read back later under the key 'auc'
                            subsample=0.6,
                            feature_fraction=0.6,
                            reg_lambda=1.81011,
                            reg_alpha=1.0595,
                            max_depth=11,
                            num_leaves=2048,  # 2048 == 2**11, i.e. the most leaves a depth-11 tree can have
                            min_child_weight=5,
                            random_state=42,verbose=1)

In [None]:
# Train with early stopping: stop once the evaluation metric has not improved
# for 100 consecutive rounds.  The `early_stopping_rounds` keyword of fit() was
# removed in LightGBM 4.0; the callback form works on both 3.x and 4.x.
from lightgbm import early_stopping

# NOTE(review): later cells read evals_result_['valid_0'] (training split) and
# evals_result_['valid_1'] (validation split), so keep both eval sets, in this
# order, and keep passing them exactly as before.
classifier.fit(
    X_train.values,Y_train,
    eval_set=[(X_train,Y_train),(X_val,Y_val)],
    callbacks=[early_stopping(stopping_rounds=100)],
)

In [None]:
import pickle 

In [None]:
# Persist the classifier object to a pickle file in the current working directory.
# Security note: only ever unpickle this file from a trusted source --
# pickle.load can execute arbitrary code.
pickle_filename = 'TPSMay22_model.pkl'
with open(pickle_filename,'wb') as file:  # binary mode is required by pickle
    pickle.dump(classifier,file)

In [None]:
feature_importance=pd.DataFrame(sorted(zip(classifier.feature_importances_,X.columns),reverse=True),columns=['value_count','features'])

In [None]:
# Horizontal bar chart of the classifier's feature importances.  The
# 'value_count' column of `feature_importance` holds
# classifier.feature_importances_, already sorted from largest to smallest.
fig,ax = plt.subplots(figsize=(25,25))
# x / y must be passed by keyword: seaborn 0.12+ no longer accepts them positionally.
sns.barplot(x='value_count',y='features',data=feature_importance,ax=ax)
ax.set_ylabel('features',rotation=45,fontsize=25)
ax.set_xlabel('feature importance',fontsize=25)
plt.show()

In [None]:
training_round=len(classifier.evals_result_['valid_0']['auc'])

In [None]:
# Learning curve: AUC per boosting round on the training split ('valid_0', the
# first eval set passed to fit) and on the validation split ('valid_1').
# The plotted metric is AUC, not accuracy, so label it as such.
plt.rcParams['axes.facecolor'] = '#eafff5'  # light background; rcParams is global, so later figures inherit it
fig,ax = plt.subplots(figsize=(10,10))
rounds = range(training_round)
ax.scatter(rounds,classifier.evals_result_['valid_0']['auc'],label='Training AUC',alpha=0.5)
ax.scatter(rounds,classifier.evals_result_['valid_1']['auc'],label='Validation AUC',alpha=0.5)
ax.grid(True)
ax.set_xlabel('Training Rounds')
ax.set_ylabel('AUC')
ax.tick_params(labelcolor='tab:orange',labelsize='large')
ax.set_title('Train vs Validation AUC Curve')
ax.legend()
plt.show()

In [None]:
Y_val_pred=classifier.predict_proba(X_val.values)

In [None]:
Y_val_pred=Y_val_pred[:,1]

In [None]:
from sklearn.metrics import roc_auc_score,roc_curve

In [None]:
score = roc_auc_score(Y_val, Y_val_pred)

In [None]:
score

In [None]:
test_pred = classifier.predict_proba(test.values)[:,1]

In [None]:
submission['target'] = test_pred

In [None]:
submission.shape

In [None]:
submission.to_csv('sample_submission.csv',index=False)