## Import Necessary Libraries
First off, we need to import several Python libraries such as numpy, pandas, matplotlib and seaborn.

In [None]:
# Import Libraries
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib
import sklearn

import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold

import optuna
from optuna.integration import lightgbm


import pickle
import os

# Print versions of libraries
print(f"Numpy version : Numpy {np.__version__}")
print(f"Pandas version : Pandas {pd.__version__}")
print(f"Matplotlib version : Matplotlib {matplotlib.__version__}")
print(f"Seaborn version : Seaborn {sns.__version__}")
print(f"SkLearn version : SkLearn {sklearn.__version__}")

# Magic Functions for In-Notebook Display
%matplotlib inline

# Setting seabon style
sns.set(style='darkgrid', palette='colorblind')
plt.show()

# To see all column names & rows without being truncated
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

## Read in and Explore the Data 
It's time to read in our training and testing data using `pd.read_csv`, and take a first look at both with the `head()` method.

In [None]:
train = pd.read_csv('../input/tabular-playground-series-feb-2021/train.csv', encoding='latin_1')
test  = pd.read_csv('../input/tabular-playground-series-feb-2021/test.csv', encoding='latin_1')
sub = pd.read_csv('../input/tabular-playground-series-feb-2021/sample_submission.csv', encoding='latin_1')

In [None]:
train.head()

In [None]:
test.head()

## Exploratory Data Analysis
We're going to consider the features in the dataset and how complete they are. 

In [None]:
# Ref : https://www.kaggle.com/kirillklyukvin/playground-series-february-21

def simple_eda(df):
    """
    Print a quick exploratory report for a DataFrame.

    The report covers, in order: the output of ``df.info()``, missing-value
    counts, a summary of the object-typed (categorical) columns, a summary of
    the numeric columns, and any fully duplicated rows.

    Parameters
    ----------
    df : pandas.DataFrame
        The frame to inspect. It is not modified.

    Returns
    -------
    None
        Everything is printed or shown through ``display``.

    Notes
    -----
    ``display`` is the rich-output helper that IPython/Jupyter injects into
    the notebook namespace; it is not imported here.
    """

    # info() writes its report to stdout itself and returns None, so it must
    # not be wrapped in print() (that would add a stray "None" line).
    print('---')
    print('Common information')
    print('---')
    print()
    df.info()

    # missing values
    missing_per_column = df.isna().sum()
    print()
    print('---')
    if missing_per_column.sum() == 0:
        print('There are no missing values')
        print('---')
    else:
        print('Detected')
        display(missing_per_column)

    # describe() for categorical features; describe(include='object') raises
    # when the frame has no object columns, so only call it when some exist.
    object_columns = df.select_dtypes(include='object').columns
    print()
    print('---')
    print('Categorical columns')
    print('Total {}'.format(len(object_columns)))
    print('---')
    if len(object_columns) > 0:
        display(df.describe(include='object'))

    # same describe() but for continuous features; 'number' matches every
    # numeric dtype, which is what the default describe() summarises.
    numeric_columns = df.select_dtypes(include='number').columns
    print('---')
    print('Continuous columns')
    print('Total {}'.format(len(numeric_columns)))
    print('---')
    display(df.describe())

    # checking for duplicated rows
    duplicated_rows = df.duplicated()
    print('---')
    if duplicated_rows.sum() == 0:
        print('There are no duplicates')
        print('---')
    else:
        print('Duplicates found')
        print('---')
        display(df[duplicated_rows])

    print()
    print('---')
    print('End of the report')

### EDA for Train set

In [None]:
simple_eda(train)

<p style="font-weight: bold;color:#FF4500">Highlights</p>

* The dataset comprises 300000 observations and 26 fields.

* Feature 'target' is the response variable and it takes continuous values.

* Features 'cat0' to 'cat9' are categorical and features 'cont0' to 'cont13' are continuous.

* Feature 'id' is a unique row identifier with no predictive value; it will be removed later on.

* There are no missing values present in the dataset. 

### EDA for Test set

In [None]:
simple_eda(test)

## Separate Categorical & Continuous features

In [None]:
# Feature names follow a fixed scheme: ten 'cat<i>' columns and fourteen 'cont<i>' columns.
categorical_cols = [f"cat{idx}" for idx in range(10)]
continous_cols = [f"cont{idx}" for idx in range(14)]

print("Categorical columns", categorical_cols)
print("Continous columns", continous_cols)

In [None]:
train.columns

### Delete the useless columns

Deleting the columns that are not useful for predictive analysis: `id` is only a unique row identifier and carries no information about the target.

In [None]:
# 'id' is just a unique key column, so it is not helpful for model building
ColsToReject=['id']

# Re-assign instead of dropping in place, and ignore columns that are already
# gone, so that re-running this cell does not raise a KeyError.
train = train.drop(columns=ColsToReject, errors='ignore')
test = test.drop(columns=ColsToReject, errors='ignore')

train.head()

## Visual Exploratory Data Analysis

### Distribution of Target

* If the target variable's distribution is heavily skewed, predictive modeling becomes much harder.
* A bell curve is desirable, but a slight positive or negative skew is also fine.
* When performing regression, make sure the histogram looks like a bell curve or a slightly skewed version of it. **Otherwise it impairs the machine learning algorithm's ability to learn all the scenarios.**

In [None]:
plt.figure(figsize=(10,5))
sns.histplot(train['target'])
plt.show()

In [None]:
plt.figure(figsize=(8,6))
sns.boxplot(data = train, x='target')
plt.show()

### Find and treat Outliers

In [None]:
# Create a function to return the outliers
def detect_outliers(x, c = 1.5):
    """
    Locate outliers with the interquartile-range (IQR) rule.

    A value is flagged when it lies more than ``c`` IQRs below the first
    quartile or more than ``c`` IQRs above the third quartile.

    Parameters
    ----------
    x : array-like of numbers
        The values to examine.
    c : float, default 1.5
        Multiplier applied to the IQR to build the lower and upper fences.

    Returns
    -------
    tuple of numpy.ndarray
        The result of ``np.where``: the *positions* (not index labels) of the
        flagged values.
    """
    first_quartile, third_quartile = np.percentile(x, [25,75])

    # Distance between each quartile and its fence
    fence_offset = (third_quartile - first_quartile) * c
    lower_fence = first_quartile - fence_offset
    upper_fence = third_quartile + fence_offset

    # Anything strictly outside the fences counts as an outlier
    is_outlier = (x > upper_fence) | (x < lower_fence)
    return np.where(is_outlier)


# Detect all Outliers
priceOutliers = detect_outliers(train['target'])
print("Total Outliers count : ",len(priceOutliers[0]))

print("Shape before removing outliers : ",train.shape)

# Remove outliers.
# detect_outliers returns row *positions*, whereas drop() expects index *labels*.
# Translating positions to labels first removes the right rows even when the
# index is not a fresh 0..n-1 range (for example when this cell is re-run).
train = train.drop(index=train.index[priceOutliers[0]])

print("Shape after removing outliers : ",train.shape)

In [None]:
plt.figure(figsize=(10,5))
sns.histplot(train['target'])
plt.show()

In [None]:
plt.figure(figsize=(8,6))
sns.boxplot(data = train, x='target')
plt.show()

### Exploring categorical features

In [None]:
# Plotting multiple bar charts at once for categorical variables
def PlotBarCharts(inpData, colsToPlot, rows, cols):
    """
    Draw one horizontal bar chart of value counts per column on a grid.

    Parameters
    ----------
    inpData : pandas.DataFrame
        Frame holding the columns to plot.
    colsToPlot : list of str
        Column names; they fill the grid row by row.
    rows, cols : int
        Grid dimensions. If the grid has more cells than ``colsToPlot`` has
        entries, the surplus cells are hidden; if it has fewer, the surplus
        columns are not drawn.

    Returns
    -------
    None
        The figure is drawn through matplotlib/seaborn.
    """
    # squeeze=False keeps `axes` two-dimensional even when rows == 1 or cols == 1
    f, axes = plt.subplots(rows, cols, sharex=True, figsize=(12,20), squeeze=False)
    plt.suptitle('Categorical features distribution', size=16, y=(0.94))

    # axes.flat walks the grid row by row, matching the order of colsToPlot
    for idx, ax in enumerate(axes.flat):
        if idx >= len(colsToPlot):
            # No column left for this cell: hide it instead of raising IndexError
            ax.set_visible(False)
            continue
        data = inpData[colsToPlot[idx]].value_counts()
        sns.barplot(x = data.values, y = data.index, palette='deep', ax=ax)
        ax.set_title(colsToPlot[idx])

In [None]:
# Calling the function
PlotBarCharts(inpData=train, colsToPlot=categorical_cols, rows=5, cols=2)

### Exploring Continuous features

In [None]:
# Plotting histograms of multiple columns together
train.hist(continous_cols, figsize=(18,16))
plt.show()

#### Histogram Interpretation
Histograms show us the data distribution of a single continuous variable.

The X-axis shows the range of values and the Y-axis represents the number of values in that range. For example, in the above histogram of "cont4", there are around 120000 rows in the data that have a value between 0.3 and 0.4.

The ideal outcome for a histogram is a bell curve or a slightly skewed bell curve. If there is too much skewness, outlier treatment should be done and the column re-examined; only if that does not solve the problem should the column be rejected.

## Feature Selection
Now it's time to finally choose the best columns (features), i.e. those that are correlated with the target variable.
This can be done directly by measuring correlation values or with ANOVA/Chi-Square tests. However, it is always helpful to visualize the relation between the target variable and each of the predictors to get a better sense of the data.

### Correlation Among Explanatory Variables

Having too many features in a model is not always a good thing because it might cause overfitting and worse results when we want to predict values for a new dataset. Thus, if a feature does not improve your model much, leaving it out may be the better choice.

Let's find the top 10 features that are most highly correlated with the target.

In [None]:
# Correlation is only defined for numeric columns. Restrict the frame to them
# explicitly: the categorical features are not encoded until later, and recent
# pandas versions raise on non-numeric columns instead of silently skipping them.
train.select_dtypes(include='number').corr()['target'].sort_values(ascending=False).head(10)

In [None]:
# Pearson correlation of the continuous features and the target, computed once
corr_matrix = train[continous_cols].join(train['target']).corr()

# Hide the upper triangle (diagonal included) so each pair is shown only once.
# The builtin `bool` is used because the `np.bool` alias no longer exists in
# recent NumPy releases.
mask = np.zeros_like(corr_matrix, dtype=bool)
mask[np.triu_indices_from(mask)] = True

f, ax = plt.subplots(figsize=(10,10))
plt.title('Pearson Correlation Matrix',fontsize=25)

sns.heatmap(corr_matrix,linewidths=0.25,vmax=0.7,square=True,cmap="viridis",
            linecolor='w',annot=True,annot_kws={"size":8},mask=mask,cbar_kws={"shrink": .9})

plt.show()

We have a few multicollinear columns. 

### Statistical Feature Selection (Categorical Vs Continuous) using ANOVA test
Analysis of variance (ANOVA) is performed to check whether there is any relationship between a given continuous variable and a categorical variable.
* Assumption (H0): There is NO relation between the given variables (i.e. the mean of the numeric target variable is the same for all groups of the categorical predictor variable).
* ANOVA test result: the p-value, i.e. the probability of observing group differences at least this large if H0 were true. A p-value below 0.05 leads us to reject H0.

In [None]:
# Defining a function to find the statistical relationship with all the categorical variables
def FunctionAnova(inpData, TargetVariable, CategoricalPredictorList, alpha=0.05):
    """
    Run a one-way ANOVA of the target against each categorical predictor.

    For every predictor the target values are grouped by category level and
    passed to ``scipy.stats.f_oneway``; one line with the verdict and the
    p-value is printed per predictor.

    Parameters
    ----------
    inpData : pandas.DataFrame
        Frame containing the target and the predictor columns.
    TargetVariable : str
        Name of the numeric target column.
    CategoricalPredictorList : list of str
        Names of the categorical columns to test.
    alpha : float, default 0.05
        Significance level; a p-value strictly below it is reported as
        "is correlated with".

    Returns
    -------
    None
        Results are only printed.
    """
    from scipy.stats import f_oneway

    print('##### ANOVA Results ##### \n')
    for predictor in CategoricalPredictorList:
        # One list of target values per category level of this predictor
        CategoryGroupLists=inpData.groupby(predictor)[TargetVariable].apply(list)

        # f_oneway returns (F statistic, p-value); only the p-value is used
        p_value = f_oneway(*CategoryGroupLists)[1]

        # A p-value below alpha means H0 ("all group means are equal") is rejected
        if p_value < alpha:
            verdict = 'is correlated with'
        else:
            verdict = 'is NOT correlated with'
        print(predictor, verdict, TargetVariable, '| P-Value:', p_value)

In [None]:
# Calling the function to check which categorical variables are correlated with target
FunctionAnova(inpData=train, 
              TargetVariable='target', 
              CategoricalPredictorList=categorical_cols)

All categorical variables are correlated with the Target variable, so we will keep all categorical features for model building.

## Encode categorical features
Data Pre-processing for Machine Learning

In [None]:
train.head()

In [None]:
# Integer-encode every categorical column: the encoder is fitted on the
# training column and the same fitted mapping is then applied to the test column.
for col_name in categorical_cols:
    encoder = LabelEncoder()
    train[col_name] = encoder.fit_transform(train[col_name])
    test[col_name] = encoder.transform(test[col_name])

# Printing sample rows
train.head()

In [None]:
train.columns

In [None]:
# Separate Target Variable and Predictor Variables
TargetVariable = 'target'
# cat0..cat9 followed by cont0..cont13, built from the column lists defined earlier
Predictors = categorical_cols + continous_cols

X = train[Predictors]
y = train[TargetVariable]

# Work on a copy so the original test frame stays untouched
X_test = test.copy()

# Quick sanity check with the shapes of Training and testing datasets
print("X - ",X.shape)
print("y - ",y.shape)
print("X_test - ",X_test.shape)

## Scaling

In [None]:
# Rescale every predictor column with MinMaxScaler (default output range is [0, 1]).
# fit_transform returns a NumPy array, so X is no longer a DataFrame after this line.
# NOTE(review): only X is transformed here; X_test is left unscaled — confirm this is intended.
scaler = MinMaxScaler()
X = scaler.fit_transform(X)

## Saving preprocessed data as serialized files

To deploy the predictive models we build, we save them along with the required data files as serialized file objects.
We save the cleaned and processed input data and the tuned predictive models as files so that they can later be re-used or shared.

In [None]:
playgroundDataCleaned = X

# Saving the Python objects as serialized files can be done using pickle library
# Here let us save the Final Data set after all the transformations as a file.
# The `with` block closes the file on exit, so no explicit close() is needed.
with open('playgroundDataCleaned.pkl', 'wb') as fileWriteStream:
    pickle.dump(playgroundDataCleaned, fileWriteStream)

print('pickle file is saved at Location:',os.getcwd())

### Splitting data into Training and Validation samples

In [None]:
# Split the data into training and testing set
X_train, X_validation, y_train, y_validation = train_test_split(X, y, test_size=0.20, shuffle=True, random_state=48)

# Quick sanity check with the shapes of Training and testing datasets
print("X_train - ",X_train.shape)
print("y_train - ",y_train.shape)
print("X_validation - ",X_validation.shape)
print("y_validation - ",y_validation.shape)

## Model Building

The function below trains an LGBMRegressor model. It takes
* the Optuna `trial` object (used to sample one set of hyperparameters),
* the data,
* the target

and returns
* the RMSE (Root Mean Squared Error) on a held-out 20% split.

In [None]:
# !pip install optuna 

In [None]:
# Ref : https://www.kaggle.com/hamzaghanmi/lgbm-hyperparameter-tuning-using-optuna

def objective(trial,data=X,target=y):
    """
    Optuna objective: train one LGBMRegressor and return its hold-out RMSE.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyperparameters below.
    data : array-like, default X
        Feature matrix. The default is bound when this function is defined.
    target : array-like, default y
        Target values. The default is bound when this function is defined.

    Returns
    -------
    float
        RMSE on the 20% hold-out split (lower is better).
    """

    # Fixed random_state: every trial is scored on the same train/hold-out split
    train_x, test_x, train_y, test_y = train_test_split(data, target, test_size=0.2,random_state=48)
    param = {
        'metric': 'rmse',
        'random_state': 48,
        'n_estimators': 1000,
        'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-3, 10.0),
        'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-3, 10.0),
        'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.3,0.4,0.5,0.6,0.7,0.8,0.9, 1.0]),
        # NOTE(review): LightGBM documents that bagging (subsample) only takes
        # effect when subsample_freq is non-zero, which is not set here — confirm.
        'subsample': trial.suggest_categorical('subsample', [0.4,0.5,0.6,0.7,0.8,1.0]),
        'learning_rate': trial.suggest_categorical('learning_rate', [0.006,0.008,0.01,0.014,0.017,0.02]),
        'max_depth': trial.suggest_categorical('max_depth', [10,20,100]),
        # LightGBM requires num_leaves > 1, so the search range starts at 2
        'num_leaves' : trial.suggest_int('num_leaves', 2, 1000),
        'min_child_samples': trial.suggest_int('min_child_samples', 1, 300),
        # Sampled under the name 'min_data_per_groups' but handed to LightGBM as
        # cat_smooth; the sampled name is kept because later code reads it back
        # from study.best_params under that key.
        'cat_smooth' : trial.suggest_int('min_data_per_groups', 1, 100)
    }
    model = LGBMRegressor(**param)

    # Stop adding trees once the hold-out metric has not improved for 100 rounds
    model.fit(train_x,train_y,eval_set=[(test_x,test_y)],early_stopping_rounds=100,verbose=False)

    preds = model.predict(test_x)

    # squared=False turns the mean squared error into RMSE
    rmse = mean_squared_error(test_y, preds,squared=False)

    return rmse

In [None]:
# Seed the sampler so the hyperparameter search is reproducible across runs
# (TPESampler is the sampler create_study uses by default).
study = optuna.create_study(direction='minimize', sampler=optuna.samplers.TPESampler(seed=48))
study.optimize(objective, n_trials=22)
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)
print("\n")

In [None]:
study.trials_dataframe()

## Quick Visualization for Hyperparameter Optimization Analysis

Optuna provides various visualization features in optuna.visualization to analyze optimization results visually

In [None]:
#plot_optimization_histor: shows the scores from all trials as well as the best score so far at each point.
optuna.visualization.plot_optimization_history(study)

In [None]:
#Visualize parameter importances.
optuna.visualization.plot_param_importances(study)

In [None]:
#Visualize empirical distribution function
optuna.visualization.plot_edf(study)

## LGBMRegressor model with the best hyperparameters

In [None]:
params=study.best_params 
print("Best Parameter : ", params)

In [None]:
params['random_state'] = 48
params['n_estimators'] = 20000
params['metric'] = 'rmse'

# The study sampled this value under the name 'min_data_per_groups', which is
# not an LGBM parameter; rename it to cat_smooth. The check makes the cell safe
# to re-run: once the key has been renamed there is nothing left to pop.
if 'min_data_per_groups' in params:
    params['cat_smooth'] = params.pop('min_data_per_groups')

In [None]:
print("Best Parameter : ", params)

In [None]:
X_test.head()

In [None]:
columns = categorical_cols+continous_cols
preds = np.zeros(test.shape[0])
kf = KFold(n_splits=5,random_state=48,shuffle=True)
rmse=[]  # validation RMSE of each fold

# Slice the frames once instead of on every fold
train_features = train[columns]
train_target = train['target']
test_features = test[columns]

for fold, (trn_idx, test_idx) in enumerate(kf.split(train_features, train_target), start=1):
    X_tr, X_val = train_features.iloc[trn_idx], train_features.iloc[test_idx]
    y_tr, y_val = train_target.iloc[trn_idx], train_target.iloc[test_idx]
    model = LGBMRegressor(**params)
    model.fit(X_tr,y_tr,eval_set=[(X_val,y_val)],early_stopping_rounds=100,verbose=False)
    # Each fold contributes an equal share to the averaged test prediction
    preds += model.predict(test_features) / kf.n_splits
    rmse.append(mean_squared_error(y_val, model.predict(X_val), squared=False))
    print(fold, rmse[-1])

In [None]:
np.mean(rmse)

In [None]:
lightgbm.plot_importance(model, max_num_features=10, figsize=(10,10))
plt.show()

## Submission

In [None]:
sub['target']=preds
sub.to_csv('submission.csv', index=False)

## Reference:

* https://www.kaggle.com/hamzaghanmi/lgbm-hyperparameter-tuning-using-optuna#Let's-do-some-Quick-Visualization-for-Hyperparameter-Optimization-Analysis
* https://www.kaggle.com/kirillklyukvin/playground-series-february-21

##  If you find this notebook helpful, please upvote it.