In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings 
# Suppress every Python warning for the rest of the session.
# NOTE(review): this also hides scikit-learn convergence / data-conversion warnings.
warnings.filterwarnings('ignore')
# Remove pandas' display truncation so all columns and all rows are shown.
pd.set_option('display.max_columns',None)
pd.set_option('display.max_rows',None)

from sklearn.linear_model import LogisticRegression,SGDClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.gaussian_process.kernels import RBF
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

import sklearn.metrics
from sklearn.model_selection import train_test_split,KFold,cross_val_score,RandomizedSearchCV, GridSearchCV

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import QuantileTransformer

#### In the earlier notebook we did the [EDA](http://www.kaggle.com/saileshnair/tps202109-normal-and-quick-eda)

In [None]:
def plot_model_comparison(models, results, title):
    """
        Draw one box plot per model so their score distributions can be compared.

        models: sequence of model names, used as the x tick labels
        results: sequence of score collections, one per model, in the same
                 order as ``models``
        title: text shown as the figure title

        The figure is displayed with ``plt.show()``; nothing is returned.
    """
    figure = plt.figure()
    axes = figure.add_subplot(1, 1, 1)
    figure.suptitle(title)
    # Draw on the axes we just created instead of going through pyplot's
    # "current axes" state.
    axes.boxplot(results)
    axes.set_xticklabels(models)
    plt.show()

def timer(start_time=None):
    """
        Simple stopwatch for long-running training cells.

        start_time: falsy (default None) -> print the current time and return a
                    fresh start timestamp.
                    a datetime            -> print the time elapsed since it and
                    return None.
    """
    from datetime import datetime

    if start_time:
        # Stop mode: report elapsed wall-clock time as hours / minutes / seconds.
        elapsed_seconds = (datetime.now() - start_time).total_seconds()
        hours, remainder = divmod(elapsed_seconds, 3600)
        minutes, seconds = divmod(remainder, 60)
        print("Time taken: %i hours %i minutes and %s seconds." % (hours, minutes, round(seconds, 2)))
        return None

    # Start mode: echo the current time, then hand back a timestamp to the caller.
    print(datetime.now())
    return datetime.now()

def getbounds(col):
    '''
    Return the lower and upper outlier bounds of "col" using the IQR rule.

    col: array-like of numeric values (order does not matter).
    Returns (lb, ub) where lb = Q1 - 1.5*IQR and ub = Q3 + 1.5*IQR.

    Note: np.percentile does not skip NaN, so a column containing NaN yields
    NaN bounds.
    '''
    # np.percentile handles ordering itself; the former bare `sorted(col)` call
    # threw its result away and only cost time.
    q1, q3 = np.percentile(col, [25, 75])  # first and third quartiles
    iqr = q3 - q1  # inter-quartile range
    lb = q1 - (1.5 * iqr)  # lower bound
    ub = q3 + (1.5 * iqr)  # upper bound
    return lb, ub

def plothists(df, ncols=4, bins=30):
    '''
    Plot one histogram per column of df on a grid of subplots.

    df: DataFrame whose columns are plotted in positional order.
    ncols: number of subplot columns in the grid (default 4).
    bins: number of histogram bins per plot (default 30).

    The grid is sized from the number of columns in df instead of assuming
    exactly 118 features, so frames of any width work. For 118 columns the
    layout (30 x 4 grid, 40 x 120 inch figure) is the same as before.
    Nothing is returned; the figure is left for the notebook to render.
    '''
    n_features = df.shape[1]
    if n_features == 0:
        # Nothing to draw; plt.subplots cannot build an empty grid.
        return
    nrows = -(-n_features // ncols)  # ceiling division
    fig, ax = plt.subplots(nrows, ncols, figsize=(10 * ncols, 4 * nrows), squeeze=False)
    # squeeze=False keeps `ax` two-dimensional even for a single row or column;
    # ravel() walks it row by row, matching the original nested loops.
    for i, axis in enumerate(ax.ravel()):
        if i < n_features:
            sns.histplot(data=df.iloc[:, i], bins=bins, ax=axis).set(ylabel='')
        else:
            # Hide the unused trailing cells of the grid.
            axis.set_visible(False)

#### In this notebook I'm trying out how to transform the data into a normal distribution


In [None]:
# Load the competition training set from the Kaggle input directory.
train=pd.read_csv("../input/tabular-playground-series-sep-2021/train.csv")

In [None]:
# Feature columns = every column except the first and the last.
# NOTE(review): presumably the first column is `id` and the last is the `claim` target — confirm.
features = train.columns[1:-1]
# Work on an independent copy so later steps do not modify `train`.
df=train[features].copy()

In [None]:
# (number of rows, number of feature columns)
df.shape

In [None]:
# Summary statistics per feature, transposed so that each feature is one row.
df.describe().T

In [None]:
# Count of missing values in each feature column.
df.isnull().sum()

## 3. Prepare Data
## a) Data Cleaning
## b) Data Transforms

### First let's take a look at imputation of the null values

#### The Simple Imputer is the quickest imputer. Others I have tried are:
+ KNNImputer() -  this uses the KNN to impute missing values
+ IterativeImputer(random_state=21) - it is an experimental implementation of imputer in scikit learn - time consuming

#### Simple Imputer

In [None]:
# Replace missing values with each feature's median.
# copy=False only lets scikit-learn skip a defensive copy when it can; whether the
# input frame is modified in place is an implementation detail, so always use the
# array returned by fit_transform instead of assuming `df` was mutated.
si=SimpleImputer(strategy='median',copy=False)
idf=pd.DataFrame(data=si.fit_transform(df),columns=features,index=df.index)

In [None]:
# Summary statistics of the imputed features, one feature per row.
idf.describe().T

In [None]:
# Total number of missing values left in the whole frame (0 means imputation covered everything).
idf.isnull().sum().sum()

#### We've taken care of the null values. 
#### Now to normalize we could use transformers like RobustScaler, PowerTransformer,  QuantileTransformer. In this Notebook I use QuantileTransformer.

+ **RobustScaler** This Scaler removes the median and scales the data according to the quantile range (defaults to IQR: Interquartile Range). The IQR is the range between the 1st quartile (25th quantile) and the 3rd quartile (75th quantile).
+ **Power Transformer + yeo-johnson method**:  Yeo-Johnson supports both positive and negative data.
+ **Power Transformer + Box Cox method**:  Box-Cox requires input data to be strictly positive. First we need to treat outliers. I capped the outliers at the bounds. To remove the negative values I had to explicitly square the values for features containing negative values before applying this transformation.

In [None]:
# Feature distributions before the quantile transformation (imputed data).
plothists(idf)

**QuantileTransformer** This method transforms the features to follow a uniform or a normal distribution. Therefore, for a given feature, this transformation tends to spread out the most frequent values. It also reduces the impact of (marginal) outliers: this is therefore a robust preprocessing scheme.

In [None]:
# Map every feature onto a normal distribution through its empirical quantiles.
qt=QuantileTransformer(
    n_quantiles=1000,
    random_state=21,
    output_distribution= 'normal',
    copy=False)
# Store the returned array back into `idf` explicitly. The bare
# `qt.fit_transform(idf)` call only worked when scikit-learn happened to mutate
# the frame in place, which copy=False does not guarantee for DataFrame input.
# From here on `idf` holds the transformed values, as the following cells expect.
# Re-running this cell alone would transform already-transformed data; re-run
# the imputation cell first.
idf=pd.DataFrame(data=qt.fit_transform(idf),columns=features,index=idf.index)

In [None]:
# Frame of feature columns used for modelling. `idf` is expected to already hold
# the quantile-transformed values produced by the previous cell.
qtidf=pd.DataFrame(data=idf,columns=features)

In [None]:
# Feature distributions after the quantile transformation.
plothists(qtidf)

#### Now that the data is normalized, we can fit a model and check predictions. 

## 4. Evaluate Algorithms
## a) Split-out validation dataset
## b) Test options and evaluation metric
## c) Compare Algorithms

In [None]:
# Feature matrix as a plain NumPy array.
X=qtidf.values
# Target as a 1-D array: scikit-learn estimators expect y of shape (n_samples,).
# The former train[['claim']].values produced an (n_samples, 1) column vector.
Y=train['claim'].values

In [None]:
# Evaluation settings shared by the model-comparison cells.
scorer = "roc_auc"  # scoring name passed to cross_val_score
splits = 5          # number of K-fold splits
seed = 21           # random seed for the train/test split and the folds

In [None]:
# Hold out 20% of the rows as a validation set; the split is seeded for reproducibility.
x_train, x_test, y_train, y_test = train_test_split(X,Y, test_size=0.20,  random_state=seed)

In [None]:
# Sanity check: shapes of the train / validation features and targets.
print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)

In [None]:
# Candidate classifiers as (short name, estimator) pairs.
# Every estimator that uses randomness is seeded with `seed` so the comparison
# is reproducible across runs (CART and the MLP were previously unseeded).
models = [
    ('LR', LogisticRegression()),
    ('SGD', SGDClassifier(random_state=seed)),
    ('LDA', LinearDiscriminantAnalysis()),
    ("QDA", QuadraticDiscriminantAnalysis()),
    ('CART', DecisionTreeClassifier(max_depth=10, max_features=10, random_state=seed)),
    ('NB', GaussianNB()),
    ("Neural Net", MLPClassifier(alpha=1, max_iter=1000, random_state=seed)),
]
models

In [None]:
results = []
names = []
# One seeded splitter shared by all models: every model is scored on the same folds.
kfold = KFold(n_splits=splits, shuffle=True, random_state=seed)
# scikit-learn expects a 1-D target; ravel() is a no-op if it already is one.
y_train_1d = np.ravel(y_train)
for name, model in models:
    start_time = timer(None)
    cv_results = cross_val_score(model, x_train, y_train_1d, cv=kfold, scoring=scorer)
    timer(start_time)
    results.append(cv_results)
    names.append(name)
    msg = "%s: %5.2f (%5.2f)" % (name, cv_results.mean()*100, cv_results.std()*100)
    print(msg)

# Column names follow `splits` instead of being hard-coded for exactly 5 folds.
cv_columns = ['CV%d' % (fold + 1) for fold in range(splits)]
results_df = pd.DataFrame(results, index=names, columns=cv_columns)
results_df['CV Mean'] = results_df[cv_columns].mean(axis=1)
# ddof=0 matches the NumPy std printed in the loop above (pandas defaults to ddof=1).
results_df['CV Std Dev'] = results_df[cv_columns].std(axis=1, ddof=0)
results_df.sort_values(by='CV Mean', ascending=False)*100

In [None]:
# Box plot of the per-fold cross-validation scores of every model.
title="Algorithms Comparison"
plot_model_comparison(names,results,title)

### The best model seems to be Gaussian Naive Bayes. Let's do a submission and check.

## Hyperparameter Tuning the Gaussian Naive Bayes Classifier

In [None]:
# Search space for GaussianNB: 100 var_smoothing values,
# logarithmically spaced from 1e0 down to 1e-9.
param_grid_nb = {
    'var_smoothing': np.logspace(0,-9, num=100)
}

In [None]:
start_time=timer(None)
# Tune on the same metric used for the model comparison (`scorer`, i.e. ROC AUC).
# Without `scoring`, GridSearchCV falls back to the estimator's default score,
# which is accuracy for classifiers.
nb_grid = GridSearchCV(estimator=GaussianNB(), param_grid=param_grid_nb, scoring=scorer, verbose=1, cv=10, n_jobs=-1)
# ravel() guarantees the 1-D target scikit-learn expects.
nb_grid.fit(x_train, np.ravel(y_train))
timer(start_time)
print(nb_grid.best_estimator_)

In [None]:
# Class predictions for the held-out validation split from the fitted grid search.
y_pred = nb_grid.predict(x_test)
y_pred

In [None]:
# Sanity check: predictions and validation labels should cover the same number of rows.
print(y_pred.shape,y_test.shape)

In [None]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# Report the threshold-based classification metrics on the validation split,
# one line per metric, in the order accuracy, precision, recall, f1.
for metric_fn, caption in (
    (accuracy_score, ": is the accuracy score"),
    (precision_score, ": is the precision score"),
    (recall_score, ": is the recall score"),
    (f1_score, ": is the f1 score"),
):
    print(metric_fn(y_test, y_pred), caption)

In [None]:
# Load the competition test set and the sample submission file.
testdf=pd.read_csv("../input/tabular-playground-series-sep-2021/test.csv")
sub_df=pd.read_csv("../input/tabular-playground-series-sep-2021/sample_solution.csv")

In [None]:
# Sanity check: shapes of the test features and the submission template.
print(testdf.shape,sub_df.shape)

In [None]:
# Remove the identifier column so only feature columns remain.
# Non-inplace drop with errors="ignore" makes the cell safe to re-run: the
# previous inplace drop raised KeyError once "id" had already been removed.
testdf = testdf.drop(columns="id", errors="ignore")
testdf.head(2)

In [None]:
# Apply the imputer and quantile transformer fitted on the training data.
# The returned arrays must be kept: discarding them only worked when
# scikit-learn happened to modify `testdf` in place, which is not guaranteed.
test_imputed = pd.DataFrame(si.transform(testdf), columns=testdf.columns, index=testdf.index)
# `testdf` now holds the fully pre-processed features used by the following cells.
# Re-running this cell alone would transform the data twice; reload test.csv first.
testdf = pd.DataFrame(qt.transform(test_imputed), columns=testdf.columns, index=testdf.index)
testdf.head(2)

In [None]:
# Test features as a plain NumPy array for prediction.
test=testdf.values

In [None]:
# Class-probability predictions for the test set: one row per sample,
# one column per class (columns follow the order of nb_grid.classes_).
preds = nb_grid.predict_proba(test)
print(preds.shape)
preds

In [None]:
# The submission must contain the probability of the positive class (claim == 1).
# predict_proba columns follow `classes_`, so look the column up instead of
# hard-coding it; column 0, used previously, is the probability of claim == 0
# and inverts the ranking used by ROC AUC.
positive_col = list(nb_grid.classes_).index(1)
sub_df['claim'] = preds[:, positive_col]
sub_df.head()

In [None]:
# Write the submission file to the working directory, without the DataFrame index.
sub_df.to_csv("submission_gnb.csv",index=False)

### Kindly let me know, with an upvote or a comment, whether the approach, transformers, imputers, and algorithm comparison were helpful to you; your feedback will improve my understanding.