In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for root_dir, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        full_path = os.path.join(root_dir, file_name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Libraries:

## For Pre-Processing:

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objs as go # interactive plotting library
import plotly.express as px # interactive plotting library
from plotly.subplots import make_subplots
!pip install RapidPlot  # Library that I created. Only Contains 1 classs with 4 functions till now ;)
import RapidPlot
from IPython.display import display
!pip install ppscore # installing ppscore, library used to check non-linear relationships between our variables
import ppscore as pps # importing ppscore
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

## For Handling Imbalance:

In [None]:
!pip install imblearn
from scipy.stats import describe
from imblearn.over_sampling import RandomOverSampler,SMOTE, ADASYN
from imblearn.under_sampling import RandomUnderSampler
from imblearn.metrics import classification_report_imbalanced
from imblearn.pipeline import Pipeline as imb_Pipeline
from imblearn.base import SamplerMixin

## For Model-Selection

In [None]:
# Models:

from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
!pip install catboost
from catboost import CatBoostClassifier

# Metrics

from sklearn.metrics import accuracy_score, roc_curve, f1_score, precision_score, recall_score, confusion_matrix


# Model Selection:

from sklearn.pipeline import Pipeline as sk_Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.base import BaseEstimator, TransformerMixin

# Library for plotting confusion matrix
from mlxtend.plotting import plot_confusion_matrix

#### Loading the dataset: 

In [None]:
# Read the Caravan Insurance Challenge CSV and preview its first rows.
csv_path = "../input/caravan-insurance-challenge/caravan-insurance-challenge.csv"
ci_df = pd.read_csv(csv_path)
ci_df.head()

#### Extracting Train and Test indices:

In [None]:
# Build boolean row masks for the train/test split encoded in the ORIGIN
# column, then drop that column so it is not part of the feature matrix.
# The guard keeps this cell re-runnable: once ORIGIN has been removed, a second
# run keeps the existing masks instead of failing on the missing column.
if 'ORIGIN' in ci_df.columns:
    train_index = ci_df['ORIGIN'] == 'train'
    test_index = ci_df['ORIGIN'] == 'test'
    ci_df = ci_df.drop(columns=['ORIGIN'])

# Pre-processing:

### Checking for null-values:

In [None]:
# Total number of missing cells over the whole frame (per-column counts, then summed).
ci_df.isna().sum().sum()

#### No null values

### Checking for Imbalance:

In [None]:
# Number of rows per value of the CARAVAN column.
ci_df.CARAVAN.value_counts()

In [None]:
# Standardize every column except CARAVAN and project the result onto its
# first two principal components for a 2-D view of the data.
scaler = StandardScaler()
# A fixed random_state keeps the projection reproducible in case scikit-learn
# selects its randomized SVD solver for this input size.
pca = PCA(n_components=2, random_state=42)
features_scaled = scaler.fit_transform(ci_df.drop(columns='CARAVAN'))
ci_df_red = pca.fit_transform(features_scaled)

In [None]:
# Scatter of the two principal components, one colour per CARAVAN value.
# The label is cast to str so plotly uses discrete colours instead of a
# continuous colour scale for the numeric column.
fig = px.scatter(
    x=ci_df_red[:, 0],
    y=ci_df_red[:, 1],
    color=ci_df.CARAVAN.astype(str),
    labels={'x': 'Principal component 1', 'y': 'Principal component 2'},
    title='First two principal components, coloured by CARAVAN',
)
fig.show()

Yup, we were correct: the data we have is highly skewed (imbalanced).

#### We will handle this imbalance with resampling before feeding the data to the model

## Performing Feature-Selection:

Remember to not use **PCA** for Feature-Selection.
Look at this blog for more info: https://towardsdatascience.com/pca-is-not-feature-selection-3344fb764ae6

### Checking Correlations:

We can perform feature selection using two methods:
    1. Pearson's Correlation
    2. PPscore (Predictive Power Score)
    
***Limitations of Pearson's corr:***
<br>
It is not useful for non-linear relationships (e.g. y = $x^2$); these are taken care of by PPscore
<br><br>
***Limitations of PPscore:***
<br>
Calculating PPscore for a large set of features takes a lot longer than Pearson's corr.<br>
Also, PPscore doesn't tell you about the type of relation (eg: directly proportional, inversely proportional), Hence, for linear relations, using Pearson's corr is a lot more useful

For more info check:<br>
https://towardsdatascience.com/rip-correlation-introducing-the-predictive-power-score-3d90808b9598

Go through the entire blog, to understand when and when not to use PPscore

In [None]:
# Pairwise Pearson correlation of all columns in ci_df, shown as a heatmap.
# To measure the execution time instead, replace the assignment below with:
#   %timeit ci_df_corr = ci_df.corr()
ci_df_corr = ci_df.corr()

fig = px.imshow(ci_df_corr)
fig.show()

#### We can clearly see two divides in the heatmap: upper-left & lower-right<br>
#### Let's zoom in

Keep the result of _timeit_ in mind, below we will also see for PPscore.

In [None]:
# Zoom into two square regions of the correlation matrix, side by side.
# The split position 43 is hard-coded; the trailing -1 leaves the last
# row/column of the matrix out of the right-hand panel.
fig, axs = plt.subplots(figsize=(20, 7), ncols=2)

sns.heatmap(ci_df_corr.iloc[:43, :43], cmap='YlGnBu', ax=axs[0])
sns.heatmap(ci_df_corr.iloc[43:-1, 43:-1], ax=axs[1])

axs[0].set_title('Upper Left')
axs[1].set_title('Lower Right')

# plt.show() renders through the active backend; Figure.show() emits a warning
# when the backend is non-interactive (as the notebook inline backend is).
plt.show()

We can see in the _"Lower Right"_ heatmap that features with names starting with 'A' and 'P' are very similar.<br> (e.g. $r$ = 0.9647738 for PBYSTAND & ABYSTAND, so we can obtain PBYSTAND from ABYSTAND or vice-versa)<br> Hence, having both present in the dataset is redundant.

Let's remove the ones starting with 'A' (psst, my name starts with 'P' :P)

In [None]:
# Keep every column whose name does not begin with 'A', restrict the
# correlation matrix to those rows/columns, and plot it.
starts_with_a = ci_df_corr.columns.str.startswith('A')
needed_columns = ci_df_corr.columns[~starts_with_a]

needed_corr = ci_df_corr.loc[needed_columns, needed_columns]

fig = px.imshow(needed_corr)
fig.show()

In [None]:
# Last row of the reduced correlation matrix without its final entry
# (assumes the target is the last column, so this drops its self-correlation — TODO confirm).
last_row = needed_corr.iloc[-1]
corr_with_target = last_row.iloc[:-1]
print(corr_with_target)
corr_with_target.max()

In [None]:
# Keep the entries whose absolute correlation is at least 0.05.
is_strong_enough = corr_with_target.abs() >= 0.05
pearson_feature_vector = corr_with_target[is_strong_enough]
print(pearson_feature_vector.shape)
pearson_feature_vector

##### Using Pearson's correlation, we got the above features as important. Now, let's check using PPscore

#### PPscore:

In [None]:
# PPS table computed by ppscore over all columns of ci_df.
# To measure the execution time instead, replace the assignment below with:
#   %timeit ci_df_pps = pps.matrix(ci_df)
ci_df_pps = pps.matrix(ci_df)

ci_df_pps

We can see that Pearson's corr is approx 700x faster than pps, hence, for very fat datasets (large number of features) take care in using PPscore

In [None]:
# Fold the flat ppscore column into a square, column-labelled matrix.
# NOTE(review): the row-major reshape presumably makes rows the predictor (x)
# and columns the predicted variable (y); this depends on the row order that
# pps.matrix emits — confirm before reading the matrix directionally.
n_cols = len(ci_df.columns)
pps_values = np.array(ci_df_pps.ppscore).reshape(n_cols, n_cols)
pps_val_matrix = pd.DataFrame(pps_values, index=ci_df.columns, columns=ci_df.columns)
pps_val_matrix

In [None]:
# Heatmap of the full PPS matrix.
fig = px.imshow(pps_val_matrix)
fig.show()

#### Important: Unlike Correlation matrix, PPscore matrix are not symmetric, the main reason for Invention of PPscore is to handle asymmetric nature of real-world data

Doing the same for columns with names starting with 'A' and 'P', like we did for Pearson's corr.

From this we can already see that PPscore is not looking good for our target variable.

In [None]:
# Restrict the PPS matrix to the columns kept in needed_columns and plot it.
needed_pps = pps_val_matrix.loc[needed_columns, needed_columns]

fig = px.imshow(needed_pps)
fig.show()

In [None]:
# PPS of every feature (x) for predicting the target CARAVAN (y).
# PPS is asymmetric, so the direction matters: slicing the CARAVAN *row* of the
# reshaped matrix gives the scores of CARAVAN predicting each feature instead.
# Reading the long-form table with explicit x/y labels removes any dependence
# on the matrix orientation or on CARAVAN being the last column.
predicts_target = (ci_df_pps['y'] == 'CARAVAN') & (ci_df_pps['x'] != 'CARAVAN')
pps_with_target = (
    ci_df_pps.loc[predicts_target]
    .set_index('x')['ppscore']
    .rename_axis(None)
    .rename('CARAVAN')
)
pps_with_target

In [None]:
# Largest value in pps_with_target.
pps_with_target.max()

In [None]:
# Keep only the entries with a non-zero score.
has_signal = pps_with_target.ne(0)
pps_feature_vector = pps_with_target[has_signal]
print(pps_feature_vector.shape)
pps_feature_vector

#### Now that we have selected the features using both methods - Pearson's corr and PPscore - we will feed these feature vectors, along with a vector containing all the features, into GridSearchCV. This is because it often happens that certain features add unnecessary noise to the model instead of contributing towards better prediction. The next step is Imbalance Handling.

# Handling Imbalance:

Before handling imbalance, let's first create train and test sets:

In [None]:
# Random seed passed as random_state to the resamplers created below.
rnd_state = 42

In [None]:
# Separate the CARAVAN target from the remaining columns, then select the
# train and test rows with the boolean masks built from the ORIGIN column.
y = ci_df['CARAVAN']
X = ci_df.drop(columns='CARAVAN')

X_train, y_train = X.loc[train_index], y.loc[train_index]
X_test, y_test = X.loc[test_index], y.loc[test_index]

In [None]:
# Print the shape of each split; 1-D objects (the targets) also get their value counts.
splits = {'X_train': X_train, 'y_train': y_train, 'X_test': X_test, 'y_test': y_test}
for split_name, split_data in splits.items():
    print(split_name)
    print(f'Shape: {split_data.shape}')
    if split_data.ndim != 1:
        continue
    print('Balance Stats:')
    print(split_data.value_counts())
    print('\n')

In [None]:
# Creating Instances:
# One seeded resampler per strategy. `n_jobs` is no longer passed to
# SMOTE/ADASYN: imbalanced-learn deprecated that argument (0.10) and newer
# releases reject it; it only affected speed, not the generated samples.

ros = RandomOverSampler(random_state=rnd_state)
rus = RandomUnderSampler(random_state=rnd_state)
smote_sampler = SMOTE(random_state=rnd_state)
adasyn_sampler = ADASYN(random_state=rnd_state)

In [None]:
# Handling Imbalance:
# Resample the training split with each strategy. `fit_resample` is the
# supported imbalanced-learn method; the old `fit_sample` alias was removed.

X_under, y_under = rus.fit_resample(X_train, y_train)
X_over, y_over = ros.fit_resample(X_train, y_train)
X_smote, y_smote = smote_sampler.fit_resample(X_train, y_train)
X_adasyn, y_adasyn = adasyn_sampler.fit_resample(X_train, y_train)

# In the paper on SMOTE it is suggested to first undersample, then use SMOTE, let's do that
# NOTE(review): `rus` is created with its default sampling strategy; if that
# already equalizes the classes, the two calls below have nothing left to
# synthesize — confirm whether a partial undersampling ratio was intended.
X_under_smote, y_under_smote = smote_sampler.fit_resample(X_under, y_under)
X_under_adasyn, y_under_adasyn = adasyn_sampler.fit_resample(X_under, y_under)

In [None]:
# One row of scipy.stats.describe statistics per target vector, labelled by strategy.
data_index = ['Unbalanced','Undersample','Oversample','SMOTE', 'ADASYN', 'Undersample then SMOTE', 'Undersample then ADASYN']
resampled_targets = [y_train, y_under, y_over, y_smote, y_adasyn, y_under_smote, y_under_adasyn]

imbalance_handling_stats = pd.DataFrame(
    [describe(target)._asdict() for target in resampled_targets],
    index=data_index,
)
imbalance_handling_stats

We will be creating a pipeline and feeding the different Oversampling Handlers to the Grid Search

# Model Selection:

#### We will also add a transformer which will manipulate the feature, according to Pearson's corr Feature vector, PPscore feature vector or complete feature vector.

In [None]:
class FeatureTransformer(BaseEstimator, TransformerMixin):
    """Pipeline step that keeps only a chosen subset of columns of ``X``.

    Parameters
    ----------
    feature_vector : column label(s) accepted by ``X[...]``, optional
        Columns to keep. With the default ``None`` the input is passed
        through unchanged (previously this raised ``KeyError``).
    """

    def __init__(self, feature_vector=None):
        self.feature_vector = feature_vector

    def fit(self, X, y=None):
        """Learn nothing and return ``self``.

        ``y`` is optional so the step can also be fitted without a target,
        which is how transformers are normally called.
        """
        return self

    def transform(self, X):
        """Return ``X`` restricted to ``feature_vector`` (all of ``X`` if it is ``None``)."""
        if self.feature_vector is None:
            return X
        return X[self.feature_vector]

In [None]:
# Seed shared by the candidate models. It has its own name so that it does not
# silently overwrite the notebook-wide `rnd_state` used by the resamplers
# (re-running the sampler cells afterwards would otherwise change their seed).
model_rnd_state = 3

# Candidate classifiers, all with default hyper-parameters.
model_list = [LogisticRegression(random_state=model_rnd_state),
              SGDClassifier(random_state=model_rnd_state),
              SVC(random_state=model_rnd_state),
              KNeighborsClassifier(),
              GaussianNB(),
              DecisionTreeClassifier(random_state=model_rnd_state),
              RandomForestClassifier(random_state=model_rnd_state),
              GradientBoostingClassifier(random_state=model_rnd_state),

              # The below three models are extremely powerful but require extensive hyperparameter tuning
              # Hence, they might not perform well here
              XGBClassifier(random_state=model_rnd_state),
              LGBMClassifier(random_state=model_rnd_state),
              CatBoostClassifier(random_state=model_rnd_state),
             ]

In [None]:
# Default pipeline: select columns -> over-sample -> under-sample ->
# standardize -> PCA -> classifier. The grid search replaces the 'over',
# 'feature_handler__feature_vector' and 'model' entries.
# The default samplers are seeded so the pipeline is reproducible on its own.
Main_Pipeline = imb_Pipeline([
    ('feature_handler', FeatureTransformer(list(pearson_feature_vector.index))),
    ('over', SMOTE(random_state=rnd_state)),
    ('under', RandomUnderSampler(random_state=rnd_state)),
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=0.99)),
    ('model', LogisticRegression()),
])

In [None]:
"""params_grid = [
    
    {
        'over': [ros, smote_sampler, adasyn_sampler],
        'feature_handler__feature_vector': list(map(list, [pearson_feature_vector.index, pps_feature_vector.index])),
        'pca__n_components': [1],
        'model': [LogisticRegression()], 
        'model__C': np.random.uniform(0.5, 2, 10),
        'model__max_iter': np.random.randint(1500, 2000, 2),
    },
    
    {
        'over': [ros, smote_sampler, adasyn_sampler],
        'feature_handler__feature_vector': list(map(list, [pearson_feature_vector.index, pps_feature_vector.index])),
        'pca__n_components': [1],
        'model': [SGDClassifier()],
        'model__alpha': np.random.uniform(0, 1, 10),
        'model__max_iter': np.random.randint(1500, 2000, 2),
    },
    
    {
        'over': [ros, smote_sampler, adasyn_sampler],
        'feature_handler__feature_vector': list(map(list, [pearson_feature_vector.index, pps_feature_vector.index])),
        'pca__n_components': [1],
        'model': [SVC()],
        'model__C': np.random.uniform(0, 2, 10),
        'model__kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 
    },
    
    {
        'over': [ros, smote_sampler, adasyn_sampler],
        'feature_handler__feature_vector': list(map(list, [pearson_feature_vector.index, pps_feature_vector.index])),
        'pca__n_components': [1],
        'model': [KNeighborsClassifier()],
        'model__n_neighbors': np.random.randint(2, 10, 7),
        'model__weights': ['uniform', 'distance'],
    },
    
    {
        'over': [ros, smote_sampler, adasyn_sampler],
        'feature_handler__feature_vector': list(map(list, [pearson_feature_vector.index, pps_feature_vector.index])),
        'pca__n_components': [1],
        'model': [GaussianNB()],
    },
 
    {
        'over': [ros, smote_sampler, adasyn_sampler],
        'feature_handler__feature_vector': list(map(list, [pearson_feature_vector.index, pps_feature_vector.index])),
        'pca__n_components': [1],
        'model': [DecisionTreeClassifier()],
        'model__criterion': ['gini', 'entropy'],
        # 'model__max_iter' dropped: DecisionTreeClassifier has no max_iter parameter
        'model__max_depth': np.r_[np.random.randint(5, 15, 7), None],
    },
    
    {
        'over': [ros, smote_sampler, adasyn_sampler],
        'feature_handler__feature_vector': list(map(list, [pearson_feature_vector.index, pps_feature_vector.index])),
        'pca__n_components': [1],
        'model': [RandomForestClassifier()],
        'model__n_estimators': np.random.randint(90, 200, 10),
        'model__criterion': ['gini', 'entropy'],
        'model__max_depth': np.r_[np.random.randint(5, 15, 7), None],
    },
    
    {
        'over': [ros, smote_sampler, adasyn_sampler],
        'feature_handler__feature_vector': list(map(list, [pearson_feature_vector.index, pps_feature_vector.index])),
        'pca__n_components': [1],
        'model': [GradientBoostingClassifier()],
        'model__loss': ['deviance', 'exponential'],
        'model__n_estimators': np.random.randint(90, 200, 10),
        'model__learning_rate': np.random.uniform(0.00001, 1, 5),     # Remember, There is trade-off between learning_rate and n_estimators
    },
    
    {
        'over': [ros, smote_sampler, adasyn_sampler],
        'feature_handler__feature_vector': list(map(list, [pearson_feature_vector.index, pps_feature_vector.index])),
        'pca__n_components': [1],
        'model': [XGBClassifier()],
    },
    
    {
        'over': [ros, smote_sampler, adasyn_sampler],
        'feature_handler__feature_vector': list(map(list, [pearson_feature_vector.index, pps_feature_vector.index])),
        'pca__n_components': [1],
        'model': [LGBMClassifier()],
    },
    
    {
        'over': [ros, smote_sampler, adasyn_sampler],
        'feature_handler__feature_vector': list(map(list, [pearson_feature_vector.index, pps_feature_vector.index])),
        'pca__n_components': [1],
        'model': [CatBoostClassifier()],
    }
]"""


# param_grid for fast execution:
# every over-sampler x every feature set (all columns, Pearson, PPS) x every model.
feature_sets = [
    list(columns)
    for columns in (X.columns, pearson_feature_vector.index, pps_feature_vector.index)
]

params_grid = [
    {
        'over': [ros, smote_sampler, adasyn_sampler],
        'feature_handler__feature_vector': feature_sets,
        'model': model_list,
    },
]


In [None]:
# Grid search over the pipeline variants, scored by F1 with 2-fold cross-validation.
main_grid_f1 = GridSearchCV(
    estimator=Main_Pipeline,
    param_grid=params_grid,
    scoring='f1',
    cv=2,
    verbose=2,
)
main_grid_f1.fit(X_train, y_train)

**Ques: What if n_components=1 in PCA?<br>
Ans: All the models except SVC(kernel='precomputed') will work perfectly fine. This is because 'precomputed' kernel requires a square matrix.**

In [None]:
# Pipeline selected by the F1-scored grid search.
main_grid_f1.best_estimator_

In [None]:
# Cross-validation results of the F1-scored grid search as a DataFrame.
f1_grid_results = pd.DataFrame(main_grid_f1.cv_results_)

In [None]:
# Display the F1 grid-search results table.
f1_grid_results

In [None]:
# Test-set predictions from the F1-scored grid search: print F1, display accuracy.
y_pred = main_grid_f1.predict(X_test)
test_f1 = f1_score(y_test, y_pred)
print(test_f1)
accuracy_score(y_test, y_pred)

In [None]:
# Confusion matrix of the current test-set predictions, drawn with mlxtend.
confu_matr = confusion_matrix(y_true=y_test, y_pred=y_pred)
plot_confusion_matrix(conf_mat=confu_matr);

In [None]:
# Same grid search as before, but scored by accuracy instead of F1.
main_grid_acc_s = GridSearchCV(
    estimator=Main_Pipeline,
    param_grid=params_grid,
    scoring='accuracy',
    cv=2,
    verbose=2,
)
main_grid_acc_s.fit(X_train, y_train)

In [None]:
# Pipeline selected by the accuracy-scored grid search.
main_grid_acc_s.best_estimator_

In [None]:
# Cross-validation results of the accuracy-scored grid search as a DataFrame.
acc_grid_result = pd.DataFrame(main_grid_acc_s.cv_results_)

In [None]:
# Test-set predictions from the accuracy-scored grid search
# (this overwrites the earlier y_pred), followed by their accuracy.
y_pred = main_grid_acc_s.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Confusion matrix of the current test-set predictions, drawn with mlxtend.
confu_matr = confusion_matrix(y_true=y_test, y_pred=y_pred)
plot_confusion_matrix(conf_mat=confu_matr);

In [None]:
# F1 score of the current y_pred on the test set.
f1_score(y_test, y_pred)

The F1 score for the model selected using 'accuracy' as the metric is abysmal. This is because the dataset is severely imbalanced, which causes poor sampling even if we use SMOTE and ADASYN.

### THANK YOU!