In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# OBJECTIVE : 

[1. Check usability of Complex Models in a Production scenario.](https://www.kaggle.com/nishantrock/do-complex-models-really-serve-our-purpose/edit/run/66865909)

[2. Check the effect of different metrics while refitting a model.](https://www.kaggle.com/nishantrock/do-complex-models-really-serve-our-purpose/edit/run/66865909)
   ( Using the Matthews correlation coefficient and F1 score to balance out false negatives and false positives. )
   
[3. Using Transformations to tweak individual columns and use them with complex models.](https://www.kaggle.com/nishantrock/do-complex-models-really-serve-our-purpose/edit/run/66865909)

### Few Observations : 

- I tried both Linear and Complex models. 
Linear models gave me a balanced score with reference to Confusion Matrix. 
Complex models such as XGBoost and Random Forest, and ensembles of simple models, gave me much more skewed predictions. 
Either the false-positive or the false-negative share (called "TN" in this notebook) increased significantly, even though my score improved considerably. 

- I tried refitting the models with respect to metrics such as the Matthews correlation coefficient and F1 score, to balance the false-positive and false-negative predictions.
It gave me better predictions, but it did not improve my score.

- I transformed the individual features via pipelines but still was not able to achieve a good score via Linear Model. 

My objective is to use the Linear Model since the Complex Model give me a skewed predictions. 


# IMPORTANT THOUGHT : 

Metrics are for Humans to interpret the model. 
Loss functions are for Computers / Model to interpret how they are doing. 
Try factoring that in. 

In [None]:
pip install --upgrade scikit-learn


In [None]:
pip install sklego

In [None]:
import seaborn as sns
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt
from sklearn.metrics import plot_confusion_matrix

In [None]:
# Read the competition's train and test CSVs into DataFrames.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/jobathon-may-2021-credit-card-lead-prediction/train.csv',
        '/kaggle/input/jobathon-may-2021-credit-card-lead-prediction/test.csv',
    )
)

In [None]:
train.columns, test.columns

In [None]:
train.nunique()

In [None]:
import pandas_profiling as pp

# Build a pandas-profiling report over the raw training frame.
profile = pp.ProfileReport(train)
# profile.to_file("Train.html")
# Leaving `profile` as the last expression lets the notebook render the report inline.
profile

# Let's check which columns are correlated when our final output is '1', i.e. the row is a Lead.

Also label encoding it , so that we can check with all the columns. 

In [None]:
from sklearn import preprocessing
import pandas_profiling as pp

# Take an explicit copy of the Is_Lead == 1 rows. Writing encoded columns into the
# bare `.loc` slice triggers pandas' SettingWithCopyWarning and makes it unclear
# whether `train` itself is being modified.
train_1 = train.loc[train['Is_Lead'] == 1].copy()

# Label-encode every object column except ID (one fresh encoder per column) so that
# the categorical columns can take part in the report's correlation analysis.
for col in train_1.drop('ID', axis = 1).select_dtypes('object').columns:
    train_1[col] = preprocessing.LabelEncoder().fit_transform(train_1[col])

profile_1 = pp.ProfileReport(train_1, title = "Profiling the Is_Lead = 1 segment", explorative = True)
#profile_1 = pp.ProfileReport(interactions = {interactions.targets : train_1['Is_Lead']})
profile_1

### 5 columns have high correlation with the target column when "WE HAVE A LEAD":
1. Credit_Product
2. Gender
3. Channel_Code
4. Is_Active
5. Occupation

In [None]:
train['Is_Lead'].value_counts()

We have a skewed target 

In [None]:
# Negative-to-positive ratio of the target, truncated to an integer; used later as
# the weight of class 1.
lead_counts = train['Is_Lead'].value_counts()
class_weight = int(lead_counts[0] / lead_counts[1])

In [None]:
class_weight

### Imbalance between the 2 target Classes . 0:1 Equivalent to 3:1

In [None]:
# Save the initial state of dataframe
train_df = train.copy(deep = True)

In [None]:
# Label Encoding both Train and Test Dataset

from sklearn import preprocessing 

cat_columns = []

for col in train.drop('ID', axis = 1).select_dtypes('object').columns:
    print('Train:',col)
    cat_columns.append(col)
    le = preprocessing.LabelEncoder()
    # Fit on the union of train and test values: an encoder fitted on train alone
    # raises ValueError in `transform` for any category that only appears in test.
    # When test has no extra categories the resulting codes are unchanged.
    le.fit(pd.concat([train[col], test[col]], ignore_index = True))
    train[col] = le.transform(train[col])
    test[col] = le.transform(test[col])

# Positions of the encoded categorical columns within `train.columns`
# (this still counts the ID and Is_Lead columns).
cat_features_index = [i for i, col in enumerate(train.columns) if col in cat_columns]



In [None]:
train.head().style.background_gradient(cmap = "Blues")

In [None]:
from sklearn.model_selection import train_test_split
from sklearn import linear_model 


# Target column, and the feature matrix made of everything except the target and the row ID.
y = train['Is_Lead']
X = train.drop(columns = ['Is_Lead', 'ID'])

# Stratified hold-out split (33% test) with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size = 0.33,
    stratify = y,
    random_state = 42,
)

In [None]:
train.columns

## I'll be using Patsy later to generate Feature Interactions between correlated columns

In [None]:
from sklego.preprocessing import PatsyTransformer

# Formula, read with patsy semantics (TODO confirm against the patsy docs): `(...)**2` expands the five
# lead-correlated columns into main effects plus pairwise interactions; `-1` drops the intercept and
# `-(...)` removes the main effects again, leaving only the interactions. Age, Region_Code, Vintage
# and log(Avg_Account_Balance) are added as plain terms.
pt = PatsyTransformer("(C(Credit_Product)+Gender+Channel_Code+Is_Active+Occupation)**2-1-(C(Credit_Product)+Gender+Channel_Code+Is_Active+Occupation) + Age + Region_Code + Vintage + np.log(Avg_Account_Balance )")
# Fit on the training split and display the transformed training data.
pt.fit(X_train, y_train).transform(X_train)

In [None]:
from functools import wraps
import datetime as dt

def log_step(func):
    """Decorator that logs a pipeline step's name, output shape and duration.

    The wrapped function is called unchanged. Afterwards one line is printed with
    the function name, the `.shape` of its result (``None`` when the result has no
    `shape` attribute) and the elapsed wall-clock time in seconds.

    Parameters
    ----------
    func : callable
        The step to wrap; typically takes and returns a DataFrame.

    Returns
    -------
    callable
        A wrapper with the same signature and metadata as `func` that returns
        `func`'s result untouched.
    """

    @wraps(func)
    def wrapper(*args, **kwargs):
        tic = dt.datetime.now()
        result = func(*args, **kwargs)
        # Report real seconds: the original appended "s" to a timedelta string
        # such as "0:00:00.0123", which is not a number of seconds.
        elapsed_seconds = (dt.datetime.now() - tic).total_seconds()
        # Don't let logging crash a step whose result has no `.shape`.
        shape = getattr(result, "shape", None)
        print(f"just ran step {func.__name__} shape = {shape} took {elapsed_seconds}s")
        return result
    return wrapper

@log_step
def start_pipeline(dataf):
    """First step of a `.pipe` chain: return a copy of `dataf` so later steps work on the copy."""
    snapshot = dataf.copy()
    return snapshot

@log_step
def corelation_target(data, target):
    
    """
    Draw a one-column heatmap of every feature's correlation with `target`,
    sorted in descending order.

    Parameters
    ----------
    data : DataFrame containing the `target` column.
    target : name of the (numeric) target column.

    Returns
    -------
    `data`, unchanged, so the function can be used inside a `.pipe` chain.
    """
    plt.figure(figsize = (8, 12))

    # Correlate numeric columns only. String columns (e.g. an ID) cannot be correlated
    # and make `DataFrame.corr()` raise on recent pandas versions; older versions
    # silently dropped them, so the result is the same there.
    target_corr = (data.select_dtypes(include = 'number')
                       .corr()[[target]]
                       .drop(index = target)   # drop the target's correlation with itself
                       .sort_values(by = target, ascending = False))

    heatmap = sns.heatmap(target_corr,
                         vmin = -1,
                         vmax = 1, 
                         annot = True, 
                         cmap = 'BrBG')

    heatmap.set_title(f"Features Correlating with {target} column", 
                      fontdict = {'fontsize':18}, pad = 16)
    
    return data


@log_step
def corelation_horizontal_target(data, target):
    
    """
    Draw a horizontal bar plot of each feature's correlation with `target`.

    Parameters
    ----------
    data : DataFrame containing the `target` column.
    target : name of the (numeric) target column.

    Returns
    -------
    `data`, unchanged, so the function can be used inside a `.pipe` chain.
    """
    # Numeric columns only: `corr()` raises on string columns with recent pandas.
    # The target's own row (correlation 1 with itself) is dropped by label.
    corr = data.select_dtypes(include = 'number').corr()[[target]].drop(index = target)

    plt.figure(figsize=(10, 12))
    plt.barh(corr.index, corr[target].to_numpy())
    plt.title("Corelation with target")
    # The original opened a second, empty figure here before `plt.show()`,
    # which rendered a blank 12x22 canvas under the bar plot.
    plt.show()
    
    return data





In [None]:
# Copy `train`, then draw both correlation plots against Is_Lead. Each plotting step returns
# the frame it was given, so `train_d` ends up as the copy made by `start_pipeline`.
train_d = (train
           .pipe(start_pipeline)
           .pipe(corelation_target, target = 'Is_Lead')
           .pipe(corelation_horizontal_target, target = 'Is_Lead')
          )

### Generating Features using SKLEGO Patsy Transformer . 

#### The generated features are built from the 5 features that affect 'Is_Lead = 1', as found in the Pandas Profiling report

1. Credit_Product
2. Gender
3. Channel_Code
4. Is_Active
5. Occupation

# Using Basic and Complex Models

In [None]:
from sklearn.model_selection import train_test_split
from sklearn import linear_model 

# Re-creates the same feature/target definition and the same stratified 33% hold-out split
# (seed 42) as the earlier split cell.
X = train.drop(['Is_Lead', 'ID'], axis = 1)
y = train['Is_Lead']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, random_state = 42, stratify = y)

In [None]:
# Class-weighted logistic-regression baseline. It is fitted once; the original fitted the
# same estimator twice.
model_lr = linear_model.LogisticRegression(solver = 'liblinear',class_weight = {0:1, 1: class_weight})
model_lr.fit(X_train, y_train)

# roc_auc_score expects (y_true, y_score); the original passed them reversed, and the value
# was never shown because it was not the cell's last expression.
# NOTE: as elsewhere in this notebook the score is computed on hard 0/1 predictions;
# `predict_proba(X_test)[:, 1]` would give the conventional ranking AUC.
print("ROC AUC:", roc_auc_score(y_test, model_lr.predict(X_test)))

# Confusion matrix on the hold-out split; normalize='true' scales each true-class row to sum to 1.
plot_confusion_matrix(model_lr, X_test, y_test,
                     display_labels = np.unique(y),
                     cmap = plt.cm.Blues,
                     normalize = 'true')

In [None]:
from sklearn import linear_model
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklego.preprocessing import PatsyTransformer

# Patsy interaction features -> standardisation -> class-weighted logistic regression.
pipe_lr = Pipeline([
    ("patsy", PatsyTransformer("(C(Credit_Product)+Gender+Channel_Code+Is_Active+Occupation)**2-1-(C(Credit_Product)+Gender+Channel_Code+Is_Active+Occupation) + Age + Region_Code + Vintage + np.log(Avg_Account_Balance )")),
    ("scale", StandardScaler()),
    ("model", linear_model.LogisticRegression(class_weight = {0:1, 1: class_weight})
    )
])

# roc_auc_score takes (y_true, y_score); the original call had the two reversed.
roc_auc_score(y_test, pipe_lr.fit(X_train, y_train).predict(X_test))

In [None]:
# Row-normalised confusion matrix of the Patsy + logistic-regression pipeline on the hold-out split.
plot_confusion_matrix(
    pipe_lr, X_test, y_test,
    normalize = 'true',
    cmap = plt.cm.Blues,
    display_labels = np.unique(y),
)

As per the confusion matrix, compared with the other algorithms this model gives the fewest FP (False Positives) and the most FN (False Negatives, i.e. actual leads that are classified incorrectly).

Positive is classified incorrectly, while the negative is somewhat better classified as negative. 

In [None]:
from sklego.mixture import GMMClassifier

# Patsy interaction features (log balance included) -> standardisation -> GMM classifier.
pipe_GM = Pipeline([
    ("patsy", PatsyTransformer("(C(Credit_Product)+Gender+Channel_Code+Is_Active+Occupation+np.log(Avg_Account_Balance))**2")),
    ("scale", StandardScaler()),
    ("model", GMMClassifier(n_components = 4)
    )
])

pred = pipe_GM.fit(X_train, y_train).predict(X_test)

# roc_auc_score takes (y_true, y_score); the original call had the two reversed.
roc_auc_score(y_test, pred)

In [None]:
# Row-normalised confusion matrix of the GMM pipeline on the hold-out split.
plot_confusion_matrix(
    pipe_GM, X_test, y_test,
    normalize = 'true',
    cmap = plt.cm.Blues,
    display_labels = np.unique(y),
)

## Interesting. 

### The Gaussian Mixture Model (SKLEGO) gives a good extra bump in the score, but my confusion matrix is skewed.

In [None]:
from sklearn.ensemble import StackingClassifier

# Base learners whose outputs are combined by an SGD linear classifier.
estimators = [
             ('Logistic Regression', linear_model.LogisticRegression(class_weight = {0:1, 1: class_weight}, solver = 'liblinear')),
             ('GMM', GMMClassifier(n_components = 4))
             ]

model_stack = StackingClassifier(
                            estimators = estimators, 
                            final_estimator = linear_model.SGDClassifier()
                        )
pred = model_stack.fit(X_train, y_train).predict(X_test)

# roc_auc_score takes (y_true, y_score); the original call had the two reversed.
roc_auc_score(y_test, pred)

In [None]:
# Row-normalised confusion matrix of the stacking classifier on the hold-out split.
plot_confusion_matrix(
    model_stack, X_test, y_test,
    normalize = 'true',
    cmap = plt.cm.Blues,
    display_labels = np.unique(y),
)

Predictions have improved a bit , and my roc_auc score has reduced

In [None]:
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn import ensemble

# Histogram-based gradient boosting on the raw label-encoded features.
model_h = ensemble.HistGradientBoostingClassifier(
                                     scoring = 'roc_auc',
                                     warm_start = True,
                                     ).fit(X_train, y_train)

pred = model_h.predict(X_test)

# roc_auc_score takes (y_true, y_score); the original call had the two reversed.
roc_auc_score(y_test, pred)

In [None]:
# Row-normalised confusion matrix of the HistGradientBoosting model on the hold-out split.
plot_confusion_matrix(
    model_h, X_test, y_test,
    normalize = 'true',
    cmap = plt.cm.Blues,
    display_labels = np.unique(y),
)

### A good bump with HistGradientBoostingClassifier on roc_auc score. 

HistGradientBoostingClassifier is modelled on LightGBM. 
My 'roc_auc' score definitely increases but my predictions are more skewed. 

In [None]:
# Standardise, then histogram-based gradient boosting.
# The estimator is passed in unfitted: the original called `.fit(X_train, y_train)` on it
# inside the Pipeline definition, training it on unscaled data first. Because
# `warm_start = True`, the pipeline's own fit then continued from that pre-fitted state
# instead of starting fresh on the scaled features.
model_hs = Pipeline([
   # ("patsy", PatsyTransformer("(C(Credit_Product)+Gender+Channel_Code+Is_Active+Occupation+np.log(Avg_Account_Balance))**2")),
    ("scale", StandardScaler()),
    ("model", ensemble.HistGradientBoostingClassifier(
                                     scoring = 'roc_auc',
                                     warm_start = True,
                                     )
    )
]).fit(X_train, y_train)

pred = model_hs.predict(X_test)

# roc_auc_score takes (y_true, y_score); the original call had the two reversed.
roc_auc_score(y_test, pred)

In [None]:
# Row-normalised confusion matrix of the scaled HistGradientBoosting pipeline on the hold-out split.
plot_confusion_matrix(
    model_hs, X_test, y_test,
    normalize = 'true',
    cmap = plt.cm.Blues,
    display_labels = np.unique(y),
)

Using Patsy roc_auc is 0.5 that is 50 %. 
Without Patsy it still gives a good score. 


But my predictions are still skewed.

In [None]:
from sklearn import neighbors
# Patsy interaction features -> standardisation -> class-weighted SGD linear classifier.
pipe_sgd = Pipeline([
    ("patsy", PatsyTransformer("(C(Credit_Product)+Gender+Channel_Code+Is_Active+Occupation)**2")),
    ("scale", StandardScaler()),
    ("model", linear_model.SGDClassifier(class_weight = {0:1, 1: class_weight})
    )
])

# roc_auc_score takes (y_true, y_score); the original call had the two reversed.
roc_auc_score(y_test, pipe_sgd.fit(X_train, y_train).predict(X_test))

In [None]:
# Row-normalised confusion matrix of the SGD pipeline on the hold-out split.
plot_confusion_matrix(
    pipe_sgd, X_test, y_test,
    normalize = 'true',
    cmap = plt.cm.Blues,
    display_labels = np.unique(y),
)

They give a much better outcome as per Confusion Matrix , but lesser score as per ROC_AUC

## Random Forest Without Class-weight

In [None]:
from sklearn import ensemble

# Seeded so that the comparison with the class-weighted forest below is repeatable;
# an unseeded forest gives a slightly different score on every run.
clf_rf = ensemble.RandomForestClassifier(n_estimators = 300, random_state = 42)

clf_rf.fit(X_train, y_train)
pred = clf_rf.predict(X_test)

# roc_auc_score takes (y_true, y_score); the original call had the two reversed.
roc_auc_score(y_test, pred)

In [None]:
# Row-normalised confusion matrix of the unweighted random forest on the hold-out split.
plot_confusion_matrix(
    clf_rf, X_test, y_test,
    normalize = 'true',
    cmap = plt.cm.Blues,
    display_labels = np.unique(y),
)

## Random Forest with Class-weight

In [None]:
# Same forest with the positive class up-weighted; seeded so that small score differences
# against the unweighted forest are not just run-to-run noise.
clf_rfw = ensemble.RandomForestClassifier(n_estimators = 300, class_weight = {0:1, 1: class_weight}, random_state = 42)

clf_rfw.fit(X_train, y_train)

pred = clf_rfw.predict(X_test)

# roc_auc_score takes (y_true, y_score); the original call had the two reversed.
roc_auc_score(y_test, pred)

In [None]:
# Row-normalised confusion matrix of the class-weighted random forest on the hold-out split.
plot_confusion_matrix(
    clf_rfw, X_test, y_test,
    normalize = 'true',
    cmap = plt.cm.Blues,
    display_labels = np.unique(y),
)

Score bumps up by 0.001

## XGB without Class-weight

In [None]:
from xgboost import XGBClassifier

In [None]:
# XGBoost with default settings and a fixed seed.
xgb = XGBClassifier(seed = 42)

xgb.fit(X_train, y_train)

pred = xgb.predict(X_test)

# roc_auc_score takes (y_true, y_score); the original call had the two reversed.
roc_auc_score(y_test, pred)

In [None]:
# Row-normalised confusion matrix of the unweighted XGBoost model on the hold-out split.
plot_confusion_matrix(
    xgb, X_test, y_test,
    normalize = 'true',
    cmap = plt.cm.Blues,
    display_labels = np.unique(y),
)

## XGB with Class-Weight

In [None]:
# XGBoost with the positive class up-weighted via scale_pos_weight.
xgb_w = XGBClassifier(scale_pos_weight = class_weight,seed = 42)

xgb_w.fit(X_train, y_train)

pred = xgb_w.predict(X_test)

# roc_auc_score takes (y_true, y_score); the original call had the two reversed.
roc_auc_score(y_test, pred)

In [None]:
# Row-normalised confusion matrix of the class-weighted XGBoost model on the hold-out split.
plot_confusion_matrix(
    xgb_w, X_test, y_test,
    normalize = 'true',
    cmap = plt.cm.Blues,
    display_labels = np.unique(y),
)

### ROC_AUC bumps down by 0.7 . Check why 

# Observation : 

## *******************

1. As we start using complex models , the 'ROC_AUC' score definitely bumps up , but we find that it is incorrectly classifying negative data as positive. 
   This can be seen in:
*    XGB without Class weight 
* Random Forest (Generally)
* HistGradientBoostingClassifer ( Similar to LightGBM )
* ensemble Classifier

2. As we move to complex models, the 01 cell (true 0 predicted as 1, i.e. False Positives) is reduced substantially,
   but the 10 cell (true 1 predicted as 0, i.e. False Negatives) does increase. 

3. A simple Logistic Regression is the only model that gives us Balanced values between both 01 and 10 i.e. 0.23 and 0.24

# What do we do now ? 

![](https://media.giphy.com/media/lraXagM6W2ae23iiRC/giphy-downsized.gif)

## If these models are deployed in production, we would need extra tests to curb the 10 cases, i.e. False Negatives.

### COMPLEX MODELS GIVE A GOOD SCORE BUT THEY ARE NOT USEFUL in Production

**Is this due to imbalance ? That more True values are classified negatively**

For imbalanced classes , ROC_AUC score is not a useful metric

As per below discussion by CPMP, SMOTE does not provide a bump in the performance of the model. 
Label and Prediction Smoothing does provide an improvement . 

https://www.kaggle.com/c/lish-moa/discussion/191545

Let's take it again from the beginning

In [None]:
# Re-read the raw CSVs, replacing the label-encoded `train` / `test` frames with fresh ones.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/jobathon-may-2021-credit-card-lead-prediction/train.csv',
        '/kaggle/input/jobathon-may-2021-credit-card-lead-prediction/test.csv',
    )
)

In [None]:
# Label Encoding both Train and Test Dataset

from sklearn import preprocessing 

cat_columns = []

for col in train.drop('ID', axis = 1).select_dtypes('object').columns:
    print('Train:',col)
    cat_columns.append(col)
    le = preprocessing.LabelEncoder()
    # Fit on the union of train and test values: an encoder fitted on train alone
    # raises ValueError in `transform` for any category that only appears in test.
    # When test has no extra categories the resulting codes are unchanged.
    le.fit(pd.concat([train[col], test[col]], ignore_index = True))
    train[col] = le.transform(train[col])
    test[col] = le.transform(test[col])

# Positions of the encoded categorical columns within `train.columns`
# (this still counts the ID and Is_Lead columns).
cat_features_index = [i for i, col in enumerate(train.columns) if col in cat_columns]



In [None]:
train.columns

In [None]:
train.nunique()

In [None]:
import seaborn as sns
# One row of plots with Is_Lead on the y-axis against the other columns (ID excluded).
sns.pairplot(train.drop('ID', axis = 1), y_vars = 'Is_Lead')

In [None]:
# USEFUL TOOL
train['Avg_Account_Balance'].value_counts(bins = 6)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn import linear_model 

# Target column, and the feature matrix made of everything except the target and the row ID.
y = train['Is_Lead']
X = train.drop(columns = ['Is_Lead', 'ID'])

# Stratified hold-out split, this time with 40% held out, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size = 0.4,
    stratify = y,
    random_state = 42,
)

## Generating Feature Interaction using PatsyTransformer

In [None]:
from sklearn import linear_model
from sklearn.pipeline import Pipeline
from sklearn import preprocessing
from sklego.preprocessing import PatsyTransformer
from sklego.mixture import GMMClassifier


# Weight of class 1, hard-coded here (this overrides the value computed from the target counts earlier).
class_weight = 3

# Patsy interaction features (log Age, log balance) -> robust scaling -> class-weighted logistic regression.
pipe_lrn = Pipeline([
    ("patsy", PatsyTransformer("(C(Credit_Product)+Gender+Channel_Code+Is_Active+Occupation)**2-1-(C(Credit_Product)+Gender+Channel_Code+Is_Active+Occupation) + np.log(Age) + Region_Code + Vintage + np.log(Avg_Account_Balance )")),
   # ("Interaction", preprocessing.PolynomialFeatures()),  # Fit the attributes to a normal Distribution
    ("Normalize", preprocessing.RobustScaler()),  # Robust to outliers
    ("model", linear_model.LogisticRegression(class_weight = {0:1, 1: class_weight}, solver = 'liblinear', max_iter = 1000))
])

pred = pipe_lrn.fit(X_train, y_train).predict(X_test)
# roc_auc_score takes (y_true, y_score); the original call had the two reversed.
roc_auc_score(y_test, pred)

In [None]:
# Row-normalised confusion matrix of `pipe_lrn` on the hold-out split.
plot_confusion_matrix(
    pipe_lrn, X_test, y_test,
    normalize = 'true',
    cmap = plt.cm.Blues,
    display_labels = np.unique(y),
)

In [None]:
from sklearn import set_config                      # to change the display
from sklearn.utils import estimator_html_repr       # to save the diagram into HTML format
# Import from the public `IPython.display` module; importing `display` from
# `IPython.core.display` is deprecated.
from IPython.display import display, HTML           # to visualize pipeline

# Render estimators as an HTML diagram and show the one for `pipe_lrn`.
set_config(display='diagram')
display(HTML(estimator_html_repr(pipe_lrn)))#

### Below are the Estimators we can use in Grid Search

In [None]:
pipe_lrn.get_params()

# Using different metrics for refitting to tune the model

**Using the Matthews correlation coefficient, as it takes all four confusion-matrix cells into account [TN, TP, FN, FP].**

In [None]:
from sklearn import model_selection
from sklearn.metrics import roc_auc_score, make_scorer, matthews_corrcoef
from sklearn.experimental import enable_halving_search_cv


# Successive-halving searches are single-metric: candidates are ranked by `scoring`, and
# `refit` is only an on/off switch. The original `refit = 'matthews_corrcoef'` was merely
# truthy, so the model was silently selected on ROC AUC. To tune for the Matthews
# correlation coefficient, as this section intends, MCC has to be the scoring function.
mcc_scorer = make_scorer(matthews_corrcoef)

grid_search1 = model_selection.HalvingRandomSearchCV(pipe_lrn , 
                                           param_distributions = {
                                                   #       'Interaction__degree' : [2,3],
                                                          'model__C': [100, 10, 1.0, 0.1, 0.01],
                                                          # NOTE: scikit-learn documents warm_start as useless for the liblinear solver.
                                                          'model__warm_start':[False, True],
                                               #           'Interaction__include_bias':[False,True],
                                                    #      'Interaction__interaction_only':[True, False],
                                                          'model__class_weight': [{0: 1, 1: v} for v in np.linspace(1,20, 30)]
                                                         },
                                           aggressive_elimination = True,
                                           n_jobs = -1,
                                           scoring = mcc_scorer,
                                           refit = True,
                                           random_state = 42,   # make the random candidate draw repeatable
                                           verbose = 2,
                                           cv = 3)
grid_search1.fit(X_train, y_train)

In [None]:
pred = grid_search1.predict(X_test)
# roc_auc_score takes (y_true, y_score); the original call had the two reversed, and its
# value was discarded because it was not the cell's last expression.
print("ROC AUC:", roc_auc_score(y_test, pred))

# Row-normalised confusion matrix of the refitted best estimator on the hold-out split.
plot_confusion_matrix(grid_search1, X_test, y_test,
                     display_labels = np.unique(y),
                     cmap = plt.cm.Blues,
                     normalize = 'true'
                     )

In [None]:
# roc_auc_score takes (y_true, y_score); the original call had the two reversed.
roc_auc_score(y_test, pred)

ROC_AUC score has bumped down. 

### Considering the Predictions TP is damn good, but it also has a high prediction for TN . 

In [None]:
pd.DataFrame(grid_search1.cv_results_)

My training score is also pretty good. 

A bit of new trial, 

Refitting on 'ROC_AUC' and 'f1_score' to give a 

In [None]:
%%time

param_grid = dict(C = [100,10,1,0.1,0.01])
grid_search1 = model_selection.HalvingGridSearchCV(pipe_lrn , 
                                           param_grid = {
                                                      #    'Interaction__degree' : [2,3],
                                                          'model__C': [100, 10, 1.0, 0.1, 0.01],
                                                          'model__warm_start':[False, True],
                                               #           'Interaction__include_bias':[False,True],
                                            #             'Interaction__interaction_only':[True, False],
                                                          'model__class_weight': [{0: 1, 1: v} for v in np.linspace(1,20, 30)]
                                                         },
                                           aggressive_elimination = True,
                                           n_jobs = -1,
                                           scoring = 'roc_auc',
                                           refit = {'matthews_corrcoef', 'f1_score'},
                                           verbose = 2,
                                           cv = 5)
grid_search1.fit(X_train, y_train)

In [None]:
# Hold-out predictions of the refitted best estimator (scored in the next cell).
pred = grid_search1.predict(X_test)

# Row-normalised confusion matrix of the same estimator.
plot_confusion_matrix(
    grid_search1, X_test, y_test,
    normalize = 'true',
    cmap = plt.cm.Blues,
    display_labels = np.unique(y),
)

TN score has increased from 0.45 to 0.49

In [None]:
# roc_auc_score takes (y_true, y_score); the original call had the two reversed.
roc_auc_score(y_test, pred)

In [None]:
grid_search1.best_params_

### One thing that I hope you notice is that the confusion matrix is not improving despite the various Grid Search and Random Search 

### Also if you take into consideration the training score it is pretty damn good.

## Let's try tweaking the dataset. 

Trying Feature Interaction using Patsy.

In [None]:
# Same interaction formula as the pipelines in this section (log Age, log balance).
pt = PatsyTransformer("(C(Credit_Product)+Gender+Channel_Code+Is_Active+Occupation)**2-1-(C(Credit_Product)+Gender+Channel_Code+Is_Active+Occupation) + np.log(Age) + Region_Code + Vintage + np.log(Avg_Account_Balance )")
# Fit on the training split and keep the transformed training data for inspection.
ps = pt.fit(X_train, y_train).transform(X_train)

**'Terms' are the Columns that are generated using Feature Interactions**

In [None]:
ps

In [None]:
from sklearn import linear_model
from sklearn.pipeline import Pipeline
from sklearn import preprocessing
from sklego.preprocessing import PatsyTransformer
from sklego.mixture import GMMClassifier


# Weight of class 1, hard-coded.
class_weight = 3

# Patsy interaction features (log Age, log balance) -> robust scaling -> class-weighted logistic regression.
pipe_lri = Pipeline([
    ("patsy", PatsyTransformer("(C(Credit_Product)+Gender+Channel_Code+Is_Active+Occupation)**2-1-(C(Credit_Product)+Gender+Channel_Code+Is_Active+Occupation) + np.log(Age) + Region_Code + Vintage + np.log(Avg_Account_Balance )")),
   # ("Interaction", preprocessing.PolynomialFeatures()),  # Fit the attributes to a normal Distribution
    ("Normalize", preprocessing.RobustScaler()),  # Robust to outliers
    ("model", linear_model.LogisticRegression(class_weight = {0:1, 1: class_weight}, solver = 'liblinear', max_iter = 1000))
])

pred = pipe_lri.fit(X_train, y_train).predict(X_test)
# roc_auc_score takes (y_true, y_score); the original call had the two reversed.
roc_auc_score(y_test, pred)

In [None]:
# Row-normalised confusion matrix of `pipe_lri` on the hold-out split.
plot_confusion_matrix(
    pipe_lri, X_test, y_test,
    normalize = 'true',
    cmap = plt.cm.Blues,
    display_labels = np.unique(y),
)

Back to square 1 I guess

In [None]:
%%time

param_grid = dict(C = [100,10,1,0.1,0.01])
grid_search1 = model_selection.HalvingGridSearchCV(pipe_lri , 
                                           param_grid = {
                                                #          'Interaction__degree' : [2,3],
                                                          'model__C': [100, 10, 1.0, 0.1, 0.01],
                                                          'model__warm_start':[False, True],
                                               #          'Interaction__include_bias':[False,True],
                                               #          'Interaction__interaction_only':[True, False],
                                                          'model__class_weight': [{0: 1, 1: v} for v in np.linspace(1,20, 30)]
                                                         },
                                           aggressive_elimination = True,
                                           n_jobs = -1,
                                           scoring = 'roc_auc',
                                           refit = {'roc_auc','f1_score'},
                                           verbose = 2,
                                           cv = 5)
grid_search1.fit(X_train, y_train)

In [None]:
# Row-normalised confusion matrix of the refitted best estimator on the hold-out split.
plot_confusion_matrix(
    grid_search1, X_test, y_test,
    normalize = 'true',
    cmap = plt.cm.Blues,
    display_labels = np.unique(y),
)

### That's a good model

In [None]:
from sklearn import linear_model
from sklearn.pipeline import Pipeline
from sklearn import preprocessing
from sklego.preprocessing import PatsyTransformer
from sklego.mixture import GMMClassifier


# Weight of class 1, hard-coded.
class_weight = 3

# Patsy interaction features (log Age, log balance) -> robust scaling -> class-weighted logistic regression.
pipe_lrp = Pipeline([
    ("patsy", PatsyTransformer("(C(Credit_Product)+Gender+Channel_Code+Is_Active+Occupation)**2-1-(C(Credit_Product)+Gender+Channel_Code+Is_Active+Occupation) + np.log(Age) + Region_Code + Vintage + np.log(Avg_Account_Balance )")),
  #  ("Interaction", preprocessing.PolynomialFeatures()),  # Fit the attributes to a normal Distribution
    ("Normalize", preprocessing.RobustScaler()),  # Robust to outliers
    ("model", linear_model.LogisticRegression(class_weight = {0:1, 1: class_weight}, solver = 'liblinear', max_iter = 1000))
])

# Fit and score the pipeline defined in THIS cell. The original fitted `pipe_lri`
# (a copy-paste slip), so `pipe_lrp` was never evaluated here.
pred = pipe_lrp.fit(X_train, y_train).predict(X_test)
# roc_auc_score takes (y_true, y_score); the original call had the two reversed.
roc_auc_score(y_test, pred)

In [None]:
%%time

##   ACTIVATING BOTH INTERACTION FEATURES

param_grid = dict(C = [100,10,1,0.1,0.01])
grid_search1 = model_selection.HalvingGridSearchCV(pipe_lrp , 
                                           param_grid = {
                                                     #     'Interaction__degree' : [2,3],
                                                          'model__C': [100, 10, 1.0, 0.1, 0.01],
                                                          'model__warm_start':[False, True],
                                                      #    'Interaction__include_bias':[False,True],
                                                      #    'Interaction__interaction_only':[True, False],
                                                          'model__class_weight': [{0: 1, 1: v} for v in np.linspace(1,20, 30)]
                                                         },
                                           aggressive_elimination = True,
                                           n_jobs = -1,
                                           scoring = 'roc_auc',
                                           refit = 'roc_auc',
                                           verbose = 2,
                                           cv = 10)
grid_search1.fit(X_train, y_train)

In [None]:
# Row-normalised confusion matrix of the refitted best estimator on the hold-out split.
plot_confusion_matrix(
    grid_search1, X_test, y_test,
    normalize = 'true',
    cmap = plt.cm.Blues,
    display_labels = np.unique(y),
)

### Let's try an ensemble of few basic classifiers

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB 
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.ensemble import VotingClassifier

# Four base classifiers combined by majority (hard) vote.
knn = KNeighborsClassifier()
gmm = GMMClassifier(n_components = 4)
rf = RandomForestClassifier(class_weight = {0:1, 1: class_weight})
lr = linear_model.LogisticRegression(class_weight = {0:1, 1: class_weight}, solver = 'liblinear', max_iter = 1000)

classifiers = [('knn', knn),
               ('gmm', gmm),
               ('rf', rf),
               ('lr', lr)]

vc = VotingClassifier(estimators=classifiers, voting='hard')

# NOTE: this rebinds `pipe_lri`, which earlier named the logistic-regression pipeline.
pipe_lri = Pipeline([
    ("patsy", PatsyTransformer("(C(Credit_Product)+Gender+Channel_Code+Is_Active+Occupation)**2-1-(C(Credit_Product)+Gender+Channel_Code+Is_Active+Occupation) + np.log(Age) + Region_Code + Vintage + np.log(Avg_Account_Balance )")),
   # ("Interaction", preprocessing.PolynomialFeatures()),  # Fit the attributes to a normal Distribution
    ("Normalize", preprocessing.RobustScaler()),  # Robust to outliers
    ("model", vc)
])

pred = pipe_lri.fit(X_train, y_train).predict(X_test)
# roc_auc_score takes (y_true, y_score); the original call had the two reversed.
roc_auc_score(y_test, pred)

In [None]:
# Row-normalised confusion matrix of the voting-ensemble pipeline on the hold-out split.
plot_confusion_matrix(
    pipe_lri, X_test, y_test,
    normalize = 'true',
    cmap = plt.cm.Blues,
    display_labels = np.unique(y),
)

Here we are predicting the FP , with a high probability, 
and my TP prediction is very low .

# Using Transformations of individual features and testing them . 

In [None]:
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.linear_model import LogisticRegression, Ridge
from sklearn import linear_model, preprocessing
from sklearn import ensemble
from sklego.preprocessing import PatsyTransformer

from sklego.meta import EstimatorTransformer
from sklego.preprocessing import ColumnSelector

# Feature-engineering pipeline:
#   1. "datagrab"        - concatenate several per-column branches side by side
#   2. "Standardization" - standard-scale the concatenated matrix
# Step names are kept stable because the grid search addresses nested
# parameters through them (e.g. transform__datagrab__Interaction__...).
feature_pipeline = Pipeline([
    ("datagrab", FeatureUnion([
        # Account balance mapped through a quantile transform.
        ("discrete", Pipeline([
            ("Account_grab", ColumnSelector(["Avg_Account_Balance"])),
            ("Quantile", preprocessing.QuantileTransformer(random_state=0)),
        ])),
        ("Normalize", Pipeline([
            ("Age_grab",ColumnSelector(["Age"])),
            # preprocessing.Normalizer rescales each ROW to unit norm; on a
            # single-column input that turns every positive age into 1.0 and
            # throws the feature away. StandardScaler works column-wise and
            # keeps the information.
            ("Normalize", preprocessing.StandardScaler())
        ])),

        # One-hot encoding of the two categorical columns.
        ("Onehotencode_columns", Pipeline([
            ("OneHot_grab",ColumnSelector(["Credit_Product", "Occupation"])),
            ("OneHotEncode", preprocessing.OneHotEncoder(categories = "auto", sparse = False))
        ])),
        # Pairwise interaction terms between the selected columns.
        # NOTE(review): PolynomialFeatures needs numeric input - this assumes
        # these columns were numerically encoded in an earlier cell; confirm.
        ("Interaction", FeatureUnion([
            ("Interaction Pipeline", Pipeline([
                ("Lead Correlated Columns", ColumnSelector(["Credit_Product","Gender","Channel_Code","Is_Active","Occupation"])),
                ("interaction", preprocessing.PolynomialFeatures(include_bias = False, interaction_only = True)),
            ]))
        ])),
    ])),
    ("Standardization", FeatureUnion([
        ("Standardize", preprocessing.StandardScaler())
    ])),
# Disabled experiments, kept for reference:
#   ("ml_features", FeatureUnion([
#        ("XGBoostClassifier", EstimatorTransformer(linear_model.LogisticRegression(class_weight = {0:1, 1: class_weight}, solver = 'liblinear', max_iter = 1000))),
#        ("Random Forest Classifier", EstimatorTransformer(ensemble.RandomForestClassifier())),
#        ("GMM", EstimatorTransformer(GMMClassifier(n_components = 4))),
#        ("KNN",EstimatorTransformer(KNeighborsClassifier()))
#    ])),
# ("Ridge", EstimatorTransformer(linear_model.Ridge())),

 #   ("patsy", PatsyTransformer("(C(Credit_Product)+Gender+Channel_Code+Is_Active+Occupation)**2-1-(C(Credit_Product)+Gender+Channel_Code+Is_Active+Occupation) + np.log(Age) + Region_Code + Vintage + np.log(Avg_Account_Balance )")),

])

In [None]:
# End-to-end model: engineered features feed a liblinear logistic regression
# in which class 1 is weighted 3x relative to class 0.
pipe = Pipeline(steps=[
    ("transform", feature_pipeline),
    (
        "Logistic Regression",
        linear_model.LogisticRegression(
            solver='liblinear',
            max_iter=1000,
            class_weight={0: 1, 1: 3},
        ),
    ),
])
# Alternative final estimators tried at this position (disabled):
#   ("Random Forest Classifier", (ensemble.RandomForestClassifier())),
#   ("XGBoostClassifier", EstimatorTransformer(XGBClassifier(seed = 42))),
#   ("Extra tree Classifier", ensemble.ExtraTreesClassifier(class_weight = {0:1, 1: 3},n_estimators=100, random_state=0))

pipe.fit(X_train, y_train)



In [None]:
# Score the fitted pipeline on the held-out split.
pred = pipe.predict(X_test)
# roc_auc_score expects (y_true, y_score): ground truth first, model output
# second. The metric is not symmetric, so the order matters.
roc_auc_score(y_test, pred)

In [None]:
# Confusion matrix of the logistic-regression pipeline on the test split.
# normalize='true' scales each row (true class) so it sums to 1.
plot_confusion_matrix(
    pipe,
    X_test,
    y_test,
    normalize='true',
    cmap=plt.cm.Blues,
    display_labels=np.unique(y),
)


In [None]:
# Show every parameter of the pipeline and its nested steps; the
# `step__param` style names are the ones a search's `param_grid` refers to.
pipe.get_params()

In [None]:
%%time

from sklearn import model_selection
from sklearn.experimental import enable_halving_search_cv


# Successive-halving grid search over the interaction settings and the
# logistic-regression hyper-parameters.
#
# * The halving search supports a single metric: `scoring` ('roc_auc') both
#   eliminates candidates and selects the final model. `refit` is a boolean;
#   the original passed a set of metric names, which was only interpreted as
#   "truthy" and never made the search use those metrics. To optimise a
#   different metric, change `scoring`.
# * `warm_start` was removed from the grid: every candidate is fitted on a
#   fresh clone, and scikit-learn documents the option as useless for the
#   liblinear solver, so it merely doubled the number of candidates.
grid_search1 = model_selection.HalvingGridSearchCV(
    pipe,
    param_grid = {
        'transform__datagrab__Interaction__Interaction Pipeline__interaction__degree': [2, 3],
        'transform__datagrab__Interaction__Interaction Pipeline__interaction__interaction_only': [True, False],
        'Logistic Regression__C': [100, 10, 1.0, 0.1, 0.01],
        # Weight of class 1 relative to class 0, 30 values from 1 to 20.
        'Logistic Regression__class_weight': [{0: 1, 1: v} for v in np.linspace(1, 20, 30)],
    },
    aggressive_elimination = True,
    n_jobs = -1,
    scoring = 'roc_auc',
    refit = True,
    random_state = 0,  # seeds the search's data subsampling
    verbose = 2,
    cv = 5)
grid_search1.fit(X_train, y_train)

In [None]:
# Test-set score of the refitted best estimator.
# roc_auc_score expects (y_true, y_score): ground truth first.
roc_auc_score(y_test, grid_search1.predict(X_test))

In [None]:
# Confusion matrix of the fitted search object on the test split.
# normalize='true' scales each row (true class) so it sums to 1.
plot_confusion_matrix(
    grid_search1,
    X_test,
    y_test,
    normalize='true',
    cmap=plt.cm.Blues,
    display_labels=np.unique(y),
)

## The model is overfitting on the 'True' class.

## I guess this might be due to the feature interactions.
## Let's tweak that.

In [None]:
# Tabulate the per-candidate results recorded by the search.
pd.DataFrame(grid_search1.cv_results_)

In [None]:
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.linear_model import LogisticRegression, Ridge
from sklearn import linear_model, preprocessing
from sklearn import ensemble
from sklego.preprocessing import PatsyTransformer

from sklego.meta import EstimatorTransformer
from sklego.preprocessing import ColumnSelector

# Feature-engineering pipeline (second variant):
#   1. "datagrab"        - concatenate the per-column branches side by side
#   2. "Standardization" - standard-scale the concatenated matrix
#   3. "Interaction"     - interaction terms over ALL standardized features
# Step names are kept stable because the grid search addresses nested
# parameters through them (e.g. transform__Interaction__...).
feature_pipeline = Pipeline([
    ("datagrab", FeatureUnion([
        # Account balance mapped through a quantile transform.
        ("discrete", Pipeline([
            ("Account_grab", ColumnSelector(["Avg_Account_Balance"])),
            ("Quantile", preprocessing.QuantileTransformer(random_state=0)),
        ])),
        ("Normalize", Pipeline([
            ("Age_grab",ColumnSelector(["Age"])),
            # preprocessing.Normalizer rescales each ROW to unit norm; on a
            # single-column input that turns every positive age into 1.0 and
            # throws the feature away. StandardScaler works column-wise and
            # keeps the information.
            ("Normalize", preprocessing.StandardScaler())
        ])),

        # One-hot encoding of the two categorical columns.
        ("Onehotencode_columns", Pipeline([
            ("OneHot_grab",ColumnSelector(["Credit_Product", "Occupation"])),
            ("OneHotEncode", preprocessing.OneHotEncoder(categories = "auto", sparse = False))
        ])),

    ])),
    ("Standardization", FeatureUnion([
        ("Standardize", preprocessing.StandardScaler())
    ])),
    ("Interaction", FeatureUnion([
        ("Interaction Pipeline", Pipeline([
            ("interaction", preprocessing.PolynomialFeatures(include_bias = False, interaction_only = True)),
        ]))
    ])),
# Disabled experiments, kept for reference:
#   ("ml_features", FeatureUnion([
#        ("XGBoostClassifier", EstimatorTransformer(linear_model.LogisticRegression(class_weight = {0:1, 1: class_weight}, solver = 'liblinear', max_iter = 1000))),
#        ("Random Forest Classifier", EstimatorTransformer(ensemble.RandomForestClassifier())),
#        ("GMM", EstimatorTransformer(GMMClassifier(n_components = 4))),
#        ("KNN",EstimatorTransformer(KNeighborsClassifier()))
#    ])),
# ("Ridge", EstimatorTransformer(linear_model.Ridge())),

 #   ("patsy", PatsyTransformer("(C(Credit_Product)+Gender+Channel_Code+Is_Active+Occupation)**2-1-(C(Credit_Product)+Gender+Channel_Code+Is_Active+Occupation) + np.log(Age) + Region_Code + Vintage + np.log(Avg_Account_Balance )")),

])

In [None]:
# End-to-end model: engineered features feed a liblinear logistic regression
# in which class 1 is weighted 3x relative to class 0.
pipe = Pipeline(steps=[
    ("transform", feature_pipeline),
    (
        "Logistic Regression",
        linear_model.LogisticRegression(
            solver='liblinear',
            max_iter=1000,
            class_weight={0: 1, 1: 3},
        ),
    ),
])
# Alternative final estimators tried at this position (disabled):
#   ("Random Forest Classifier", (ensemble.RandomForestClassifier())),
#   ("XGBoostClassifier", EstimatorTransformer(XGBClassifier(seed = 42))),
#   ("Extra tree Classifier", ensemble.ExtraTreesClassifier(class_weight = {0:1, 1: 3},n_estimators=100, random_state=0))

pipe.fit(X_train, y_train)




In [None]:
# Confusion matrix of the re-built pipeline on the test split.
# normalize='true' scales each row (true class) so it sums to 1.
plot_confusion_matrix(
    pipe,
    X_test,
    y_test,
    normalize='true',
    cmap=plt.cm.Blues,
    display_labels=np.unique(y),
)

It seems we are back to square one.

In [None]:
# roc_auc_score expects (y_true, y_score): ground truth first.
roc_auc_score(y_test, pipe.predict(X_test))

In [None]:
# Show every parameter of the pipeline and its nested steps; the
# `step__param` style names are the ones a search's `param_grid` refers to.
pipe.get_params()

In [None]:
%%time

from sklearn import model_selection
from sklearn.experimental import enable_halving_search_cv


# Successive-halving grid search over the interaction settings and the
# logistic-regression hyper-parameters.
#
# * The halving search supports a single metric: `scoring` ('roc_auc') both
#   eliminates candidates and selects the final model. `refit` is a boolean;
#   the original passed a set of metric names, which was only interpreted as
#   "truthy" and never made the search use those metrics. To optimise a
#   different metric, change `scoring`.
# * `warm_start` was removed from the grid: every candidate is fitted on a
#   fresh clone, and scikit-learn documents the option as useless for the
#   liblinear solver, so it merely doubled the number of candidates.
grid_search1 = model_selection.HalvingGridSearchCV(
    pipe,
    param_grid = {
        'transform__Interaction__Interaction Pipeline__interaction__degree': [2, 3],
        'transform__Interaction__Interaction Pipeline__interaction__interaction_only': [True, False],
        'Logistic Regression__C': [100, 10, 1.0, 0.1, 0.01],
        # Weight of class 1 relative to class 0, 30 values from 1 to 20.
        'Logistic Regression__class_weight': [{0: 1, 1: v} for v in np.linspace(1, 20, 30)],
    },
    aggressive_elimination = True,
    n_jobs = -1,
    scoring = 'roc_auc',
    refit = True,
    random_state = 0,  # seeds the search's data subsampling
    verbose = 2,
    cv = 5)
grid_search1.fit(X_train, y_train)

In [None]:
# Evaluate the tuned model. The search fits clones of `pipe`, so `pipe`
# itself is unchanged by the search; the refitted best estimator is reached
# through `grid_search1`.
# normalize='true' scales each row (true class) so it sums to 1.
plot_confusion_matrix(grid_search1, X_test, y_test,
                     display_labels = np.unique(y),
                     cmap = plt.cm.Blues,
                     normalize = 'true'
                     )

In [None]:
# Score the tuned model from the search rather than the untuned `pipe`,
# which the search does not modify.
# roc_auc_score expects (y_true, y_score): ground truth first.
roc_auc_score(y_test, grid_search1.predict(X_test))

Even with the pipeline, the results are not improving.