# Cross sell prediction using logistic regression and stochastic gradient descent after upsampling using imblearn

**Hyperparameters were verified using GridSearchCV (5 folds, ROC AUC: 0.8292). The full code with detailed explanation can be found [in my GitHub repository](https://github.com/CrisMW/insurance_cross_sell).**

In [None]:
import numpy as np
import pandas as pd

# Loading the training data, then splitting it into the target (y_train)
# and the feature table (X_train)

sell_data_train = pd.read_csv('../input/health-insurance-cross-sell-prediction/train.csv', index_col='id')

# Keep the target 1-D, shape (n_samples,). Selecting with [['Response']] would
# produce an (n_samples, 1) column vector, which scikit-learn / imblearn
# estimators flag with a DataConversionWarning before flattening it themselves.
y_train = sell_data_train['Response'].to_numpy()
X_train = sell_data_train.drop('Response', axis=1)

X_train.head()

In [None]:
# Reading the test features (indexed by the 'id' column) and previewing the first rows

X_test = pd.read_csv(
    filepath_or_buffer='../input/health-insurance-cross-sell-prediction/test.csv',
    index_col='id',
)
X_test.head()

In [None]:
# Defining Column Transformer
# Column transformer drops license, transforms other cols


from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, MinMaxScaler


# Positional column indices of the feature table, grouped by preprocessing route
hot_ft = [0, 6]          # one-hot encoded
ord_ft = [5]             # ordinally encoded, using the explicit level order below
drop_ft = [2]            # dropped from the feature matrix
scale_ft = [1, 3, 8, 9]  # rescaled with MinMaxScaler
# NOTE(review): no list contains indices 4 or 7; if those columns exist they
# reach the classifier untouched via remainder='passthrough' — confirm intended.

# Levels of the ordinal feature in the order they are mapped to 0, 1, 2
_ordinal_levels = ['< 1 Year', '1-2 Year', '> 2 Years']

ct = ColumnTransformer(
    remainder='passthrough',
    transformers=[
        ('ordinal', OrdinalEncoder(categories=[_ordinal_levels]), ord_ft),
        ('one_hot', OneHotEncoder(), hot_ft),
        ('scale', MinMaxScaler(), scale_ft),
        ('to_drop', 'drop', drop_ft),
    ],
)

In [None]:
# Defining function to generate pipeline with ColumnTransformer, SMOTENC, and a classifier.
# The cache directory was populated from the cache generated by previous work offline (not shown)

from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTENC

def make_pipe(clf):
    """Assemble the resample -> transform -> classify pipeline around ``clf``.

    The pipeline oversamples with SMOTENC (treating the ordinal and one-hot
    feature indices as categorical), then applies the module-level column
    transformer ``ct``, and finally runs the supplied classifier.

    Args:
        clf (sklearn estimator): Classifier used as the final step, already
            configured with the desired parameters.
    Returns:
        pipe (imblearn.pipeline.Pipeline): The unfitted pipeline, configured
            with ``memory='./pipe_cache'``.
    """
    categorical_idx = ord_ft + hot_ft
    oversampler = SMOTENC(categorical_features=categorical_idx, random_state=42)

    # Step names are kept exactly as they were (including the spelling of
    # 'transfomer') because they are used as prefixes of pipeline parameters.
    stages = [
        ('sampling', oversampler),
        ('transfomer', ct),
        ('clf', clf),
    ]
    return Pipeline(steps=stages, memory='./pipe_cache')

## Determining ROC AUC <u>on the cross-validation test folds</u>

Showing ROC AUC <u>on the test folds</u> of cross-validation over the training data, obtained with different models after upsampling. For each model, optimal hyperparameters were determined via GridSearchCV ([see my GitHub repository](https://github.com/CrisMW/insurance_cross_sell)).
### Defining metrics

In [None]:
# Importing metrics, defining list of metrics

from sklearn.metrics import roc_auc_score, recall_score
from sklearn.model_selection import cross_validate

# Scorer names handed to cross_validate(scoring=...); avg_scores() later reads
# each one back from the result dict under the key 'test_<name>'.
metrics = ['roc_auc', 'recall']

# Function to calculate averages from dictionary returned by cross_validate

def avg_scores(metrics_list, score_dict):
    """Compute the per-metric mean of the fold scores from cross_validate().

    For every metric name ``m`` in ``metrics_list`` the values stored under
    the key ``'test_<m>'`` of ``score_dict`` are averaged with ``np.mean``.
    Other keys of ``score_dict`` are ignored.

    Args:
        metrics_list (list): Metric names that were passed to cross_validate().
        score_dict (dict): Dict returned by cross_validate().
    Returns:
        avg_dict (dict): Maps each metric name to its mean over all folds.
    """
    return {
        name: np.mean(score_dict[f'test_{name}'])
        for name in metrics_list
    }

### Logistic regression

In [None]:
# Making pipeline

from sklearn.linear_model import SGDClassifier
# 'log_loss' is the current name of the logistic-regression loss. The former
# alias 'log' was deprecated in scikit-learn 1.1 and removed in 1.3, where it
# is rejected as an invalid parameter. (Requires scikit-learn >= 1.1.)
log_model = make_pipe(
    SGDClassifier(
        loss='log_loss',
        penalty='elasticnet',
        alpha=1,
        l1_ratio=0.05,
        max_iter=2000,
        random_state=42,
    )
)

# Obtaining ROC AUC, recall for each cross-validation fold

log_scores = cross_validate(log_model, X_train, y_train, scoring=metrics)

In [None]:
# Averaging ROC AUC and recall over the cross-validation folds, then printing them

log_avg_scores = avg_scores(metrics_list=metrics, score_dict=log_scores)
print(log_avg_scores)

### Linear support vector classifier

In [None]:
# Making pipeline

from sklearn.svm import LinearSVC
svc_model = make_pipe(
    clf=LinearSVC(
        C=100,
        penalty='l1',
        dual=False,
        random_state=42,
    )
)

# Scoring the pipeline (ROC AUC, recall) on each cross-validation fold

svc_scores = cross_validate(estimator=svc_model, X=X_train, y=y_train, scoring=metrics)

In [None]:
# Averaging ROC AUC and recall over the cross-validation folds, then printing them

svc_avg_scores = avg_scores(metrics_list=metrics, score_dict=svc_scores)
print(svc_avg_scores)

In [None]:
# # Transforming prediction to pd.DataFrame

# log_pred_df = pd.DataFrame(data=log_pred, index=X_test.index, columns=['Response'])

# # Saving to csv

# log_pred_df.to_csv('./log_submission.csv')