In [None]:
import plotly.express as px
import pandas as pd
import numpy as np

In [None]:
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Relative path to the training CSV.
path_train = '../input/health-insurance-cross-sell-prediction/train.csv'
# Load the training data into a DataFrame; later cells modify `data` step by step.
data = pd.read_csv(path_train)

In [None]:
# Summary of columns, dtypes and non-null counts.
data.info()

In [None]:
# Display the raw frame as loaded.
data

In [None]:
# For each column, print its name followed by the percentage share of every
# distinct value, with a separator line between columns.
separator_line = '= - ='*20
for column_name in data.columns:
    percentage_share = data[column_name].value_counts(normalize=True)*100
    print(column_name)
    print()
    print(percentage_share)
    print(separator_line)
    print()

### ***Process***

In [None]:

# Drop id, Gender and Driving_License: Driving_License is 1 for most of the
# data, and Gender affects neither the response nor the average premium.
data = data.drop(columns=['id', 'Gender', 'Driving_License'])

# Encode the two string columns as integers. The result is assigned back to the
# column: `data[col].replace(..., inplace=True)` mutates a temporary Series and
# leaves `data` unchanged when pandas copy-on-write is active.
data['Vehicle_Age'] = data['Vehicle_Age'].replace({'< 1 Year': 0, '1-2 Year': 1, '> 2 Years': 2})
data['Vehicle_Damage'] = data['Vehicle_Damage'].replace({'Yes': 0, 'No': 1})  # note the direction: Yes -> 0, No -> 1


In [None]:
# Continuous features; every other column of `data` is treated as categorical.
numerical_cols = ['Age', 'Annual_Premium', 'Vintage']
categorical_cols = list(set(data.columns).difference(numerical_cols))

In [None]:
from scipy.stats import zscore
# Standardise each numerical column (cast to float first, then z-score).
for numeric_col in numerical_cols:
    data[numeric_col] = zscore(data[numeric_col].astype('float'))

# Cast all categorical columns to object dtype in one call.
data = data.astype(dict.fromkeys(categorical_cols, 'object'))

In [None]:
# Number of missing values per column.
data.isnull().sum()

In [None]:
# Collapse policy channels that occur fewer than 200 times into a single
# "other" code (0) to reduce the number of distinct channel values.
channel_counts = data['Policy_Sales_Channel'].value_counts()
policy_channels_having_less_counts = channel_counts.index[channel_counts.values < 200].values

# Assign the result back: a chained `data[col].replace(..., inplace=True)` acts
# on a temporary Series and does not update `data` under pandas copy-on-write.
data['Policy_Sales_Channel'] = data['Policy_Sales_Channel'].replace(policy_channels_having_less_counts, 0)

In [None]:
# Target as int64; features are every remaining column.
Y = data['Response'].astype('int64')

X_ = data.drop(columns=['Response'])   # un-encoded features
X = pd.get_dummies(X_)                 # one-hot encoded copy of the features

In [None]:
from sklearn.model_selection import train_test_split

# Split the un-encoded features (X_, not the one-hot X) and the target with a
# fixed seed; the split proportions are left at the library default.
x_train, x_test, y_train, y_test = train_test_split(X_, Y, random_state = 0)

### Treating Imbalanced Data

#### UpSampling

In [None]:
from imblearn.over_sampling import SMOTENC

In [None]:
# SMOTENC needs the positions of the categorical columns. Derive them from the
# column names (every column not listed in numerical_cols) instead of
# hard-coding positions, so the list stays correct if columns are added,
# dropped or reordered upstream.
categorical_positions = [
    position
    for position, column_name in enumerate(x_train.columns)
    if column_name not in numerical_cols
]

# Oversample the minority class on the training split only.
sm = SMOTENC(random_state=42, categorical_features=categorical_positions, n_jobs=3)
X_up, y_up = sm.fit_resample(x_train, y_train)

In [None]:
# Size of the upsampled feature set.
X_up.shape

In [None]:
# Class counts after upsampling.
y_up.value_counts()

#### DownSampling

In [None]:
from imblearn.under_sampling import TomekLinks,NearMiss

In [None]:
# Under-sample the training split with NearMiss. Despite the variable name
# `tl`, TomekLinks is not used here.
tl = NearMiss(n_jobs=3)
X_down, y_down = tl.fit_resample(x_train, y_train)

In [None]:
# Size of the down-sampled feature set.
X_down.shape

In [None]:
# Class counts after down-sampling.
y_down.value_counts()

### Feature Selection

[scikit-learn `SelectKBest` documentation](https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SelectKBest.html#sklearn.feature_selection.SelectKBest)

In [None]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_classif

In [None]:
# Score every feature (k='all' keeps them all) by mutual information with the
# target, using the full un-resampled, un-encoded data.
fs = SelectKBest(score_func=mutual_info_classif, k='all')
fs.fit(X_, Y)

In [None]:
# Bar chart of the mutual-information scores from the selector fitted on
# (X_, Y) in the preceding cell. The bars are labelled with X_'s own columns,
# so the labels come from the frame the scores were computed on rather than
# from X_up.
fig = px.bar(x=X_.columns, y=fs.scores_, template='plotly_dark')
fig.show()

In [None]:
# Same mutual-information scoring, now fitted on the SMOTENC-upsampled data.
fs = SelectKBest(score_func=mutual_info_classif, k='all')
fs.fit(X_up, y_up)   # over sample

In [None]:
# Bar chart of the scores held by `fs`, labelled with the columns of X_up.
fig = px.bar(x = X_up.columns, y =fs.scores_, template = 'plotly_dark')
fig.show()

In [None]:
# Same mutual-information scoring, now fitted on the NearMiss down-sampled data.
fs = SelectKBest(score_func=mutual_info_classif, k='all')
fs.fit(X_down, y_down) # down sample

In [None]:
# Bar chart of the mutual-information scores from the selector fitted on
# (X_down, y_down) in the preceding cell. The bars are labelled with X_down's
# own columns, so the labels come from the frame the scores were computed on
# rather than from X_up.
fig = px.bar(x=X_down.columns, y=fs.scores_, template='plotly_dark')
fig.show()

In [None]:
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LogisticRegression
import statsmodels.api as sm

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
from sklearn.metrics import classification_report,hamming_loss,roc_auc_score,confusion_matrix

In [None]:
def base_estimator(x_train, x_test, y_train, y_test,est = LogisticRegression):
    """Fit a classifier and print train and test metrics.

    Parameters
    ----------
    x_train : pandas.DataFrame
        Training features.
    x_test : pandas.DataFrame
        Test features. When it is a DataFrame it is re-indexed onto the columns
        of ``x_train`` (missing columns filled with 0, extra columns dropped),
        so frames that were one-hot encoded separately cannot be fed to the
        model with shifted or mismatched columns.
    y_train, y_test : pandas.Series
        Targets for the two feature sets.
    est : class, default LogisticRegression
        Estimator class. It is instantiated as ``est(random_state=0)`` and must
        provide ``fit``, ``predict`` and ``predict_proba``.

    Returns
    -------
    None
        Prints the classification report and ROC-AUC for the training data, a
        separator line, then the same two metrics for the test data.
    """
    if isinstance(x_test, pd.DataFrame):
        x_test = x_test.reindex(columns=x_train.columns, fill_value=0)

    mod = est(random_state=0)
    mod.fit(x_train.values, y_train.values)

    def _print_metrics(features, target):
        # Predict on the raw array, the same representation used for fitting,
        # so scikit-learn does not warn about feature names on every call.
        matrix = getattr(features, 'values', features)
        print(classification_report(target, mod.predict(matrix)))
        positive_proba = mod.predict_proba(matrix)[:, 1]
        print(f'auc_roc_score : {roc_auc_score(target, positive_proba)}')

    _print_metrics(x_train, y_train)
    print('-'*50)
    _print_metrics(x_test, y_test)

In [None]:
# Logistic regression on x_train / y_train as currently bound in the kernel.
base_estimator(x_train, x_test, y_train, y_test, LogisticRegression)

In [None]:
# Logistic regression on the SMOTENC-upsampled data, features not one-hot encoded.
base_estimator(X_up, x_test, y_up, y_test, LogisticRegression)

In [None]:
# One-hot encode train and test separately, then force the test frame onto the
# training columns. A category present in only one of the two frames would
# otherwise give the frames different (or shifted) columns.
X_up_dummies = pd.get_dummies(X_up)
x_test_dummies = pd.get_dummies(x_test).reindex(columns=X_up_dummies.columns, fill_value=0)
base_estimator(X_up_dummies, x_test_dummies, y_up, y_test, LogisticRegression)  #use a final model

In [None]:
# Same experiment without Region_Code. The test dummies are aligned to the
# training dummies so both frames have identical columns in identical order.
X_up_no_region = pd.get_dummies(X_up.drop(columns='Region_Code'))
x_test_no_region = pd.get_dummies(x_test.drop(columns='Region_Code')).reindex(
    columns=X_up_no_region.columns, fill_value=0
)
base_estimator(X_up_no_region, x_test_no_region, y_up, y_test, LogisticRegression)

In [None]:
from sklearn.model_selection import GridSearchCV

# Logistic-regression hyper-parameter grid: L2 penalty only, five values of C,
# three solvers, and max_iter fixed at 300.
log_params = {"penalty": ['l2'], 
              'C': [ 0.5, 0.01, 0.1, 1, 2],
             'solver' : ['newton-cg', 'lbfgs', 'sag'],
             'max_iter': [300]}

# 5-fold grid search scored with weighted F1, fitted on the upsampled data.
# NOTE(review): the folds are cut from data that was already resampled, so the
# validation folds contain synthetic rows and the scores may be optimistic.
grid_log = GridSearchCV(LogisticRegression(), log_params, n_jobs=3, cv = 5,return_train_score=True, scoring = 'f1_weighted')
grid_log.fit(X_up, y_up)


In [None]:
# Best parameter combination found by the grid search.
grid_log.best_params_

In [None]:
# pd.DataFrame(grid_log.cv_results_).sort_values(by = 'rank_test_score')

In [None]:
# Logistic regression with explicitly chosen hyper-parameters, fitted on the
# upsampled data.
# NOTE(review): presumably these values mirror grid_log.best_params_ — confirm.
model = LogisticRegression(random_state=0,n_jobs = -1,C=2, max_iter=300,penalty = 'l2', solver = 'lbfgs')
model.fit(X_up, y_up)

In [None]:
# Evaluate `model` on the upsampled training data and on the test split.
# X_up / y_up are used directly rather than rebinding x_train / y_train:
# overwriting those names would make every later cell that passes
# x_train / y_train silently train on the upsampled data instead of the
# original split.
y_train_predict = model.predict(X_up)
y_test_predict = model.predict(x_test)

print(classification_report(y_up,y_train_predict))
prob = model.predict_proba(X_up)
print(f'auc_roc_score : {roc_auc_score(y_up,prob[:,1])}')

print('-'*50)
print(classification_report(y_test,y_test_predict))
prob = model.predict_proba(x_test)
print(f'auc_roc_score : {roc_auc_score(y_test,prob[:,1])}')

#### RF

In [None]:
# Random forest on x_train / y_train as currently bound in the kernel.
base_estimator(x_train, x_test, y_train, y_test, RandomForestClassifier)

In [None]:
# Random forest on the upsampled data with Region_Code removed from train and test.
base_estimator(X_up.drop(columns = 'Region_Code'), x_test.drop(columns = 'Region_Code'), y_up, y_test, RandomForestClassifier)

In [None]:
# Random forest on the upsampled data, all features.
base_estimator(X_up, x_test, y_up, y_test, RandomForestClassifier)

In [None]:
# Random forest on the NearMiss down-sampled data.
base_estimator(X_down, x_test, y_down, y_test, RandomForestClassifier) #down sample

In [None]:
from sklearn.ensemble import GradientBoostingClassifier,AdaBoostClassifier

#### GB

In [None]:
# Gradient boosting on x_train / y_train as currently bound in the kernel.
base_estimator(x_train, x_test, y_train, y_test, GradientBoostingClassifier)

In [None]:
# Gradient boosting on the upsampled data with Region_Code removed from train and test.
base_estimator(X_up.drop(columns = 'Region_Code'), x_test.drop(columns = 'Region_Code'), y_up, y_test, GradientBoostingClassifier)

In [None]:
# Gradient boosting on the upsampled data, all features.
base_estimator(X_up, x_test, y_up, y_test, GradientBoostingClassifier)

In [None]:
# Gradient boosting on the NearMiss down-sampled data.
base_estimator(X_down, x_test, y_down, y_test, GradientBoostingClassifier) #down sample

#### AB

In [None]:
# AdaBoost on x_train / y_train as currently bound in the kernel.
base_estimator(x_train, x_test, y_train, y_test, AdaBoostClassifier)

In [None]:
# AdaBoost on the upsampled data, all features.
base_estimator(X_up, x_test, y_up, y_test, AdaBoostClassifier)

### Test

In [None]:
# Load the competition test set; note that this rebinds `data`, replacing the
# processed training frame.
path_test = '../input/health-insurance-cross-sell-prediction/test.csv'
data = pd.read_csv(path_test)

# Submission frame: starts with the ids, predictions are added later.
out = pd.DataFrame({'id': data['id']})

In [None]:

# Apply the same column clean-up as for the training data: drop id, Gender and
# Driving_License (Driving_License is 1 for most of the data; Gender affects
# neither the response nor the average premium).
data = data.drop(columns=['id', 'Gender', 'Driving_License'])

# Encode the two string columns as integers. The result is assigned back to the
# column: `data[col].replace(..., inplace=True)` mutates a temporary Series and
# leaves `data` unchanged when pandas copy-on-write is active.
data['Vehicle_Age'] = data['Vehicle_Age'].replace({'< 1 Year': 0, '1-2 Year': 1, '> 2 Years': 2})
data['Vehicle_Damage'] = data['Vehicle_Damage'].replace({'Yes': 0, 'No': 1})  # note the direction: Yes -> 0, No -> 1


In [None]:
# Continuous features; every other column of the test frame is categorical.
numerical_cols = ['Age', 'Annual_Premium', 'Vintage']
categorical_cols = list(set(data.columns).difference(numerical_cols))

In [None]:
from scipy.stats import zscore
# Standardise the test features with the TRAINING mean and standard deviation.
# The model was fitted on columns z-scored over the training file, so scoring
# the test set with its own statistics would put it on a different scale.
# The training statistics are recomputed from the raw training file because
# the training frame was overwritten in place earlier in the notebook.
train_numeric = pd.read_csv(path_train, usecols=numerical_cols)
for col in numerical_cols:
    train_mean = train_numeric[col].mean()
    train_std = train_numeric[col].std(ddof=0)  # ddof=0 matches scipy.stats.zscore's default
    data[col] = (data[col].astype('float') - train_mean) / train_std

for col in categorical_cols:
    data[col] = data[col].astype('object')

In [None]:
# Collapse the same rare policy channels (identified on the training data) to 0.
# Assigned back to the column because a chained `replace(..., inplace=True)`
# does not update `data` under pandas copy-on-write.
data['Policy_Sales_Channel'] = data['Policy_Sales_Channel'].replace(policy_channels_having_less_counts, 0)

In [None]:
# Model used for the submission: logistic regression with only the seed and
# n_jobs set, fitted on the one-hot-encoded upsampled data.
# NOTE(review): the hyper-parameters chosen for `model` earlier in the notebook
# are not applied here — confirm this is intended.
mod = LogisticRegression(random_state=0,n_jobs=-1)
mod.fit(pd.get_dummies(X_up), y_up)

In [None]:
# One-hot encode the test frame and align it to the columns `mod` was fitted
# on (the dummies of X_up). Categories missing from the test set become
# all-zero columns, and categories unseen in training are dropped; without
# this, predict fails or misreads columns whenever the two sets differ.
training_columns = pd.get_dummies(X_up).columns
test_features = pd.get_dummies(data).reindex(columns=training_columns, fill_value=0)
out['Response'] = mod.predict(test_features)

In [None]:
# Write only the id and Response columns; index=False keeps the DataFrame's
# row index from being written as an extra unnamed first column.
out.to_csv('submission.csv', index=False)