In [None]:
from sklearn.metrics import log_loss
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from pprint import pprint
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn import ensemble
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score,roc_auc_score,confusion_matrix
from sklearn.linear_model import LogisticRegression 
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix,r2_score
import warnings
from mlxtend.classifier import StackingClassifier
import missingno as msno
from sklearn.ensemble import VotingClassifier
import shap
shap.initjs()
import lime
from lime import lime_tabular
warnings.simplefilter('ignore')
import os

In [None]:
# Load the labelled training data, indexed by applicant Id.
insurance_df = pd.read_csv('../input/prudential-life-insurance-assessment/train.csv.zip', index_col='Id')
insurance_df.head()

# The target is copied unchanged into 'Modified_Response' (no category merging
# is performed here), then the original 'Response' column is removed.
insurance_df['Modified_Response'] = insurance_df['Response']
sns.countplot(x=insurance_df['Modified_Response']);
# Dropping old response column
insurance_df.drop('Response', axis=1, inplace=True)

# Making lists with categorical and numerical features.
# NOTE: these must be two separate assignments; a chained
# `numerical = categorical = [...]` would bind BOTH names to the numeric list.
categorical = [col for col in insurance_df.columns if insurance_df[col].dtype == 'object']

numerical = [col for col in insurance_df.columns if insurance_df[col].dtype != 'object']

In [None]:
# Load the unlabelled competition test set, indexed by applicant Id.
insurance_df_test = pd.read_csv(
    '../input/prudential-life-insurance-assessment/test.csv.zip',
    index_col='Id',
)

# Placeholder target (all zeros) so that train and test share the same columns.
insurance_df_test['Modified_Response'] = np.zeros(len(insurance_df_test))

# Stack train on top of test so that column-level cleaning is applied to both.
# ignore_index=True replaces the Id index with a fresh 0..n-1 range.
whole_df = pd.concat(
    [insurance_df, insurance_df_test],
    ignore_index=True,
    sort=False,
)

whole_df.head()
for frame in (insurance_df, insurance_df_test, whole_df):
    print(frame.shape)

In [None]:
# Fraction of missing values in each column (0.0 = complete, 1.0 = all missing).
missing_val_count_by_column = whole_df.isnull().mean()

print(missing_val_count_by_column[missing_val_count_by_column > 0.4].sort_values(ascending=False))

# Drop every column in which more than 40 percent of the values are null.
# Selecting on the missing fraction directly matches the report printed above;
# `dropna(thresh=0.4 * n_rows)` would instead keep any column with at least
# 40% NON-null values (i.e. only drop columns with more than 60% missing), and
# combining `how` with `thresh` is rejected by recent pandas versions.
whole_df = whole_df.loc[:, missing_val_count_by_column <= 0.4]

# Does not contain important information
# (errors='ignore' keeps this cell re-runnable after the column is gone).
whole_df = whole_df.drop(columns=['Product_Info_2'], errors='ignore')





In [None]:
# Undo the earlier concatenation: the first rows of whole_df are the labelled
# training rows, everything after them is the competition test set.
n_train_rows = insurance_df.shape[0]
insurance_df = whole_df.iloc[:n_train_rows]
insurance_df_test = whole_df.iloc[n_train_rows:]

for frame in (whole_df, insurance_df, insurance_df_test):
    print(frame.shape)

In [None]:
# Data for all the independent variables
X = insurance_df.drop(labels='Modified_Response', axis=1)

# Data for the dependent variable
Y = insurance_df['Modified_Response']

# Column means of the labelled training features. The SAME statistics are used
# to impute both the training features and the competition test set, so the
# models see test data that was pre-processed exactly like their training data
# (filling the test set with its own means would shift the imputed values).
train_feature_means = X.mean()

# Filling remaining missing values with the training mean
X = X.fillna(train_feature_means)

X_TEST = insurance_df_test.drop(labels='Modified_Response', axis=1)
X_TEST = X_TEST.fillna(train_feature_means)


In [None]:
# Hold out 25% of the labelled rows for evaluation (fixed seed for repeatability).
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.25,
    random_state=1,
)

# Shapes of the training features / labels
print(X_train.shape, Y_train.shape)

# Shapes of the held-out features / labels
print(X_test.shape, Y_test.shape)

In [None]:
def check_scores(model, X_train, X_test, y_train=None, y_test=None):
    """Print and return accuracy / ROC-AUC of a fitted classifier on train and test data.

    Parameters
    ----------
    model : fitted estimator exposing ``predict`` and ``predict_proba``.
    X_train, X_test : feature matrices to score.
    y_train, y_test : true labels for ``X_train`` / ``X_test``. When omitted,
        the notebook-level ``Y_train`` / ``Y_test`` variables are used, which
        keeps existing three-argument calls working.

    Returns
    -------
    tuple
        ``(model, train_auc, test_auc, train_accuracy, test_accuracy)``.
    """
    # Fall back to the notebook globals only when labels are not passed explicitly.
    if y_train is None:
        y_train = Y_train
    if y_test is None:
        y_test = Y_test

    # Hard class predictions on train and test data
    train_class_preds = model.predict(X_train)
    test_class_preds = model.predict(X_test)

    # Class-probability predictions on train and test data
    train_preds = model.predict_proba(X_train)
    test_preds = model.predict_proba(X_test)

    # Accuracy on train and test
    train_accuracy = accuracy_score(y_train, train_class_preds)
    test_accuracy = accuracy_score(y_test, test_class_preds)

    print("The accuracy on train dataset is", train_accuracy)
    print("The accuracy on test dataset is", test_accuracy)
    print()

    # Confusion matrices for train and test
    train_cm = confusion_matrix(y_train, train_class_preds)
    test_cm = confusion_matrix(y_test, test_class_preds)

    print('Train confusion matrix:')
    print(train_cm)
    print()
    print('Test confusion matrix:')
    print(test_cm)
    print()

    # One-vs-rest ROC-AUC computed from the class probabilities
    train_auc = roc_auc_score(y_train, train_preds, multi_class="ovr")
    test_auc = roc_auc_score(y_test, test_preds, multi_class="ovr")

    print('ROC on train data:', train_auc)
    print('ROC on test data:', test_auc)

    return model, train_auc, test_auc, train_accuracy, test_accuracy

In [None]:
# Baseline 1: logistic regression with the 'sag' solver and default regularisation.
clf_log = LogisticRegression(random_state=0, solver='sag')
clf_log.fit(X_train, Y_train)

check_scores(model=clf_log, X_train=X_train, X_test=X_test)

In [None]:
# CART

from sklearn.tree import DecisionTreeClassifier

# Baseline 2: an unconstrained decision tree. A fixed random_state makes the
# tree (and therefore the reported scores) reproducible across runs, matching
# the seeded tree used in the grid search further down.
clf_cart = DecisionTreeClassifier(random_state=42).fit(X_train, Y_train)
check_scores(clf_cart, X_train, X_test)

In [None]:
#XGBOOST
import xgboost as xgb

# Baseline 3: gradient-boosted trees with default hyper-parameters.
# 'mlogloss' is the multi-class log-loss metric; 'logloss' is its binary
# counterpart and is not appropriate for this multi-class target.
# NOTE(review): use_label_encoder is only honoured by older xgboost releases —
# confirm against the installed version.
clf_xgb = xgb.XGBClassifier(random_state=42, use_label_encoder=True, eval_metric='mlogloss').fit(X_train, Y_train)
check_scores(clf_xgb, X_train, X_test)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
import random

# Search space: regularisation strength and penalty type.
# The grid's 'solver' entry replaces the solver given to the base estimator
# below for every candidate that is fitted.
param_grid = {
    'C': np.arange(0.001, 1, 0.15),
    'penalty': ['l2', 'l1'],
    'solver': ['liblinear'],
}

# 5-fold cross-validated grid search, ranked by macro-averaged F1.
gs_log = GridSearchCV(
    estimator=LogisticRegression(random_state=0, max_iter=1000, solver='sag'),
    param_grid=param_grid,
    scoring='f1_macro',
    cv=5,
    return_train_score=True,
    verbose=0,
)

# Seed Python's RNG, then run the search.
random.seed(42)
gs_log.fit(X_train, Y_train)

In [None]:
# Best logistic-regression candidate found by the grid search, scored on train/test.
clg_log_cv = gs_log.best_estimator_
check_scores(model=clg_log_cv, X_train=X_train, X_test=X_test)

In [None]:
# Search space for the decision tree: depth 3..9 and both split criteria.
param_grid = {
    "max_depth": np.arange(3, 10),
    "criterion": ['gini', 'entropy'],
}

# 5-fold cross-validated grid search, ranked by macro-averaged F1.
gs_cart = GridSearchCV(
    estimator=DecisionTreeClassifier(random_state=42),
    param_grid=param_grid,
    scoring='f1_macro',
    cv=5,
    return_train_score=True,
    verbose=0,
)

# Seed Python's RNG, then run the search.
random.seed(1)
gs_cart.fit(X_train, Y_train)

In [None]:
# Best decision-tree candidate found by the grid search, scored on train/test.
clg_cart_cv = gs_cart.best_estimator_
check_scores(model=clg_cart_cv, X_train=X_train, X_test=X_test)

In [None]:
# Search space for XGBoost: number of boosting rounds and tree depth 2..5.
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': np.arange(2, 6, 1),
}

# 5-fold cross-validated grid search, ranked by macro-averaged F1.
# 'mlogloss' is the multi-class log-loss metric; 'logloss' is its binary
# counterpart and is not appropriate for this multi-class target.
# NOTE(review): use_label_encoder is only honoured by older xgboost releases —
# confirm against the installed version.
gs_xgb = GridSearchCV(xgb.XGBClassifier(random_state=42, use_label_encoder=True, eval_metric='mlogloss'),
                  return_train_score=True,
                  param_grid=param_grid,
                  scoring='f1_macro',
                  cv=5, verbose = 0)

## Fit the model
random.seed(1)
gs_xgb.fit(X_train, Y_train)

In [None]:
# Best XGBoost candidate found by the grid search, scored on train/test.
clg_xgb_cv = gs_xgb.best_estimator_
check_scores(model=clg_xgb_cv, X_train=X_train, X_test=X_test)

In [None]:
# The earlier train/test split is reused; here the training part is split once
# more into an ensemble-training part (75%) and a validation part (25%),
# stratified on the target.
from sklearn import metrics

X_train_ensemble, X_val_ensemble, y_train_ensemble, y_val_ensemble = train_test_split(
    X_train,
    Y_train,
    train_size=0.75,
    random_state=6,
    stratify=Y_train,
)

def create_prediction_data(model_list, X_train, y_train, X_val, y_val, X_test, y_test, verbose=False):
    """Fit every model, average their class probabilities and report test scores.

    Each estimator in ``model_list`` is (re)fitted IN PLACE on
    ``(X_train, y_train)``. The ensemble probability is the plain mean of the
    individual ``predict_proba`` outputs on ``X_test``; the ensemble prediction
    is the label with the highest mean probability.

    Parameters
    ----------
    model_list : dict mapping a display name to an estimator exposing
        ``fit``, ``predict`` and ``predict_proba``.
    X_train, y_train : data used to fit every model.
    X_val, y_val : accepted for call compatibility; not used by this function.
    X_test, y_test : data on which AUC and accuracy are printed.
    verbose : when True, also print the per-model test AUC and accuracy.

    Raises
    ------
    ValueError
        If ``model_list`` is empty.
    """
    if not model_list:
        raise ValueError("model_list must contain at least one model")

    # predict_proba columns follow the sorted unique training labels, so the
    # arg-max column index is mapped back through this array instead of
    # assuming the labels are exactly 1..8.
    class_labels = np.unique(y_train)

    prob_sum = None
    for key, model in model_list.items():
        model.fit(X_train, y_train)
        model_prob = model.predict_proba(X_test)
        prob_sum = model_prob if prob_sum is None else prob_sum + model_prob
        if verbose:
            print("\n#### " + key +  " ####")
            print("Test AUC: ", metrics.roc_auc_score(y_test, model_prob,  multi_class="ovr"))
            print("Test Acc: ", metrics.accuracy_score(y_test, model.predict(X_test)))

    # Divide by the actual number of models so each row still sums to 1
    # regardless of how many models are in the ensemble.
    df_prob = prob_sum / len(model_list)

    print("Ensemble Test AUC: ", metrics.roc_auc_score(y_test, df_prob,  multi_class="ovr"))
    selected = class_labels[np.argmax(df_prob, axis=1)]
    print("Ensemble Test Acc: ", metrics.accuracy_score(y_test, selected))





In [None]:
# Members of the ensemble: the three tuned models from the grid searches.
d = dict(logreg=clg_log_cv, cart=clg_cart_cv, xgb=clg_xgb_cv)
# Alternative (untuned baselines):
# d = {'logreg': clf_log, 'cart': clf_cart, 'xgb': clf_xgb}

create_prediction_data(
    d,
    X_train_ensemble,
    y_train_ensemble,
    X_val_ensemble,
    y_val_ensemble,
    X_test,
    Y_test,
    verbose=True,
)

In [None]:
def create_prediction_data_noY(model_list,  X_test):
    """Return the ensemble's arg-max class index for every row of ``X_test``.

    The class probabilities of all already-fitted models are averaged and the
    column index (0-based) with the highest mean probability is returned per
    row. The number of classes is taken from the ``predict_proba`` output and
    the average uses the actual number of models.

    Parameters
    ----------
    model_list : dict mapping a name to a fitted estimator exposing
        ``predict_proba``.
    X_test : feature matrix passed unchanged to every ``predict_proba``.

    Returns
    -------
    numpy.ndarray
        1-D integer array of 0-based class indices, one per row.

    Raises
    ------
    ValueError
        If ``model_list`` is empty (there is nothing to average).
    """
    if not model_list:
        raise ValueError("model_list must contain at least one fitted model")

    per_model_prob = [np.asarray(model.predict_proba(X_test)) for model in model_list.values()]
    df_test_prob = np.mean(per_model_prob, axis=0)
    selected = np.argmax(df_test_prob, axis=1)
    return selected


# Ensemble prediction for the competition test set. The helper returns 0-based
# column indices, so 1 is added to obtain the submitted Response values.
Y_PRED = create_prediction_data_noY(d, X_TEST)
Y_PRED = (Y_PRED + 1).astype(int)

submission = pd.read_csv('../input/prudential-life-insurance-assessment/sample_submission.csv.zip', index_col='Id')

# Predictions are written positionally.
# NOTE(review): this assumes the rows of test.csv and sample_submission.csv are
# in the same Id order — confirm against the data files.
if len(Y_PRED) != len(submission):
    raise ValueError(
        "Number of predictions (%d) does not match the submission template (%d rows)"
        % (len(Y_PRED), len(submission))
    )

# Assign the whole column directly; chained assignment such as
# `submission['Response'][:] = ...` writes through an intermediate object and
# is not guaranteed to update the frame.
submission['Response'] = Y_PRED

submission.head()

submission.to_csv('submission.csv')

In [None]:

# Preview the first rows of the submission that was written to disk.
submission.head(n=5)
