In [None]:
import numpy as np
import pylab as pl
import pandas as pd
import matplotlib.pyplot as plt 
%matplotlib inline
import seaborn as sns
from sklearn.utils import shuffle
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix,classification_report
from sklearn.model_selection import cross_val_score, GridSearchCV

import os

In [None]:
# Load the labelled training set and the unlabelled test set from the
# Kaggle input directory.
DATA_DIR = "../input/health-insurance-cross-sell-prediction"
train = pd.read_csv(f"{DATA_DIR}/train.csv")
test = pd.read_csv(f"{DATA_DIR}/test.csv")

In [None]:
# Preview the first rows of the training frame to sanity-check the load.
train.head()

In [None]:
# (rows, columns) of the training frame.
train.shape

In [None]:
# Summary statistics for the numeric columns of the training frame.
train.describe()

In [None]:
# Class balance of the target. The Series is passed as the `x` keyword:
# newer seaborn releases treat the first positional argument as `data`,
# so the positional form no longer plots per-class counts.
sns.countplot(x=train.Response)
plt.show()

In [None]:
# Binary-encode Gender in both frames. The dict is indexed directly (instead
# of Series.map(dict)) so a label missing from the mapping still raises
# KeyError rather than silently becoming NaN.
# NOTE: not idempotent - re-running this cell once the column is numeric
# raises KeyError.
Gender = {"Male": 1, "Female": 0}
train["Gender"] = train["Gender"].map(lambda label: Gender[label])
test["Gender"] = test["Gender"].map(lambda label: Gender[label])

In [None]:
# Ordinal-encode Vehicle_Age in both frames (oldest bucket -> 0, newest -> 2).
# Direct dict indexing keeps the KeyError on any label missing from the mapping.
# NOTE: not idempotent - re-running this cell once the column is numeric
# raises KeyError.
Vehicle_Age = {"> 2 Years": 0, "1-2 Year": 1, "< 1 Year": 2}
train["Vehicle_Age"] = train["Vehicle_Age"].map(lambda label: Vehicle_Age[label])
test["Vehicle_Age"] = test["Vehicle_Age"].map(lambda label: Vehicle_Age[label])

In [None]:
# Binary-encode Vehicle_Damage in both frames.
# Gotcha: the encoding is inverted relative to the usual convention -
# "Yes" maps to 0 and "No" maps to 1.
# NOTE: not idempotent - re-running this cell once the column is numeric
# raises KeyError.
Vehicle_Damage = {"Yes": 0, "No": 1}
train["Vehicle_Damage"] = train["Vehicle_Damage"].map(lambda label: Vehicle_Damage[label])
test["Vehicle_Damage"] = test["Vehicle_Damage"].map(lambda label: Vehicle_Damage[label])

In [None]:
# Separate the feature matrix from the target; `id` is a row identifier and
# is excluded from the features.
x = train.drop(columns=["id", "Response"])
y = train["Response"]

In [None]:
# Preview the feature matrix (train without `id` and `Response`).
x.head()

In [None]:
# Preview the target column.
y.head()

In [None]:
from imblearn.under_sampling import RandomUnderSampler

# Randomly under-sample the data so the classes are balanced; the fixed
# random_state makes the selected rows reproducible.
# NOTE(review): this runs before the train/validation split below, so the
# validation set is drawn from the resampled (balanced) data as well.
rus = RandomUnderSampler(random_state=0)
x_resampled, y_resampled = rus.fit_resample(X=x, y=y)

In [None]:
# Shapes before under-sampling, for comparison with the resampled shapes.
x.shape, y.shape

In [None]:
# Shapes after under-sampling.
x_resampled.shape, y_resampled.shape

In [None]:
# Class balance after under-sampling. Passed as the `x` keyword because
# newer seaborn releases treat the first positional argument as `data`.
sns.countplot(x=y_resampled)
plt.show()

In [None]:
# Inspect the resampled target before label encoding.
y_resampled

In [None]:
from sklearn.preprocessing import LabelEncoder

# Encode the resampled target with LabelEncoder. The result replaces
# y_resampled with the encoder's array output, which the positional indexing
# in the split cell below relies on.
le = LabelEncoder()
le.fit(y_resampled)
y_resampled = le.transform(y_resampled)

In [None]:
# Inspect the target after label encoding.
y_resampled

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit

# One stratified 80/20 split of the resampled data into train / validation.
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=0)

# n_splits=1, so the generator yields exactly one (train, test) index pair.
features = x_resampled.values
train_index, test_index = next(sss.split(features, y_resampled))
print(len(train_index))
print(len(test_index))

x_train, x_val = features[train_index], features[test_index]
y_train, y_val = y_resampled[train_index], y_resampled[test_index]

In [None]:
# Feature-matrix shapes of the train and validation splits.
print(x_train.shape, x_val.shape)

In [None]:
# Target shapes of the train and validation splits.
print(y_train.shape, y_val.shape)

In [None]:
# Class balance of the validation split. Passed as the `x` keyword because
# newer seaborn releases treat the first positional argument as `data`.
sns.countplot(x=y_val)
plt.show()

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise features using statistics learned from the training split only;
# the validation split is transformed with those same statistics.
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_val_scaled = scaler.transform(x_val)

In [None]:
from sklearn.decomposition import PCA

# A float n_components asks PCA to keep as many components as needed to
# explain that fraction (here 90%) of the variance.
pca = PCA(n_components=0.9)

# Fit on the scaled training split only, then apply the same projection to
# the scaled validation split.
x_train_pca = pca.fit_transform(x_train_scaled)
x_val_pca = pca.transform(x_val_scaled)

In [None]:
# Shapes after PCA; the second dimension is the number of retained components.
print(x_train_pca.shape, x_val_pca.shape)

In [None]:
# Cumulative share of variance explained as principal components are added.
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
# Plot against 1-based component counts: plt.plot(values) alone places the
# first component at x=0, which reads as "zero components".
component_counts = np.arange(1, len(cumulative_variance) + 1)

plt.plot(component_counts, cumulative_variance, "o-")
plt.xlabel("number of principal components")
plt.ylabel("cumulative explained variance ratio")
plt.show()

In [None]:
from xgboost import XGBClassifier

# Baseline model: XGBClassifier with library-default hyper-parameters,
# trained on the PCA-projected training split.
# Note: the name `xgb` is rebound to a differently configured model in a
# later cell.
xgb = XGBClassifier()
xgb.fit(x_train_pca, y_train)

In [None]:
# Class probabilities of the baseline model on the validation split.
y_pre_proba = xgb.predict_proba(x_val_pca)

In [None]:
# Inspect the baseline validation probabilities.
y_pre_proba

In [None]:
from sklearn.metrics import log_loss

# Validation log-loss of the baseline model (mean per-sample loss).
# The `eps` argument is intentionally not passed: it is deprecated in
# scikit-learn and rejected by recent releases, and 1e-15 was the historical
# default anyway.
log_loss(y_val, y_pre_proba, normalize=True)

In [None]:
# Show the hyper-parameters of the fitted baseline model. get_params must be
# called; the bare attribute only displays the bound method.
xgb.get_params()

In [None]:
# scores_ne = []
# n_estimators = [100, 200, 300, 400, 500, 600, 700]

In [None]:
# for nes in n_estimators:
#     print("n_estimators:", nes)
#     xgb = XGBClassifier(max_depth=3,
#                         learning_rate=0.1,
#                         n_estimators=nes,
#                         objective="binary:logistic",
#                         n_jobs=-1,
#                         nthread=4,
#                         min_child_weight=1,
#                         subsample=1,
#                         colsample_bytree=1,
#                         seed=42)
    
#     xgb.fit(x_train_pca, y_train)
#     y_pre = xgb.predict_proba(x_val_pca)
#     score = log_loss(y_val, y_pre)
#     scores_ne.append(score)
    
#     print("log_loss:{}".format(score))

In [None]:
# plt.plot(n_estimators, scores_ne, "o-")

# plt.xlabel("n_estimators")
# plt.ylabel("log_loss")
# plt.show()

# print("optimal n_estimator:{}".format(n_estimators[np.argmin(scores_ne)]))

In [None]:
# scores_md = []
# max_depths = [1,3,5,6,7]

In [None]:
# for md in max_depths:
#     print("max_depth:", md)
#     xgb = XGBClassifier(max_depth=md,
#                         learning_rate=0.1,
#                         n_estimators=n_estimators[np.argmin(scores_ne)],
#                         objective="binary:logistic",
#                         n_jobs=-1,
#                         nthread=4,
#                         min_child_weight=1,
#                         subsample=1,
#                         colsample_bytree=1,
#                         seed=42)
    
#     xgb.fit(x_train_pca, y_train)
#     y_pre = xgb.predict_proba(x_val_pca)
#     score = log_loss(y_val, y_pre)
#     scores_md.append(score)
    
#     print("log_loss:{}".format(score))

In [None]:
# plt.plot(max_depths, scores_md, "o-")

# plt.xlabel("max_depths")
# plt.ylabel("log_loss")
# plt.show()

# print("optimal max_depth:{}".format(max_depths[np.argmin(scores_md)]))

In [None]:
# scores_mcw = []
# min_child_weights = [1,2,3,5,7]

In [None]:
# for mcw in min_child_weights:
#     print("min_child_weight:", mcw)
#     xgb = XGBClassifier(max_depth=max_depths[np.argmin(scores_md)],
#                         learning_rate=0.1,
#                         n_estimators=n_estimators[np.argmin(scores_ne)],
#                         objective="binary:logistic",
#                         n_jobs=-1,
#                         nthread=4,
#                         min_child_weight=mcw,
#                         subsample=1,
#                         colsample_bytree=1,
#                         seed=42)
    
#     xgb.fit(x_train_pca, y_train)
#     y_pre = xgb.predict_proba(x_val_pca)
#     score = log_loss(y_val, y_pre)
#     scores_mcw.append(score)
    
#     print("log_loss:{}".format(score))

In [None]:
# plt.plot(min_child_weights, scores_mcw, "o-")

# plt.xlabel("min_child_weight")
# plt.ylabel("log_loss")
# plt.show()

# print("optimal min_child_weight:{}".format(min_child_weights[np.argmin(scores_mcw)]))

In [None]:
# scores_ss = []
# subsamples = [0.1,0.3,0.5,0.7,0.9,1]

In [None]:
# for ss in subsamples:
#     print("subsample:", ss)
#     xgb = XGBClassifier(max_depth=max_depths[np.argmin(scores_md)],
#                         learning_rate=0.1,
#                         n_estimators=n_estimators[np.argmin(scores_ne)],
#                         objective="binary:logistic",
#                         n_jobs=-1,
#                         nthread=4,
#                         min_child_weight=min_child_weights[np.argmin(scores_mcw)],
#                         subsample=ss,
#                         colsample_bytree=1,
#                         seed=42)
    
#     xgb.fit(x_train_pca, y_train)
#     y_pre = xgb.predict_proba(x_val_pca)
#     score = log_loss(y_val, y_pre)
#     scores_ss.append(score)
    
#     print("log_loss:{}".format(score))

In [None]:
# plt.plot(subsamples, scores_ss, "o-")

# plt.xlabel("subsamples")
# plt.ylabel("log_loss")
# plt.show()

# print("optimal subsample:{}".format(subsamples[np.argmin(scores_ss)]))

In [None]:
# scores_cb = []
# colsample_bytrees = [0.1,0.3,0.5,0.7,0.9,1]

In [None]:
# for cb in colsample_bytrees:
#     print("colsample_bytree:", cb)
#     xgb = XGBClassifier(max_depth=max_depths[np.argmin(scores_md)],
#                         learning_rate=0.1,
#                         n_estimators=n_estimators[np.argmin(scores_ne)],
#                         objective="binary:logistic",
#                         n_jobs=-1,
#                         nthread=4,
#                         min_child_weight=min_child_weights[np.argmin(scores_mcw)],
#                         subsample=subsamples[np.argmin(scores_ss)],
#                         colsample_bytree=cb,
#                         seed=42)
    
#     xgb.fit(x_train_pca, y_train)
#     y_pre = xgb.predict_proba(x_val_pca)
#     score = log_loss(y_val, y_pre)
#     scores_cb.append(score)
    
#     print("log_loss:{}".format(score))

In [None]:
# plt.plot(colsample_bytrees, scores_cb, "o-")

# plt.xlabel("colsample_bytrees")
# plt.ylabel("log_loss")
# plt.show()

# print("optimal colsample_bytree:{}".format(colsample_bytrees[np.argmin(scores_cb)]))

In [None]:
# Final model with explicit hyper-parameters, trained on the PCA-projected
# training split.
# NOTE(review): the values presumably come from the commented-out sweeps
# above - confirm before changing them.
# - Threading is set through n_jobs only; the original also passed the legacy
#   alias nthread=4, which contradicts n_jobs=-1.
# - random_state replaces the legacy `seed` alias; the seed value is unchanged.
xgb = XGBClassifier(
    max_depth=3,
    learning_rate=0.1,
    n_estimators=300,
    objective="binary:logistic",
    n_jobs=-1,
    min_child_weight=1,
    subsample=0.5,
    colsample_bytree=1,
    random_state=42,
)
xgb.fit(x_train_pca, y_train)

In [None]:
# Class probabilities of the final model on the validation split.
y_pre_proba1 = xgb.predict_proba(x_val_pca)
y_pre_proba1

In [None]:
# Validation log-loss of the final model (mean per-sample loss).
# The `eps` argument is intentionally not passed: it is deprecated in
# scikit-learn and rejected by recent releases.
log_loss(y_val, y_pre_proba1, normalize=True)

In [None]:
# Preview the (already encoded) test frame.
test.head()

In [None]:
# Drop the row identifier from the test frame; `id` is excluded from the
# training features as well.
test_drop_id = test.drop(columns=["id"])
test_drop_id.head()

In [None]:
# Standardise the test features with the scaler that was fitted on the
# training split. Fitting a fresh StandardScaler on the test set (as this cell
# used to do) centres and scales it with different statistics than the ones
# the model was trained on.
# `.values` matches how the scaler was fitted (on a plain array).
# Assumes test.csv carries the same feature columns, in the same order, as
# train.csv minus `id`/`Response` - TODO confirm.
test_scaled = scaler.transform(test_drop_id.values)

# Kept for backward compatibility; nothing visible in this notebook reads it.
test_val_scaled = scaler.transform(x_val)

In [None]:
# Project the scaled TEST features with the PCA that was fitted on the
# training split. The previous code re-fitted PCA and transformed
# x_train_scaled, so `test_pca` actually held training rows and the
# submission was predicted from the wrong data.
test_pca = pca.transform(test_scaled)

In [None]:
# Class probabilities of the final model for the rows in `test_pca`.
test_pre_proba = xgb.predict_proba(test_pca)
test_pre_proba

In [None]:
# Wrap the probability matrix in a frame with one column per class:
# Response_0 and Response_1.
proba_columns = [f"Response_{class_idx}" for class_idx in range(2)]
result = pd.DataFrame(test_pre_proba, columns=proba_columns)

In [None]:
# Preview the probability columns before the id column is added.
result.head()

In [None]:
# Put the test ids in the first column of the submission frame.
# NOTE: not idempotent - DataFrame.insert raises if `id` already exists, so
# re-running this cell fails.
result.insert(0, "id", test["id"])

In [None]:
# Write the submission file without the pandas index, then preview it.
result.to_csv('submission.csv', index = False)
result.head()