| Column        | Meaning  | 
| ------------- |:-------------:|
|   id                   |   Unique ID for the customer   |
|   Gender               |   Gender of the customer   |
|   Age                  |   Age of the customer   |
|   Driving_License      |   0 : Customer does not have DL, 1 : Customer already has DL   |
|   Region_Code          |   Unique code for the region of the customer   |
|   Previously_Insured   |  1 : Customer already has Vehicle Insurance, 0 : Customer doesn't have Vehicle Insurance   |
|   Vehicle_Age          |   Age of the Vehicle   |
|   Vehicle_Damage       |   1 : Customer got his/her vehicle damaged in the past. 0 : Customer didn't get his/her vehicle damaged in the past.   |
|   Annual_Premium       |   The amount customer needs to pay as premium in the year   |
|   Policy_Sales_Channel |   Anonymized code for the channel used to reach the customer, i.e. different agents, over mail, over phone, in person, etc.   |
|   Vintage              |   Number of Days, Customer has been associated with the company   |
|   Response             |   1 : Customer is interested, 0 : Customer is not interested   |

In [None]:
import umap
import pickle
import warnings
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold, cross_val_score
from sklearn import metrics
from sklearn.preprocessing import MinMaxScaler
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA, TruncatedSVD, NMF, KernelPCA
from sklearn.neural_network import BernoulliRBM
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, plot_confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

In [None]:
train = pd.read_csv('../input/health-insurance-cross-sell-prediction/train.csv')
test = pd.read_csv('../input/health-insurance-cross-sell-prediction/test.csv')
train


# EDA

In [None]:
train.info()

In [None]:
train.describe()

In [None]:
plt.figure(figsize = (20, 10))
plt.hist(x = train['Age'], bins = 40)    #, bins = 'fd')
plt.show()

It looks like this company's portfolio contains a lot of young clients.

In [None]:
g = sns.FacetGrid(train, hue='Response', height = 7, aspect = 2)
g.map(sns.kdeplot, 'Age')
plt.legend()
plt.show()

But middle-aged clients are more interested.

In [None]:
g = sns.FacetGrid(train, hue='Gender', height = 7, aspect = 2)
g.map(sns.kdeplot, 'Age')
plt.legend()
plt.show()

In [None]:
g = sns.FacetGrid(train, hue='Vehicle_Age', height = 7, aspect = 2)
g.map(sns.kdeplot, 'Age')
plt.legend()
plt.show()

In [None]:
g = sns.FacetGrid(train, hue='Response', height = 7,  aspect = 2, xlim = (0, 125000))
g.map(sns.kdeplot, 'Annual_Premium')
plt.legend()
plt.show()

In [None]:
g = sns.FacetGrid(train, hue='Response', height = 5, aspect = 2)
g.map(sns.kdeplot, 'Vintage')
plt.legend()
plt.show()

In [None]:
# Pass the column by keyword: recent seaborn releases bind the first positional
# argument to `data`, not `x`, so a bare Series no longer means "count this column".
sns.countplot(x='Response', data=train)

In [None]:
# Pass the column by keyword: recent seaborn releases bind the first positional
# argument to `data`, not `x`, so a bare Series no longer means "count this column".
sns.countplot(x='Driving_License', data=train)

In [None]:
plt.figure(figsize=(12,8))
sns.violinplot(x="Driving_License", y="Age", hue="Response",
                    data=train, palette="muted", split=True)
plt.show()

In [None]:
plt.figure(figsize=(12,8))
sns.violinplot(x="Gender", y="Age", hue="Response",
                    data=train, palette="muted", split=True)
plt.show()

In [None]:
plt.figure(figsize=(12,8))
sns.scatterplot(x='Age',y='Annual_Premium',data=train,hue='Response')
plt.show()

In [None]:
plt.figure(figsize=(12,8))
sns.scatterplot(y='Vintage',x='Annual_Premium',data=train,hue='Response')
plt.show()

In [None]:
plt.figure(figsize=(12,8))
sns.boxplot(data=train, x = 'Gender', y = 'Age', orient='v')
plt.show()

In [None]:
plt.figure(figsize=(12,8))
sns.countplot(data = train, x = 'Vehicle_Age', hue = 'Response')
plt.show()

In [None]:
sns.catplot(x="Age", y="Response", row="Vehicle_Damage",
                kind="box", orient="h", height=3, aspect=3,
                data=train)
plt.show()

In [None]:
# Mean of the 0/1 Response flag per sales channel, i.e. the share of
# interested customers reached through each channel.
channels_resp = train.groupby('Policy_Sales_Channel')[['Response']].mean()

# Show the ten worst and the ten best channels by that share.
for title, ascending in (
    ('10 Channels with the lowest responce rate:', True),
    ('10 Channels with the highest responce rate:', False),
):
    print(title)
    display(channels_resp.sort_values(by='Response', ascending=ascending).head(10))

In [None]:
plt.figure(figsize=(12,8))
plt.hist(np.log(train['Annual_Premium']), bins = 40)
plt.show()

# Data preparation

In [None]:
# Regions that occur more than 5000 times in the training data; every other
# region code is later collapsed into a single bucket by region_filter.
region_counts = train['Region_Code'].value_counts()
allowed_regions = set(region_counts[region_counts > 5000].index)

In [None]:
# Sales channels that occur more than 5000 times in the training data; every
# other channel code is later collapsed into a single bucket by sales_channel_filter.
channel_counts = train['Policy_Sales_Channel'].value_counts()
allowed_sales_channels = set(channel_counts[channel_counts > 5000].index)

In [None]:
def uni_filter(x, filter_data):
    """Return ``x`` if it is a member of ``filter_data``, otherwise ``0``."""
    return x if x in filter_data else 0

def region_filter(x):
    """Keep region code ``x`` if it is in ``allowed_regions``; otherwise return 0."""
    return uni_filter(x, allowed_regions)

def sales_channel_filter(x):
    """Keep channel code ``x`` if it is in ``allowed_sales_channels``; otherwise return 0."""
    return uni_filter(x, allowed_sales_channels)

def dummification(df):
    """Replace the categorical columns of ``df`` with one-hot dummy columns.

    The columns ``Gender``, ``Region_Code``, ``Vehicle_Age``,
    ``Vehicle_Damage`` and ``Policy_Sales_Channel`` are dropped and replaced
    by the output of :func:`create_dummy` for each of them. All other
    columns are kept unchanged and come first in the result.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing all five categorical columns listed above.

    Returns
    -------
    pandas.DataFrame
        New frame; ``df`` itself is not modified.
    """
    # A tuple (not a set) keeps the column order identical on every run;
    # set iteration order over strings depends on hash randomization.
    categorical_cols = (
        'Gender',
        'Region_Code',
        'Vehicle_Age',
        'Vehicle_Damage',
        'Policy_Sales_Channel',
    )
    dummy_frames = [create_dummy(col, df) for col in categorical_cols]

    # One concat instead of re-concatenating inside a loop.
    return pd.concat(
        [df.drop(columns=list(categorical_cols)), *dummy_frames],
        axis=1,
    )


def create_dummy(col, df):
    """One-hot encode ``df[col]``, dropping the first category.

    The resulting columns are named ``'dum: <col>: <category>'``.
    """
    df_dummy = pd.get_dummies(df[col], drop_first=True)
    df_dummy.columns = [f'dum: {col}: {name}' for name in df_dummy.columns]
    return df_dummy

def processing(df):
    """Turn a raw customer frame into the numeric feature frame used for modeling.

    Works on a copy of ``df``: drops ``id``, log-transforms
    ``Annual_Premium``, replaces region / sales-channel codes that are not
    in the allowed sets with 0, one-hot encodes the categorical columns,
    spells out ``<`` and ``>`` in column names, and returns the columns
    sorted in descending name order.
    """
    prepared = df.copy()
    prepared.drop(columns=['id'], inplace=True)

    prepared['Annual_Premium'] = prepared['Annual_Premium'].apply(np.log)
    prepared['Region_Code'] = prepared['Region_Code'].apply(region_filter)
    prepared['Policy_Sales_Channel'] = prepared['Policy_Sales_Channel'].apply(sales_channel_filter)

    encoded = dummification(prepared)

    # NOTE(review): presumably '<' and '>' are removed because some model
    # libraries reject these characters in feature names - confirm.
    encoded.columns = [
        name.replace('<', 'less thn').replace('>', 'more thn')
        for name in encoded.columns
    ]
    return encoded.sort_index(axis=1, ascending=False)

df = processing(train)
df

# Dimensionality reduction

### Scaling

In [None]:
scaler = MinMaxScaler()
X_sc = scaler.fit_transform(df.drop('Response', axis = 1))
y_v = df['Response'].values

print(f'X_sc shape: {X_sc.shape}')
print(f'y_v shape: {y_v.shape}')

### PCA

2 components:

In [None]:
pca = PCA(n_components=2, random_state = 1)
df_pca = pca.fit_transform(X_sc)

In [None]:
df_vis = pd.DataFrame(df_pca)
df_vis['y'] = y_v

plt.figure(figsize = (12, 8))
sns.scatterplot(data = df_vis, x = 0, y = 1, hue = 'y',  palette = 'magma')
plt.show()

In [None]:
# Plot the *share* of total variance captured by each component, which is what
# the 'Variance ratio' axis label promises (explained_variance_ holds the
# absolute variances instead).
pca_variance = pca.explained_variance_ratio_

plt.figure(figsize=(6, 6))
plt.bar(['0', '1'], pca_variance, align='center', label='individual variance')
plt.legend()
plt.ylabel('Variance ratio')
plt.xlabel('Principal components')
plt.show()

3 components:

In [None]:
pca = PCA(n_components=3, random_state = 1)
df_pca = pca.fit_transform(X_sc)

In [None]:
df_vis = pd.DataFrame(df_pca)
df_vis['y'] = y_v

fig = plt.figure(figsize = (15, 8))

ax = fig.add_subplot(1, 2, 1, projection='3d')
ax.scatter3D(df_vis[0], df_vis[1], df_vis[2], c=df_vis['y'])
ax.view_init(10, 10)

ax = fig.add_subplot(1, 2, 2, projection='3d')
ax.scatter3D(df_vis[0], df_vis[1], df_vis[2], c=df_vis['y'])
ax.view_init(20, 35)

plt.show()

In [None]:
# Plot the *share* of total variance captured by each component, which is what
# the 'Variance ratio' axis label promises (explained_variance_ holds the
# absolute variances instead).
pca_variance = pca.explained_variance_ratio_

plt.figure(figsize=(6, 6))
plt.bar(['0', '1', '2'], pca_variance, align='center', label='individual variance')
plt.legend()
plt.ylabel('Variance ratio')
plt.xlabel('Principal components')
plt.show()

### Singular Value Decomposition

2 components:

In [None]:
svd = TruncatedSVD(n_components=2, random_state = 1)
df_svd = svd.fit_transform(X_sc)

In [None]:
df_vis = pd.DataFrame(df_svd)
df_vis['y'] = y_v

plt.figure(figsize = (12, 8))
sns.scatterplot(data = df_vis, x = 0, y = 1, hue = 'y',  palette = 'magma')
plt.show()

3 components:

In [None]:
svd = TruncatedSVD(n_components=3, random_state = 1)
df_svd = svd.fit_transform(X_sc)

In [None]:
df_vis = pd.DataFrame(df_svd)
df_vis['y'] = y_v

fig = plt.figure(figsize = (15, 8))

ax = fig.add_subplot(1, 2, 1, projection='3d')
ax.scatter3D(df_vis[0], df_vis[1], df_vis[2], c=df_vis['y'])
ax.view_init(10, 10)

ax = fig.add_subplot(1, 2, 2, projection='3d')
ax.scatter3D(df_vis[0], df_vis[1], df_vis[2], c=df_vis['y'])
ax.view_init(20, 35)

plt.show()

### Non-Negative Matrix Factorization (NMF)

2 components:

In [None]:
nmf = NMF(n_components=2, random_state = 1)
df_nmf = nmf.fit_transform(X_sc)

In [None]:
df_vis = pd.DataFrame(df_nmf)
df_vis['y'] = y_v

plt.figure(figsize = (12, 8))
sns.scatterplot(data = df_vis, x = 0, y = 1, hue = 'y',  palette = 'magma')
plt.show()

3 components:

In [None]:
nmf = NMF(n_components=3, random_state = 1)
df_nmf = nmf.fit_transform(X_sc)

In [None]:
df_vis = pd.DataFrame(df_nmf)
df_vis['y'] = y_v

fig = plt.figure(figsize = (15, 8))

ax = fig.add_subplot(1, 2, 1, projection='3d')
ax.scatter3D(df_vis[0], df_vis[1], df_vis[2], c=df_vis['y'])
ax.view_init(10, 10)

ax = fig.add_subplot(1, 2, 2, projection='3d')
ax.scatter3D(df_vis[0], df_vis[1], df_vis[2], c=df_vis['y'])
ax.view_init(20, 35)

plt.show()

### Restricted Boltzmann Machine

In [None]:
rbm = BernoulliRBM(n_components=2, random_state = 1)
df_rbm = rbm.fit_transform(X_sc)

In [None]:
df_vis = pd.DataFrame(df_rbm)
df_vis['y'] = y_v

plt.figure(figsize = (12, 8))
sns.scatterplot(data = df_vis, x = 0, y = 1, hue = 'y',  palette = 'magma')
plt.show()

### t-distributed Stochastic Neighbor Embedding

In [None]:
tsne = TSNE(
    n_components=2, 
    random_state = 1,
    n_iter = 1000,
    n_jobs = -1
)
df_tsne = tsne.fit_transform(X_sc)

In [None]:
df_vis = pd.DataFrame(df_tsne)
df_vis['y'] = y_v

plt.figure(figsize = (12, 8))
sns.scatterplot(data = df_vis, x = 0, y = 1, hue = 'y',  palette = 'magma')
plt.show()

### UMAP (Uniform Manifold Approximation and Projection)

In [None]:
umap_m = umap.UMAP(
    n_components=2, 
    random_state=1, 
    n_neighbors=5
)

df_umap = umap_m.fit_transform(X_sc)

In [None]:
df_vis = pd.DataFrame(df_umap)
df_vis['y'] = y_v

plt.figure(figsize = (12, 8))
sns.scatterplot(data = df_vis, x = 0, y = 1, hue = 'y',  palette = 'magma')
plt.show()

We can see that most of the dimensionality reduction methods used give us zones with a high rate of interested clients.

# Split

In [None]:
# Hold out 30% of the prepared rows for evaluation; the seed is fixed so the
# split is reproducible.
features = df.drop(columns='Response')
target = df['Response']
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.3,
    random_state=101,
)


# Modeling

In [None]:
def eval_result(model, X_test, y_test):
    """Print evaluation metrics for a fitted binary classifier.

    Outputs, in order: the classification report and confusion matrix of
    ``model`` on ``(X_test, y_test)``; the hold-out ROC AUC of the fitted
    model (when it exposes ``predict_proba``); the mean ROC AUC of a
    repeated stratified 10-fold cross-validation run on the same data; and
    the 20 largest feature importances (when the model exposes
    ``feature_importances_``).

    Parameters
    ----------
    model : estimator
        Already fitted classifier with a ``predict`` method.
    X_test : pandas.DataFrame or array-like
        Hold-out feature matrix.
    y_test : array-like
        Hold-out labels (0 = not interested, 1 = interested).

    Returns
    -------
    None
        Everything is printed / displayed.
    """
    with warnings.catch_warnings():
        # Warnings raised while scoring are deliberately silenced.
        warnings.simplefilter("ignore")

        pred = model.predict(X_test)
        print(classification_report(y_test, pred, target_names = ['Not interested', 'Interested']))
        display(pd.DataFrame(confusion_matrix(y_test, pred),
                             columns = ['Predicted Not interested', 'Predicted Interested'],
                             index = ['Not interested', 'Interested']))

        # ROC AUC of the model that was actually fitted, on the hold-out data.
        if hasattr(model, 'predict_proba'):
            holdout_auc = metrics.roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])
            print('Hold-out ROC AUC: %.3f' % holdout_auc)

        # NOTE: cross_val_score refits clones of `model` on folds of the data
        # passed in, so this number describes the estimator configuration,
        # not the fitted instance evaluated above.
        cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
        scores = cross_val_score(model, X_test, y_test, scoring='roc_auc', cv=cv, n_jobs=-1)
        print('Mean ROC AUC: %.3f' % np.mean(scores))

        if hasattr(model, 'feature_importances_'):
            # Take the names from the data passed in rather than from a
            # global training frame; fall back to positions for plain arrays.
            feature_names = getattr(X_test, 'columns', None)
            if feature_names is None:
                feature_names = list(range(len(model.feature_importances_)))
            features = pd.DataFrame({
                'Variable'  :feature_names,
                'Importance':model.feature_importances_
            })
            features.sort_values('Importance', ascending=False, inplace=True)
            display(features.head(20))

In [None]:
lgmodel = LogisticRegression(
    solver='lbfgs', 
    class_weight={0:1.0, 1:2.3},
    n_jobs = -1,
    random_state = 101
)
lgmodel.fit(X_train, y_train)
eval_result(lgmodel, X_test, y_test)

In [None]:
dtc = DecisionTreeClassifier(random_state = 101)
dtc.fit(X_train, y_train)
eval_result(dtc, X_test, y_test)

In [None]:
rfc = RandomForestClassifier(random_state = 101, n_jobs = -1)
rfc.fit(X_train, y_train)
eval_result(rfc, X_test, y_test)

In [None]:
xgbr = XGBClassifier(
    random_state = 1,
    n_jobs = -1,
    scale_pos_weight = 2,
    eval_metric = 'logloss'
)
xgbr.fit(X_train, y_train)
eval_result(xgbr, X_test, y_test)

# Commit

In [None]:
model = xgbr

In [None]:
df_test = processing(test)
# Give the test frame exactly the training feature columns, in the same order:
# a dummy column whose category is absent from the test data is added as all
# zeros, and any column the model was not trained on is dropped.
df_test = df_test.reindex(columns=X_train.columns, fill_value=0)
df_test

In [None]:
predictions = model.predict(df_test) 
predictions

In [None]:
answer = pd.DataFrame(data = {'id': test['id'], 'Response':predictions})
answer.to_csv('insurance sumb.csv', index=False)
answer

In [None]:
answer['Response'].value_counts()