### Import Packages

In [None]:
import pandas as pd
import numpy as np
import math
import os
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

from xgboost import XGBClassifier
import xgboost as xgb

from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score

from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split

from scipy.optimize import fmin_tnc, differential_evolution, fmin_l_bfgs_b
import pymonad


In [4]:
data_dir = "data/"

In [5]:
os.listdir(data_dir)

['.DS_Store', 'sample_submission.csv', 'test.csv', 'train.csv']

In [6]:
df = pd.read_csv(data_dir + 'train.csv')

In [7]:
test_df = pd.read_csv(data_dir + 'test.csv')
samp_df = pd.read_csv(data_dir + 'sample_submission.csv')

In [8]:
df.head()

Unnamed: 0,id,perc_premium_paid_by_cash_credit,age_in_days,Income,Count_3-6_months_late,Count_6-12_months_late,Count_more_than_12_months_late,application_underwriting_score,no_of_premiums_paid,sourcing_channel,residence_area_type,premium,renewal
0,110936,0.429,12058,355060,0.0,0.0,0.0,99.02,13,C,Urban,3300,1
1,41492,0.01,21546,315150,0.0,0.0,0.0,99.89,21,A,Urban,18000,1
2,31300,0.917,17531,84140,2.0,3.0,1.0,98.69,7,C,Rural,3300,0
3,19415,0.049,15341,250510,0.0,0.0,0.0,99.57,9,A,Urban,9600,1
4,99379,0.052,31400,198680,0.0,0.0,0.0,99.87,12,B,Urban,9600,1


In [9]:
test_df.head()

Unnamed: 0,id,perc_premium_paid_by_cash_credit,age_in_days,Income,Count_3-6_months_late,Count_6-12_months_late,Count_more_than_12_months_late,application_underwriting_score,no_of_premiums_paid,sourcing_channel,residence_area_type,premium
0,649,0.001,27384,51150,0.0,0.0,0.0,99.89,7,A,Rural,3300
1,81136,0.124,23735,285140,0.0,0.0,0.0,98.93,19,A,Urban,11700
2,70762,1.0,17170,186030,0.0,0.0,0.0,,2,B,Urban,11700
3,53935,0.198,16068,123540,0.0,0.0,0.0,99.0,11,B,Rural,5400
4,15476,0.041,10591,200020,1.0,0.0,0.0,99.17,14,A,Rural,9600


In [10]:
df["renewal"].value_counts()

1    74855
0     4998
Name: renewal, dtype: int64

In [11]:
class_wt = 4998.0/74855

In [12]:
df[df["renewal"]==0]['no_of_premiums_paid'].describe()

count    4998.000000
mean       10.410364
std         5.784630
min         2.000000
25%         6.000000
50%        10.000000
75%        14.000000
max        59.000000
Name: no_of_premiums_paid, dtype: float64

In [13]:
df[df["renewal"]==1]['no_of_premiums_paid'].describe()

count    74855.000000
mean        10.894169
std          5.125691
min          2.000000
25%          7.000000
50%         10.000000
75%         14.000000
max         60.000000
Name: no_of_premiums_paid, dtype: float64

In [14]:
df['no_of_premiums_paid'].describe()

count    79853.000000
mean        10.863887
std          5.170687
min          2.000000
25%          7.000000
50%         10.000000
75%         14.000000
max         60.000000
Name: no_of_premiums_paid, dtype: float64

In [15]:
df['age_in_years'] = df['age_in_days'].apply(lambda x: x/365)

In [16]:
test_df['age_in_years'] = test_df['age_in_days'].apply(lambda x: x/365)

In [17]:
df.dropna().shape, df.shape

((76855, 14), (79853, 14))

In [18]:
df['age_in_years'].describe()

count    79853.000000
mean        51.607404
std         14.270484
min         21.000000
25%         41.000000
50%         51.000000
75%         62.000000
max        103.000000
Name: age_in_years, dtype: float64

In [19]:
df1 = pd.get_dummies(df)

In [20]:
test_df1 = pd.get_dummies(test_df)

In [21]:
test_df1.shape

(34224, 18)

In [22]:
df1.shape

(79853, 19)

In [25]:
df1.columns

Index([u'id', u'perc_premium_paid_by_cash_credit', u'age_in_days', u'Income',
       u'Count_3-6_months_late', u'Count_6-12_months_late',
       u'Count_more_than_12_months_late', u'application_underwriting_score',
       u'no_of_premiums_paid', u'premium', u'renewal', u'age_in_years',
       u'sourcing_channel_A', u'sourcing_channel_B', u'sourcing_channel_C',
       u'sourcing_channel_D', u'sourcing_channel_E',
       u'residence_area_type_Rural', u'residence_area_type_Urban'],
      dtype='object')

In [30]:
# feature_cols = df1.columns - ['id', 'renewal', 'age_in_days', 'Income', 'premium_log', 'age_bin', 'age_in_years']
feature_cols = list(set(df1.columns) - set(['id', 'renewal', 'age_in_days']))

In [31]:
feature_cols

['premium',
 'sourcing_channel_E',
 'sourcing_channel_D',
 'age_in_years',
 'sourcing_channel_A',
 'sourcing_channel_C',
 'sourcing_channel_B',
 'application_underwriting_score',
 'residence_area_type_Urban',
 'Count_6-12_months_late',
 'Income',
 'Count_more_than_12_months_late',
 'Count_3-6_months_late',
 'perc_premium_paid_by_cash_credit',
 'no_of_premiums_paid',
 'residence_area_type_Rural']

In [24]:
# xgb1 = XGBClassifier(
#     learning_rate =0.1, n_estimators=1000,
#     max_depth=8, min_child_weight=1, gamma=0,subsample=0.8,colsample_bytree=0.8,
#     objective= 'binary:logistic', n_jobs=4,scale_pos_weight=1,seed=27, missing=np.nan)

In [25]:
# xgb1.fit(train_df[feature_cols], train_df['renewal'])

In [26]:
# params = {'eta1': 0.02, 'max_depth': 5, 'subsample': 0.8, 'colsample_bytree': 0.8, 'objective': 'binary:logistic', 'seed': 99, 'silent': 0, 'eval_metric':'auc', 'n_jobs':4}


In [27]:
#  scale_pos_weight=class_wt

In [35]:
xgb1 = XGBClassifier(
    learning_rate=0.01, 
    n_estimators=400,
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.7,
    colsample_bytree=0.7,
    objective='binary:logistic',
    n_jobs=-1,
    seed=99, 
#     scale_pos_weight=class_wt,
    missing=np.nan, 
    nthread=4, 
    eval_metric='auc'
)

In [36]:
# A parameter grid for XGBoost
random_grid = {
        'min_child_weight': [1, 5, 10],
        'gamma': [0.5, 1, 1.5, 2, 5],
        'subsample': [0.6, 0.7, 0.8],
        'colsample_bytree': [0.6, 0.7, 0.8],
        'max_depth': [5, 6, 7]
        }

In [None]:
xgb_random = RandomizedSearchCV(estimator = xgb1, param_distributions = random_grid, n_iter = 20, cv = 3, verbose=2, random_state=42, n_jobs = -1, scoring='roc_auc')
xgb_random.fit(df1[feature_cols], df1["renewal"])

Fitting 3 folds for each of 20 candidates, totalling 60 fits
[CV] subsample=0.7, colsample_bytree=0.6, max_depth=6, gamma=1.5, min_child_weight=10 
[CV] subsample=0.7, colsample_bytree=0.6, max_depth=6, gamma=1.5, min_child_weight=10 
[CV] subsample=0.7, colsample_bytree=0.6, max_depth=6, gamma=1.5, min_child_weight=10 
[CV] subsample=0.8, colsample_bytree=0.7, max_depth=5, gamma=2, min_child_weight=1 
[CV]  subsample=0.8, colsample_bytree=0.7, max_depth=5, gamma=2, min_child_weight=1, total=  31.6s
[CV] subsample=0.8, colsample_bytree=0.7, max_depth=5, gamma=2, min_child_weight=1 
[CV]  subsample=0.7, colsample_bytree=0.6, max_depth=6, gamma=1.5, min_child_weight=10, total=  32.8s
[CV]  subsample=0.7, colsample_bytree=0.6, max_depth=6, gamma=1.5, min_child_weight=10, total=  32.8s
[CV] subsample=0.8, colsample_bytree=0.7, max_depth=5, gamma=2, min_child_weight=1 
[CV] subsample=0.7, colsample_bytree=0.8, max_depth=5, gamma=5, min_child_weight=10 
[CV]  subsample=0.7, colsample_bytree=

[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  5.8min


[CV]  subsample=0.6, colsample_bytree=0.6, max_depth=7, gamma=1.5, min_child_weight=10, total=  38.1s
[CV]  subsample=0.6, colsample_bytree=0.6, max_depth=7, gamma=1.5, min_child_weight=10, total=  38.2s
[CV] subsample=0.7, colsample_bytree=0.8, max_depth=6, gamma=0.5, min_child_weight=5 
[CV] subsample=0.6, colsample_bytree=0.8, max_depth=6, gamma=5, min_child_weight=1 
[CV]  subsample=0.7, colsample_bytree=0.8, max_depth=6, gamma=0.5, min_child_weight=5, total=  38.9s
[CV]  subsample=0.7, colsample_bytree=0.8, max_depth=6, gamma=0.5, min_child_weight=5, total=  39.4s
[CV] subsample=0.6, colsample_bytree=0.8, max_depth=6, gamma=5, min_child_weight=1 
[CV] subsample=0.6, colsample_bytree=0.8, max_depth=6, gamma=5, min_child_weight=1 
[CV]  subsample=0.7, colsample_bytree=0.8, max_depth=6, gamma=0.5, min_child_weight=5, total=  38.9s
[CV] subsample=0.8, colsample_bytree=0.7, max_depth=7, gamma=0.5, min_child_weight=1 
[CV]  subsample=0.6, colsample_bytree=0.8, max_depth=6, gamma=5, min_

In [None]:
xgb_random.best_params_

In [None]:
xgb_random.best_score_

In [None]:
n_folds = 5
early_stopping = 50

xg_train = xgb.DMatrix(df1[feature_cols], label=df1["renewal"])
cv = xgb.cv(xgb1.get_params(), xg_train, 2000, nfold=n_folds, early_stopping_rounds=early_stopping, verbose_eval=1)

In [None]:
cv.shape[0]

In [None]:
# df["boost_pred"] = xgb1.predict(val_df[feature_cols])
xgb1.set_params(n_estimators=cv.shape[0])
# xgb1.set_params(n_estimators=390)

In [None]:
xgb1.fit(df1[feature_cols], df1["renewal"], eval_metric="auc")

In [None]:
# train_pred = xgb1.predict_proba(train_df[feature_cols])
# val_pred = xgb1.predict_proba(val_df[feature_cols])
# train_renewal_prob = map(lambda p: p[1], train_pred)
# val_renewal_prob = map(lambda p: p[1], val_pred)

In [None]:
test_pred = xgb1.predict_proba(test_df1[feature_cols])
test_df1["renewal"] = map(lambda p: p[1], test_pred)

In [None]:
# val_auc = roc_auc_score(val_df["renewal"], val_renewal_prob)
# train_auc = roc_auc_score(train_df["renewal"], train_renewal_prob)
# val_auc, train_auc

In [None]:
def inc_to_delta_prob(inc):
    return effort_to_delta_prob(inc_to_efforts(inc))

def inc_to_efforts(inc):
    return 10*(1 - math.exp(-inc*1.0/400))

def effort_to_delta_prob(effort):
    return 20*(1 - math.exp(-effort*1.0/5))

In [42]:
eps = 1e-6

@pymonad.curry
def grad_rev_inc(prem, x):
    return -(40*prem*math.exp(-2*x) - 400.0/(1-x))

@pymonad.curry
def rev_inc(prem, x):
    val = -(20*prem*(1 - math.exp(-2*x)) + 400*math.log(1 - x))
    return val

def get_incentive(effort):
    return -400*math.log(1-effort/10.0)

def get_incentive_for_policy(prob_renewal, premium):
    max_delta_p = 100*(1 - prob_renewal)
    max_delta_p = min(max_delta_p, 20-eps)
    incentive = 0
    if max_delta_p > 0:
        max_effort_needed = -5*math.log(1 - max_delta_p/20.0)
        max_effort_needed = min(max_effort_needed, 10-eps)
        bounds = [(eps, max_effort_needed/10.0)]
        res = fmin_l_bfgs_b(rev_inc(premium), 0.2, fprime=grad_rev_inc(premium), bounds=bounds, pgtol=1e-8)
        effort = res[0][0]*10
        incentive = get_incentive(effort)
        if max_delta_p * premium > incentive:
            return incentive
    return 0

In [None]:
# test_df1

In [43]:
test_df1['incentives1'] = test_df1.apply(lambda row: get_incentive_for_policy(row["renewal"], row["premium"]), axis=1)

In [53]:
samp_df.head(2)

In [45]:
test_df1[["id", "pred", "incentives"]].shape

In [52]:
test_df1 = test_df1.rename(columns={"pred": "renewal"})

In [None]:
sum(test_df1["incentives"])

In [65]:
test_df1["incentives"] = test_df1.apply(lambda x: cal_incentive(x["premium"], x["renewal"]), axis=1)

In [None]:
# test_df1["renewal"].value_counts()

In [68]:
# sum(test_df1["incentives"])

16823761.301665306

In [67]:
test_df1["incentives"].describe()

count    34224.000000
mean       491.577878
std        776.909615
min          7.015014
25%         97.568250
50%        223.291773
75%        559.413922
max      12000.000000
Name: incentives, dtype: float64

In [48]:
# test_df1["incentives1"].describe()

count    34224.000000
mean       206.079236
std        500.749673
min          5.616118
25%         13.096467
50%         28.275978
75%         85.262627
max       2680.788310
Name: incentives1, dtype: float64

In [56]:
# test_df1["incentives"] = test_df1["incentives1"] 

In [76]:
test_df1["incentives"] = test_df1["incentives"].apply(lambda x: 100 if x > 100 else x)

In [77]:
sum(test_df1["incentives"])

3039271.4862525463

In [78]:
test_df1["incentives"].describe()

count    34224.000000
mean        88.805268
std         23.077639
min          7.015014
25%         97.568250
50%        100.000000
75%        100.000000
max        100.000000
Name: incentives, dtype: float64

In [80]:
test_df1[["id", "renewal", "incentives"]].to_csv("submission_july_23_9.csv", index=False)

In [74]:
sub_df = pd.read_csv("submission_july_23_8.csv")

In [75]:
sum(sub_df["incentives"])

12345137.544122923