In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
import matplotlib.pyplot as plt

from sklearn.metrics import roc_auc_score
from scipy.optimize import minimize


In [2]:
# Read the three competition CSV files (paths are relative to the working directory).
train, test, sample_submission = (
    pd.read_csv(csv_name)
    for csv_name in (
        "train_ZoGVYWq.csv",
        "test_66516Ee.csv",
        "sample_submission_sLex1ul.csv",
    )
)


In [3]:
# Keep the target as a plain array, then strip the non-feature columns from
# both frames in place ('renewal' exists only in the training data).
y = train['renewal'].values
for frame, unused_columns in ((train, ['id', 'renewal']), (test, ['id'])):
    frame.drop(unused_columns, axis=1, inplace=True)


In [4]:
# `result` is the same object as `sample_submission` (no copy is made); it is
# filled in later and written out as the submission file.
result = sample_submission
# Use bracket assignment: attribute-style assignment (`result.renewal = ...`)
# only writes a column when that column already exists, otherwise it sets a
# plain Python attribute and the value never reaches the CSV.
result['renewal'] = None
result['incentives'] = None
# Keep the test-set premiums; they are needed for the incentive optimisation
# after the feature frames have been deleted.
test_premia = test['premium']

In [5]:
# Preview the first five training rows after dropping 'id' and 'renewal'.
train.head(n=5)


Unnamed: 0,perc_premium_paid_by_cash_credit,age_in_days,Income,Count_3-6_months_late,Count_6-12_months_late,Count_more_than_12_months_late,application_underwriting_score,no_of_premiums_paid,sourcing_channel,residence_area_type,premium
0,0.429,12058,355060,0.0,0.0,0.0,99.02,13,C,Urban,3300
1,0.01,21546,315150,0.0,0.0,0.0,99.89,21,A,Urban,18000
2,0.917,17531,84140,2.0,3.0,1.0,98.69,7,C,Rural,3300
3,0.049,15341,250510,0.0,0.0,0.0,99.57,9,A,Urban,9600
4,0.052,31400,198680,0.0,0.0,0.0,99.87,12,B,Urban,9600


In [6]:
# Impute missing numeric values with each frame's own column means.
# numeric_only=True restricts the mean to the numeric columns; without it,
# recent pandas versions raise a TypeError because of the string columns
# (sourcing_channel, residence_area_type).
# Written as two statements so the cell no longer displays `(None, None)`.
train.fillna(train.mean(numeric_only=True), inplace=True)
test.fillna(test.mean(numeric_only=True), inplace=True)

In [7]:
# Add the same three engineered columns to both frames:
#   feat1 = premiums paid per day of age
#   feat4 = premium scaled by the fraction paid by cash/credit
#   feat6 = premium times number of premiums paid
for frame in (train, test):
    frame['feat1'] = frame.no_of_premiums_paid / frame.age_in_days
    frame['feat4'] = frame.perc_premium_paid_by_cash_credit * frame.premium
    frame['feat6'] = frame.premium * frame.no_of_premiums_paid

In [8]:
# Preview the training frame again, now including feat1 / feat4 / feat6.
train.head(n=5)

Unnamed: 0,perc_premium_paid_by_cash_credit,age_in_days,Income,Count_3-6_months_late,Count_6-12_months_late,Count_more_than_12_months_late,application_underwriting_score,no_of_premiums_paid,sourcing_channel,residence_area_type,premium,feat1,feat4,feat6
0,0.429,12058,355060,0.0,0.0,0.0,99.02,13,C,Urban,3300,0.001078,1415.7,42900
1,0.01,21546,315150,0.0,0.0,0.0,99.89,21,A,Urban,18000,0.000975,180.0,378000
2,0.917,17531,84140,2.0,3.0,1.0,98.69,7,C,Rural,3300,0.000399,3026.1,23100
3,0.049,15341,250510,0.0,0.0,0.0,99.57,9,A,Urban,9600,0.000587,470.4,86400
4,0.052,31400,198680,0.0,0.0,0.0,99.87,12,B,Urban,9600,0.000382,499.2,115200


In [9]:
# One-hot encode the categorical columns on the stacked train+test frame so
# that both splits get exactly the same dummy columns and the same dropped
# baseline level, even if a category is absent from one of them. Encoding the
# two frames separately only lines up when they happen to contain identical
# category levels.
assert list(train.columns) == list(test.columns), "train/test feature columns differ"
n_train = len(train)
dummy_all = pd.get_dummies(
    pd.concat([train, test], ignore_index=True),  # unique row labels for the stacked frame
    drop_first=True,
)
dummy_train = dummy_all.iloc[:n_train]
dummy_test = dummy_all.iloc[n_train:].reset_index(drop=True)
del dummy_all, n_train


In [10]:
# Drop the names of the raw frames; `result` still refers to the submission frame.
del train
del test
del sample_submission

In [11]:
# Sanity check: train and test must expose identical feature columns in the
# same order. Index.equals returns False on any mismatch, whereas an
# element-wise `==` raises ValueError when the column counts differ.
dummy_train.columns.equals(dummy_test.columns)

True

In [12]:
# Switch to plain NumPy arrays for scikit-learn (positional indexing is used in the CV loop).
x, x_test = dummy_train.values, dummy_test.values

In [13]:
# The encoded frames are no longer needed once `x` and `x_test` exist.
del dummy_train
del dummy_test

In [14]:
# Soft-voting ensemble of a random forest ('clf1') and a gradient-boosting
# model with exponential loss ('clf2').
# NOTE: neither estimator sets random_state, so results vary from run to run.
e_soft = VotingClassifier(
    estimators=[
        (
            'clf1',
            RandomForestClassifier(
                n_estimators=150,
                max_depth=16,
                bootstrap=True,
                min_samples_split=2,
                min_samples_leaf=10,
            ),
        ),
        (
            'clf2',
            GradientBoostingClassifier(loss='exponential'),
        ),
    ],
    voting='soft',
)

In [15]:
# 10-fold stratified cross-validation of the ensemble, scored by ROC AUC.
# shuffle=True is needed for random_state to have any effect; recent
# scikit-learn releases raise a ValueError when random_state is set while
# shuffle is left at its default of False.
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)
auc_scores = []
# `model` is an alias of `e_soft`; the same estimator object is refit on every
# fold. Binding it before the loop keeps the name defined for the later cells.
model = e_soft
for train_idx, val_idx in skf.split(x, y):
    x_train, x_val = x[train_idx], x[val_idx]
    y_train, y_val = y[train_idx], y[val_idx]

    model.fit(x_train, y_train)

    # Column 1 of predict_proba is the score for the second entry of
    # model.classes_ (presumably renewal == 1 -- confirm the label encoding).
    p_val = model.predict_proba(x_val)[:, 1]
    auc_scores.append(roc_auc_score(y_val, p_val))

    # Release the per-fold arrays before the next iteration.
    del x_train, y_train, x_val, y_val, p_val
    
    

In [16]:
# Average ROC AUC over the cross-validation folds.
mean_auc = sum(auc_scores) / len(auc_scores)
mean_auc

0.8437181283080483

In [17]:
# Refit the ensemble on the complete training set before predicting on the test set.
model.fit(X=x, y=y)

VotingClassifier(estimators=[('clf1', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=16, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=10, min_samples_split=2,
            min_w...      presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False))],
         flatten_transform=None, n_jobs=1, voting='soft', weights=None)

In [18]:
# Remove the `e_soft` name only; `model` was bound to the same estimator object
# and keeps it alive for the prediction step.
del e_soft

In [19]:
def get_hrs_inv(incentive):
    """Map an incentive amount to hours via 10 * (1 - exp(-incentive / 400)).

    The curve is 0 at incentive 0 and approaches 10 as the incentive grows.
    Works element-wise on scalars, NumPy arrays and pandas Series.
    Presumably "hours invested" by an agent -- confirm against the problem
    statement.
    """
    decay = np.exp(-incentive/400)
    return 10*(1-decay)

def get_improv_hrs(hours):
    """Map hours to an improvement figure via 20 * (1 - exp(-hours / 5)).

    The curve is 0 at 0 hours and approaches 20 as hours grow. Works
    element-wise on scalars, NumPy arrays and pandas Series. The result is
    treated as a percentage by `get_improv`, which multiplies it by 0.01.
    """
    decay = np.exp(-hours/5)
    return 20*(1-decay)

def get_improv(incentive):
    """Return the improvement produced by an incentive, as a fraction.

    Chains `get_hrs_inv` (incentive -> hours) and `get_improv_hrs`
    (hours -> improvement), then scales the result by 0.01.
    """
    hours = get_hrs_inv(incentive)
    improvement = get_improv_hrs(hours)
    return improvement*0.01
    

In [20]:
def get_inc_param2(proba, premia, param1, param2):
    """Total objective value for a linear incentive rule.

    The incentive for each row is ``param1 * premia + param2 * proba``. The
    per-row value is ``(proba + get_improv(inc)) * premia - inc`` and the
    function returns the sum of those values over all rows (presumably the
    total net revenue -- confirm against the problem statement).

    Parameters
    ----------
    proba : scalar or array-like
        Per-row probabilities.
    premia : scalar or array-like
        Per-row premiums, aligned with ``proba``.
    param1, param2 : float
        Coefficients of the linear incentive rule.

    Returns
    -------
    float
        Sum of the per-row values; NaN if any row is NaN.
    """
    inc = param1*premia + param2*proba
    tnr = (proba + get_improv(inc)) * premia - inc
    # Vectorised sum: the builtin `sum` walks the rows one by one in Python on
    # every objective evaluation. np.asarray keeps NaN propagation identical
    # to the builtin (Series.sum would silently skip NaN).
    return np.sum(np.asarray(tnr))
    

In [21]:
def to_minimize(params):
    """Objective for the optimiser: the negated value of `get_inc_param2`.

    `params` holds the two coefficients of the incentive rule. `proba` and
    `premia` are read from module-level names at call time, so both must be
    defined before this function is invoked.
    """
    total = get_inc_param2(proba, premia, *params)
    return -total

In [22]:
# Score the test set; column 1 of predict_proba is the score for the second
# entry of model.classes_ (presumably renewal == 1 -- confirm).
renewal_proba = model.predict_proba(x_test)[:, 1]
result.renewal = renewal_proba
# `proba` is read as a module-level name by `to_minimize`.
proba = result.renewal

In [23]:
# `to_minimize` reads the module-level names `proba` and `premia`. The test
# premiums were stored earlier as `test_premia`, so bind `premia` here;
# without this the cell fails with "NameError: name 'premia' is not defined".
premia = test_premia
# Fit the two coefficients of the linear incentive rule, starting from (1, 1).
params = minimize(to_minimize, [1, 1])
param1, param2 = params['x']
result['incentives'] = param1*premia + param2*proba

In [None]:
# The fitted model and the optimiser inputs are no longer needed.
del model
del premia
del proba

In [None]:
# Shift every incentive down by 130 and floor the result at zero.
# NOTE(review): the reason for the 130 offset is not visible here -- confirm.
shifted_incentives = result.incentives - 130
result.incentives = np.maximum(0, shifted_incentives)

In [None]:
# Write the submission file without the row index.
result.to_csv(path_or_buf="submission.csv", index=False)