https://datahack.analyticsvidhya.com/contest/mckinsey-analytics-online-hackathon-4/

Part 1 
--
Level 1 models: Uses 8 orginal features and three derived features to build Decision Tree, Random Forest, Boosted trees (XGBoost) and Logistic regression (balanced by class weights). Tried SVM and KNN, they take too long to fine tune.

Ensemble model: Hyper-parameter tuned XGBoost model with all the original features, 5 derived features and level 1 predictions as inputs. 

Part 2
--
Basin hopping for optimization of the cost function

# Part 1 - Predicting Insurance Renewal

In [43]:
import pandas as pd
import numpy as np
from sklearn.base import TransformerMixin

In [44]:
from sklearn.model_selection import cross_val_score

In [45]:
train = pd.read_csv("/home/sanjay/Downloads/chk/train_ZoGVYWq.csv")
test = pd.read_csv("/home/sanjay/Downloads/chk/test_66516Ee.csv")

## Missing indicator variables

In [46]:
train['Count_missing'] = np.where(train['Count_3-6_months_late'].isnull(), 'yes', 'no')
test['Count_missing'] = np.where(test['Count_3-6_months_late'].isnull(), 'yes', 'no')
train['application_underwriting_score_missing'] = np.where(train['application_underwriting_score'].isnull(), 'yes', 'no')
test['application_underwriting_score_missing'] = np.where(test['application_underwriting_score'].isnull(), 'yes', 'no')

In [47]:
train['Perc_3-6_months_late'] = train['Count_3-6_months_late']/train['no_of_premiums_paid']
train['Perc_6-12_months_late'] = train['Count_6-12_months_late']/train['no_of_premiums_paid']
train['Perc_more_than_12_months_late'] = train['Count_more_than_12_months_late']/train['no_of_premiums_paid']

In [48]:
test['Perc_3-6_months_late'] = test['Count_3-6_months_late']/test['no_of_premiums_paid']
test['Perc_6-12_months_late'] = test['Count_6-12_months_late']/test['no_of_premiums_paid']
test['Perc_more_than_12_months_late'] = test['Count_more_than_12_months_late']/test['no_of_premiums_paid']

## Preprocessing

In [49]:
selected_cols = set(train.columns) - set(['renewal','id'])

In [50]:
numeric_cols = list(set(list(train._get_numeric_data())).intersection(selected_cols))
categorical_cols = list(selected_cols - set(numeric_cols))

In [51]:
class DFMissingNum(TransformerMixin):
    '''
    Replaces missing values by input value or method.Below are the methods available. 
    'mean': replace missing values using the mean.
    'median': replace missing values using the median
    'most_frequent': replace missing values using the mode
    'backfill' or 'bfill': use NEXT valid observation to fill gap.
    'pad' or 'ffill': propagate last valid observation forward to next valid.
    Numeric value: Replaces with the input value
    Ex: repalce = ""mean"" for replacing with mean, replace = 0 for replacing with the numeric 0
    Note: No quotes for numeric values
    '''
    def __init__(self,replace):
        self.replace = replace
        self.imp = None
        self.statistics_ = None        
        
    def fit(self,X,y=None): 
         
        if type(self.replace) == dict:
            for key, value in self.replace.iteritems():
                if value in ['mean','median','most_frequent']:
                    self.replace[key] = _Impute(value= value,S=X[key])
            
        elif self.replace in ['mean','median','most_frequent']:
            self.imp = DFImputer(strategy=self.replace)
            self.imp.fit(X)
            self.statistics_ = pd.Series(self.imp.statistics_, index=X.columns)
        return self    
    
    def transform(self,X):
        if self.replace in ['mean','median','most_frequent']:
            Ximp = self.imp.transform(X)
            X_replaced = pd.DataFrame(Ximp, index=X.index, columns=X.columns)
        
        elif self.replace in ['backfill','bfill','pad','ffill']:
            X_replaced = X.fillna(method=self.replace)
        
        elif type(self.replace) == dict:
            X_replaced = X.copy()
            for key, value in self.replace.iteritems():
                if value in ['backfill','bfill','pad','ffill']:
                    X_replaced[key] = X_replaced[key].fillna(method=value)
                else:
                    X_replaced[key] = X_replaced[key].fillna(value=value)
        else:
            X_replaced = X.fillna(value=self.replace)
        return X_replaced

class DFMissingStr(TransformerMixin):
    '''
    METHODS
    most_frequent:
    backfill/bfill:
    pad/ffill:
    '''
    def __init__(self,replace):
        self.replace = replace
        self.statistics_ = None
    def fit(self,X,y=None):
        
        if type(self.replace) == dict:
            for key, value in self.replace.iteritems():
                if value == 'most_frequent':
                    self.replace[key] = X[key].mode()[0]
        elif self.replace =='most_frequent':
            self.statistics_= X.mode().to_dict()
            for key,value in self.statistics_.items():
                self.statistics_[key] = value.values()[0]

        return self    
    def transform(self,X):
        if self.replace == 'most_frequent':
            X_replaced= X.fillna(self.statistics_)
        
        elif self.replace in ['backfill','bfill','pad','ffill']:
            X_replaced = X.fillna(method=self.replace)
            
        elif type(self.replace) == dict:
            X_replaced = X.copy()
            for key, value in self.replace.iteritems():
                if value in ['backfill','bfill','pad','ffill']:
                    X_replaced[key] = X_replaced[key].fillna(method=value)
                else:
                    X_replaced[key] = X_replaced[key].fillna(value=value)
        else:
            X_replaced = X.fillna(value=self.replace)
        return X_replaced

class DFOneHot(TransformerMixin):
    '''
    dummy_na: Unseeen values Boolean
    reference - https://pandas.pydata.org/pandas-docs/stable/generated/pandas.get_dummies.html
    '''
    
    def __init__(self,dummy_na=False):
        self.dummy_na=dummy_na
        self.categories_ = {}
    
    def fit(self, X, y=None):
        for col in list(X):
            self.categories_[col] = X[col].unique()
        return self
    
    def transform(self,X):
        X_new = X.copy()
        for colname, levels in self.categories_.iteritems():
            X_new[colname][~X_new[colname].isin(levels)] = np.NaN
        X_dummy = pd.get_dummies(X_new,dummy_na=self.dummy_na)
        
        fit_colnames = []
        for colname, levels in self.categories_.iteritems():
            for level in levels:
                fit_colnames.append(str(colname)+'_'+str(level))
        
        X_dummy = X_dummy.reindex(columns= fit_colnames, fill_value= 0)
        
        return X_dummy


class ColumnExtractor(TransformerMixin):

    def __init__(self, cols):
        self.cols = cols

    def fit(self, X, y=None):
        # stateless transformer
        return self

    def transform(self, X):
        # assumes X is a DataFrame
        Xcols = pd.DataFrame(X[self.cols])
        return Xcols

class DFFeatureUnion(TransformerMixin):
    # FeatureUnion but for pandas DataFrames

    def __init__(self, transformer_list):
        self.transformer_list = transformer_list
        print "Feature union successfully initiated."

    def fit(self, X, y=None):
        for (name, t) in self.transformer_list:
            t.fit(X, y)

        print "Feature union - successful fit."
        return self

    def transform(self, X):
        # assumes X is a DataFrame
        Xts = [t.transform(X) for _, t in self.transformer_list]
        Xunion = reduce(lambda X1, X2: pd.merge(X1, X2, left_index=True, right_index=True), Xts)
        print "Feature union - successful transform."
        return Xunion

from sklearn.preprocessing import Imputer
class DFImputer(TransformerMixin):
    # Imputer but for pandas DataFrames

    def __init__(self, strategy='mean'):
        self.strategy = strategy
        self.imp = None
        self.statistics_ = None

    def fit(self, X, y=None):
        self.imp = Imputer(strategy=self.strategy)
        self.imp.fit(X)
        self.statistics_ = pd.Series(self.imp.statistics_, index=X.columns)
        return self

    def transform(self, X):
        # assumes X is a DataFrame
        Ximp = self.imp.transform(X)
        Xfilled = pd.DataFrame(Ximp, index=X.index, columns=X.columns)
        return Xfilled


from sklearn.pipeline import Pipeline



In [52]:
preprocess = Pipeline([("features",DFFeatureUnion([
    ("numeric",Pipeline([("num_sel",ColumnExtractor(numeric_cols)),("num_impute",DFMissingNum(replace='median'))])),
    ("categorical",Pipeline([("cat_sel",ColumnExtractor(categorical_cols)),("str_impute",DFMissingStr(replace='most_frequent')),("one_hot",DFOneHot())]))
]))])

raw_train = preprocess.fit_transform(train)

raw_test = preprocess.transform(test)

Feature union successfully initiated.
Feature union - successful fit.
Feature union - successful transform.
Feature union - successful transform.


In [53]:
# Selected based on feature importance from a baseline model
round1_features = ['age_in_days',
'Income',
'application_underwriting_score',
'perc_premium_paid_by_cash_credit',
'no_of_premiums_paid',
'Count_3-6_months_late',
'Count_6-12_months_late',
'Count_more_than_12_months_late',
'Perc_3-6_months_late',
'Perc_6-12_months_late',
'Perc_more_than_12_months_late']

In [54]:
processed_train = raw_train[round1_features]
processed_test = raw_test[round1_features]

# Modeling

In [55]:
X = processed_train.values

In [56]:
y = train['renewal']

## Decision Tree

In [57]:
from sklearn.tree import DecisionTreeClassifier

tree_cls = DecisionTreeClassifier(max_depth=2)
# scores = cross_val_score(tree_cls, X, y, scoring='roc_auc', cv=10)
# print("AUC: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

In [58]:
tree_cls.fit(X,y)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=2,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [59]:
# for name, importance in zip(processed_train.columns, tree_cls.feature_importances_):
#     print(name, importance)

## Random Forest

In [60]:
from sklearn.ensemble import RandomForestClassifier
# rf_cls = RandomForestClassifier(max_depth=2, random_state=0, class_weight="balanced")

In [61]:
# scores = cross_val_score(rf_cls, X, y, scoring='roc_auc', cv=10)
# print("AUC: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

In [62]:
rf_cls = RandomForestClassifier(**{'bootstrap': True,
 'max_depth': 2,
 'max_features': 'sqrt',
 'min_samples_leaf': 2,
 'min_samples_split': 5,
 'n_estimators': 200})

In [63]:
# scores = cross_val_score(rf_cls, X, y, scoring='roc_auc', cv=10)
# print("AUC: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

In [64]:
rf_cls.fit(X,y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=2, max_features='sqrt', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=2, min_samples_split=5,
            min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [65]:
# for name, importance in zip(processed_train.columns, rf_cls.feature_importances_):
#     print(name, importance)

## Boosted Trees

In [66]:
from xgboost import XGBClassifier
xgb_cls = XGBClassifier(max_depth=3, n_estimators=200)

# scores = cross_val_score(xgb_cls, X, y, scoring='roc_auc', cv=10)
# print("AUC: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

In [67]:
xgb_cls.fit(X,y)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=200,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [68]:
# for name, importance in zip(processed_train.columns, xgb_cls.feature_importances_):
#     print(name, importance)

## Logistic Regression

In [69]:
from sklearn.linear_model import LogisticRegression
logistic_cls = LogisticRegression(class_weight="balanced")

In [70]:
logistic_cls.fit(X,y)

LogisticRegression(C=1.0, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [71]:
# scores = cross_val_score(logistic_cls, X, y, scoring='roc_auc', cv=10)
# print("AUC: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

## SVM Classifier -- takes too long

In [72]:
# from sklearn.svm import SVC

In [73]:
# svm_cls = SVC(class_weight="balanced", probability=True)

In [74]:
# scores = cross_val_score(svm_cls, X, y, scoring='roc_auc', cv=3)
# print("AUC: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

## Ensemble

In [75]:
# tree_cls.fit(X,y)
tree_predictions_train = [x[1] for x in tree_cls.predict_proba(processed_train.values)]
tree_predictions_test = [x[1] for x in tree_cls.predict_proba(processed_test.values)]

# rf_cls.fit(X,y)
rf_predictions_train = [x[1] for x in rf_cls.predict_proba(processed_train.values)]
rf_predictions_test = [x[1] for x in rf_cls.predict_proba(processed_test.values)]

# xgb_cls.fit(X, y)
xgb_predictions_train = [x[1] for x in xgb_cls.predict_proba(processed_train.values)]
xgb_predictions_test = [x[1] for x in xgb_cls.predict_proba(processed_test.values)]

# logistic_cls.fit(X,y)
logistic_predictions_train = [x[1] for x in logistic_cls.predict_proba(processed_train.values)]
logistic_predictions_test = [x[1] for x in logistic_cls.predict_proba(processed_test.values)]

# svm_cls.fit(X,y)
# svm_predictions_train = [x[1] for x in svm_cls.predict_proba(processed_train.values)]
# svm_predictions_test = [x[1] for x in svm_cls.predict_proba(processed_test.values)]

In [76]:
raw_train['tree_pred'] = tree_predictions_train
raw_train['rf_pred'] = rf_predictions_train
raw_train['xgb_pred'] = xgb_predictions_train
raw_train['logistic_pred'] = logistic_predictions_train
# raw_train['svm_pred'] = svm_predictions_train

In [77]:
raw_test['tree_pred'] = tree_predictions_test
raw_test['rf_pred'] = rf_predictions_test
raw_test['xgb_pred'] = xgb_predictions_test
raw_test['logistic_pred'] = logistic_predictions_test
# raw_test['svm_pred'] = svm_predictions_test

In [78]:
raw_train.to_csv('/home/sanjay/Downloads/chk/train_after_ens_sat.csv', index= False)
raw_test.to_csv('/home/sanjay/Downloads/chk/test_after_ens_sat.csv', index= False)

In [79]:
X = raw_train.values

In [80]:
scores = cross_val_score(xgb_cls, X, y, scoring='roc_auc', cv=10)
print("AUC: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

AUC: 0.86 (+/- 0.01)


In [81]:
scores

array([0.85925073, 0.86298945, 0.84715896, 0.86313425, 0.86382821,
       0.84973707, 0.85820668, 0.86121136, 0.85579415, 0.84354721])

In [None]:
### Hyperparameter tuning

In [82]:
##### Iterative process - tune alpha (first model was overfitting); fix learning rate; try more complex model 

In [119]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
import matplotlib
matplotlib.use('Agg')
from matplotlib import pyplot

# grid search - alpha
model = XGBClassifier(max_depth=4, n_estimators=400, learning_rate=0.01)
# n_estimators = [50, 100, 150, 200]
# max_depth = [2, 4, 6, 8]
reg_alpha=[1e-2, 0.1, 0.5, 0.8, 1, 1.5, 2, 5, 10]

param_grid = dict(reg_alpha=reg_alpha)
kfold = KFold(n_splits=10, shuffle=True, random_state=7)
grid_search = GridSearchCV(model, param_grid, n_jobs=-1, cv=kfold, verbose=1, scoring='roc_auc')
grid_result = grid_search.fit(X, y)

# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Fitting 10 folds for each of 9 candidates, totalling 90 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  8.9min
[Parallel(n_jobs=-1)]: Done  90 out of  90 | elapsed: 18.2min finished


Best: 0.857145 using {'reg_alpha': 5}
0.856959 (0.008124) with: {'reg_alpha': 0.01}
0.856970 (0.008124) with: {'reg_alpha': 0.1}
0.856995 (0.008052) with: {'reg_alpha': 0.5}
0.856945 (0.008106) with: {'reg_alpha': 0.8}
0.856962 (0.008213) with: {'reg_alpha': 1}
0.857073 (0.008166) with: {'reg_alpha': 1.5}
0.857115 (0.008200) with: {'reg_alpha': 2}
0.857145 (0.008234) with: {'reg_alpha': 5}
0.857119 (0.008316) with: {'reg_alpha': 10}


In [120]:
grid_result.best_params_

{'reg_alpha': 5}

In [85]:
xgb_cls = XGBClassifier(max_depth=3, n_estimators=200, reg_alpha=1)
xgb_cls.fit(X, y)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=200,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=1, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [109]:
xgb_cls = XGBClassifier(max_depth=3, n_estimators=400, reg_alpha=1, learning_rate=0.01)
scores = cross_val_score(xgb_cls, X, y, scoring='roc_auc', cv=10)
print("AUC: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

AUC: 0.86 (+/- 0.01)


In [110]:
# grid search - tree parameters
model = XGBClassifier(reg_alpha=1, learning_rate=0.01)
n_estimators = [50, 100, 150, 200, 400]
max_depth = [2, 4, 6, 8]
# reg_alpha=[1e-5, 1e-2, 0.1, 1, 100]

param_grid = dict(n_estimators=n_estimators, max_depth=max_depth)
kfold = KFold(n_splits=10, shuffle=True, random_state=7)
grid_search = GridSearchCV(model, param_grid, n_jobs=-1, cv=kfold, verbose=1, scoring='roc_auc')
grid_result = grid_search.fit(X, y)

# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Fitting 10 folds for each of 20 candidates, totalling 200 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  2.7min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed: 35.3min
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed: 40.5min finished


Best: 0.856962 using {'n_estimators': 400, 'max_depth': 4}
0.837788 (0.014289) with: {'n_estimators': 50, 'max_depth': 2}
0.842172 (0.009454) with: {'n_estimators': 100, 'max_depth': 2}
0.850623 (0.007930) with: {'n_estimators': 150, 'max_depth': 2}
0.853883 (0.008437) with: {'n_estimators': 200, 'max_depth': 2}
0.856314 (0.008258) with: {'n_estimators': 400, 'max_depth': 2}
0.853347 (0.007586) with: {'n_estimators': 50, 'max_depth': 4}
0.855450 (0.007739) with: {'n_estimators': 100, 'max_depth': 4}
0.856421 (0.007928) with: {'n_estimators': 150, 'max_depth': 4}
0.856787 (0.008123) with: {'n_estimators': 200, 'max_depth': 4}
0.856962 (0.008213) with: {'n_estimators': 400, 'max_depth': 4}
0.853638 (0.007860) with: {'n_estimators': 50, 'max_depth': 6}
0.855744 (0.007910) with: {'n_estimators': 100, 'max_depth': 6}
0.856326 (0.007952) with: {'n_estimators': 150, 'max_depth': 6}
0.856485 (0.007921) with: {'n_estimators': 200, 'max_depth': 6}
0.856037 (0.008348) with: {'n_estimators': 400, 

In [121]:
xgb_cls = XGBClassifier(max_depth=4, n_estimators=400, reg_alpha=5, learning_rate=0.01)
xgb_cls.fit(X, y)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.01, max_delta_step=0,
       max_depth=4, min_child_weight=1, missing=None, n_estimators=400,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=5, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [122]:
predictions = xgb_cls.predict_proba(raw_test.values)
predictions_val = xgb_cls.predict(raw_test.values)

final_predictions = [x[1] for x in predictions]

processed_test['id'] = test['id']
df_out = raw_test.copy()
df_out['renewal'] = final_predictions
df_out['guess'] = predictions_val

  if diff:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [123]:
df_out.to_csv('/home/sanjay/Downloads/chk/predictions_baseline_new_sat_2.csv', index= False)

In [126]:
# df_out.head()

Unnamed: 0,Perc_more_than_12_months_late,premium,age_in_days,Perc_3-6_months_late,application_underwriting_score,Count_6-12_months_late,Income,Count_more_than_12_months_late,Count_3-6_months_late,perc_premium_paid_by_cash_credit,...,sourcing_channel_D,sourcing_channel_E,application_underwriting_score_missing_no,application_underwriting_score_missing_yes,tree_pred,rf_pred,xgb_pred,logistic_pred,renewal,guess
0,0.0,3300.0,27384.0,0.0,99.89,0.0,51150.0,0.0,0.0,0.001,...,0,0,1,0,0.974798,0.963705,0.992822,0.884125,0.98569,1
1,0.0,11700.0,23735.0,0.0,98.93,0.0,285140.0,0.0,0.0,0.124,...,0,0,1,0,0.974798,0.963705,0.983847,0.765156,0.978429,1
2,0.0,11700.0,17170.0,0.0,99.21,0.0,186030.0,0.0,0.0,1.0,...,0,0,0,1,0.888877,0.922661,0.811631,0.456192,0.728201,1
3,0.0,5400.0,16068.0,0.0,99.0,0.0,123540.0,0.0,0.0,0.198,...,0,0,1,0,0.974798,0.963021,0.974434,0.718838,0.967882,1
4,0.0,9600.0,10591.0,0.071429,99.17,0.0,200020.0,0.0,1.0,0.041,...,0,0,1,0,0.974798,0.954307,0.967848,0.582151,0.967647,1


# Part 2 - Optimizing Agent Incentives 

In [124]:
from math import exp
import numpy as np
def total_revenue(incentive_array, predicted_df=df_out):
    delta_p_array = [0.2*(1-exp(-(10*(1-exp(-incentive/400)))/5)) for incentive in incentive_array]
    predicted_df['incentive'] = incentive_array
    predicted_df['delta_p'] = delta_p_array
    predicted_df['revenue'] = predicted_df.apply(lambda row: row['guess']*(row['incentive'] - (row['delta_p']*row['premium']))
                                                 , axis=1)
    return predicted_df['revenue'].sum(), np.asarray(predicted_df['revenue'].tolist())

In [125]:
def positive_numbers(**kwargs):
    x = kwargs['x_new']
    tmax = True
    tmin = bool(np.all(x >= 0))
    return tmax and tmin

In [127]:
incentive_list = pd.read_csv("/home/sanjay/Downloads/chk/submission_baseline_sat_1pm.csv")['incentives'].tolist()
guess_list = df_out['guess'].tolist()
incentive_array = np.asarray([a*b for a,b in zip(incentive_list,guess_list)])

In [128]:
from scipy.optimize import basinhopping
minimizer_kwargs = {"method":"L-BFGS-B", "jac":True}
ret = basinhopping(total_revenue, incentive_array, minimizer_kwargs=minimizer_kwargs, stepsize=100,
                   niter=200, accept_test=positive_numbers,seed=123)

assured_revenue = df_out.apply(lambda row: (row['renewal']*row['premium']), axis=1).sum()
print("global minimum: {}".format(ret.fun-assured_revenue))

global minimum: -356061831.141


In [129]:
assured_revenue

347623000.1753256

In [130]:
df_out['incentives'] = ret.x

In [131]:
df_out['id'] = test['id']

In [132]:
final_submission = df_out[['id','renewal','incentives']]

In [133]:
final_submission.head()

Unnamed: 0,id,renewal,incentives
0,649,0.98569,1527.42893
1,81136,0.978429,2163.968908
2,70762,0.728201,0.0
3,53935,0.967882,1612.129802
4,15476,0.967647,1232.456119


In [134]:
final_submission.to_csv('/home/sanjay/Downloads/chk/submission_baseline_sat_3pm.csv', index= False)