In [86]:
import pandas as pd
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss, accuracy_score

In [87]:
pd.set_option('display.max_columns', None)

In [88]:
df = pd.read_csv('insurance_prediction_training.csv')

In [89]:
df.head()

Unnamed: 0,id,region,village,age,agpop,rice_inc,ricearea_2010,general_trust,educ,educ_good,male,disaster_loss,disaster_yes,risk_averse,literacy,age_missing,agpop_missing,rice_inc_missing,ricearea_2010_missing,disaster_loss_missing,educ_missing,male_missing,literacy_missing,takeup
0,1,1,21,54,2,20,2.4,1,2.0,1,1,0,0,0.4,1,0,0,0,0,1,0,0,0,1
1,2,1,21,73,2,100,2.3,1,1.0,0,1,0,0,0.0,1,0,0,0,0,1,0,0,0,1
2,3,1,21,72,10,80,12.0,1,1.0,0,1,0,0,0.0,1,0,0,0,0,1,0,0,0,1
3,4,1,21,43,4,20,4.0,1,2.0,1,1,0,1,0.0,1,0,0,0,0,1,0,0,0,0
4,5,1,21,63,6,90,14.0,1,1.0,0,1,0,1,0.0,1,0,0,0,0,1,0,0,0,0


In [90]:
df.isna().sum()

id                        0
region                    0
village                   0
age                       0
agpop                     0
rice_inc                  0
ricearea_2010             0
general_trust             0
educ                     23
educ_good                 0
male                      0
disaster_loss             0
disaster_yes              0
risk_averse               0
literacy                  0
age_missing               0
agpop_missing             0
rice_inc_missing          0
ricearea_2010_missing     0
disaster_loss_missing     0
educ_missing              0
male_missing              0
literacy_missing          0
takeup                    0
dtype: int64

In [91]:
df = df.drop(['educ'], axis = 1) #educ has missing values in test and train dataset

In [92]:
X = df.drop(['takeup'], axis = 1)
y = df.takeup

In [93]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=.2)

In [94]:
def one_hot_encode(data, col):
    encoded_data = pd.get_dummies(data, columns=[col])
    return encoded_data

In [95]:
X_train = one_hot_encode(X_train, 'region')

In [96]:
#X_train = one_hot_encode(X_train, 'village')

In [97]:
X_train.head()

Unnamed: 0,id,village,age,agpop,rice_inc,ricearea_2010,general_trust,educ_good,male,disaster_loss,disaster_yes,risk_averse,literacy,age_missing,agpop_missing,rice_inc_missing,ricearea_2010_missing,disaster_loss_missing,educ_missing,male_missing,literacy_missing,region_1,region_2,region_3
2601,2602,13,39,6,100,35.0,1,1,1,5,1,1.0,1,0,0,0,0,0,0,0,0,0,1,0
413,414,20,51,6,40,20.0,1,1,1,30,1,0.0,1,0,0,0,0,0,0,0,0,1,0,0
4552,4553,45,64,9,30,6.0,1,0,1,40,1,0.0,1,0,0,0,0,0,0,0,0,0,0,1
2271,2272,13,46,5,90,15.0,1,1,1,20,1,0.0,1,0,0,0,0,0,0,0,0,0,1,0
3826,3827,24,44,3,100,23.0,1,0,1,20,1,0.0,1,0,0,0,0,0,0,0,0,0,1,0


In [98]:
X_train.shape

(3921, 24)

In [99]:
#check if sum is 0 for any column to see if it is redundant
for col in X.columns:
    print(col, sum(X[col]))

id 12017253
region 9025
village 129568
age 248616
agpop 24496
rice_inc 361926
ricearea_2010 63402.9499992967
general_trust 4389
educ_good 1771
male 4287
disaster_loss 88985
disaster_yes 3174
risk_averse 933.199993506074
literacy 3713
age_missing 1
agpop_missing 7
rice_inc_missing 74
ricearea_2010_missing 31
disaster_loss_missing 1912
educ_missing 23
male_missing 2
literacy_missing 23


In [100]:
X_train.isna().sum()

id                       0
village                  0
age                      0
agpop                    0
rice_inc                 0
ricearea_2010            0
general_trust            0
educ_good                0
male                     0
disaster_loss            0
disaster_yes             0
risk_averse              0
literacy                 0
age_missing              0
agpop_missing            0
rice_inc_missing         0
ricearea_2010_missing    0
disaster_loss_missing    0
educ_missing             0
male_missing             0
literacy_missing         0
region_1                 0
region_2                 0
region_3                 0
dtype: int64

In [101]:
test = pd.read_csv('insurance_prediction_to_predict.csv')
print(sum(test.educ_missing))
print(sum(test.literacy_missing))
print(sum(test.age_missing))

152
152
23


In [102]:
X_train = X_train.drop(columns = ['literacy_missing', 'educ_missing', 'age_missing'])

In [103]:
X_train.corr()

Unnamed: 0,id,village,age,agpop,rice_inc,ricearea_2010,general_trust,educ_good,male,disaster_loss,disaster_yes,risk_averse,literacy,agpop_missing,rice_inc_missing,ricearea_2010_missing,disaster_loss_missing,male_missing,region_1,region_2,region_3
id,1.0,0.020605,-0.003531,-0.007676,0.015015,-0.009694,0.002385,0.015196,-0.018433,0.030612,0.024497,0.010882,0.000331,0.004564,0.000908,0.0262,-0.03249,-0.022563,-0.056831,0.00734,0.056108
village,0.020605,1.0,0.044163,0.068265,-0.045907,-0.01539,-0.037008,-0.021708,0.142227,-0.095572,-0.062964,0.038295,0.053922,0.018367,-0.024158,0.012938,0.081621,-0.00813,0.168275,-0.108856,-0.075286
age,-0.003531,0.044163,1.0,0.132966,-0.042628,-0.190446,0.064488,-0.313845,0.10866,-0.069154,-0.159506,-0.109604,-0.311413,0.000645,-0.024529,-0.003166,0.142284,-0.007877,0.168903,-0.201003,0.020099
agpop,-0.007676,0.068265,0.132966,1.0,-0.129337,0.02179,0.000993,0.002203,-0.00102,-0.016438,-0.020678,-0.007205,-0.046577,-0.000339,0.018229,0.031912,0.043607,-0.007585,0.084994,-0.081854,-0.010005
rice_inc,0.015015,-0.045907,-0.042628,-0.129337,1.0,0.218078,-0.024641,-0.080212,-0.072826,0.091148,0.19038,-0.046713,-0.087315,0.008025,0.02654,0.041334,-0.168538,0.009346,-0.211652,0.274727,-0.049014
ricearea_2010,-0.009694,-0.01539,-0.190446,0.02179,0.218078,1.0,0.022816,0.050419,0.013385,0.008026,0.125044,0.083572,0.08792,-0.00507,-0.010819,-0.010899,-0.044814,0.029308,-0.031837,0.126254,-0.095935
general_trust,0.002385,-0.037008,0.064488,0.000993,-0.024641,0.022816,1.0,5e-06,-0.057851,0.001773,-0.031317,0.014148,0.017946,0.012393,-0.022634,0.026641,0.029979,-0.046052,-0.069476,0.02005,0.057042
educ_good,0.015196,-0.021708,-0.313845,0.002203,-0.080212,0.050419,5e-06,1.0,0.177901,-0.0322,0.034794,0.115795,0.429044,0.047291,0.051752,-0.009435,-0.016422,-0.012068,-0.052972,0.063448,-0.00673
male,-0.018433,0.142227,0.10866,-0.00102,-0.072826,0.013385,-0.057851,0.177901,1.0,-0.119078,-0.072533,0.054466,0.331751,0.013614,0.005556,-0.010888,0.099954,0.006085,0.270869,-0.224303,-0.070006
disaster_loss,0.030612,-0.095572,-0.069154,-0.016438,0.091148,0.008026,0.001773,-0.0322,-0.119078,1.0,0.644051,-0.079023,-0.083805,0.038002,-0.041031,-0.031025,-0.697214,0.009286,-0.317961,0.135715,0.215218


So educ_missing and literacy_missing have zero values in training data so do we remove them? But 152, nonzero values in test

In [104]:
model = RandomForestClassifier(random_state=1)

In [105]:
params = {
    'n_estimators': [150, 175, 200, 225, 250],
    'max_depth': [35, 40, 45, 50, 55],
    'min_samples_split': [2, 5, 10, 15, 20],
    'min_samples_leaf': [1, 2, 4],
}

In [106]:
# Dataset will be divided into 5 folds (or subsets). 
# During cross-validation, the model will be trained and evaluated 5 times
# Data will be randomly shuffled before splitting into folds.
skf = StratifiedKFold(n_splits=5, shuffle = True, random_state = 1001)

In [107]:
random_search = RandomizedSearchCV(model, param_distributions=params, n_iter=500, scoring='neg_log_loss', n_jobs=4, cv=skf.split(X_train,y_train))

In [108]:
random_search.fit(X_train,y_train)



In [109]:
best_params = random_search.best_params_
best_params

{'n_estimators': 150,
 'min_samples_split': 2,
 'min_samples_leaf': 1,
 'max_depth': 35}

In [110]:
X_test = one_hot_encode(X_test, 'region')
#X_test = one_hot_encode(X_test, 'village')
X_test = X_test.drop(columns = ['literacy_missing', 'educ_missing', 'age_missing'])

In [112]:
y_pred = random_search.predict(X_test)

In [113]:
accuracy = accuracy_score(y_test, y_pred)
accuracy

0.9520897043832823

In [114]:
log_loss_score = log_loss(y_test, random_search.predict_proba(X_test))
log_loss_score #y_test (20% of training dataset given)

0.21676389390801826

In [115]:
test = pd.read_csv('insurance_prediction_to_predict.csv')

In [116]:
test.isna().sum()

id                         0
region                     0
village                    0
age                        0
agpop                      0
rice_inc                   0
ricearea_2010              0
general_trust              0
educ                     152
educ_good                  0
male                       0
disaster_loss              0
disaster_yes               0
risk_averse                0
literacy                   0
age_missing                0
agpop_missing              0
rice_inc_missing           0
ricearea_2010_missing      0
disaster_loss_missing      0
educ_missing               0
male_missing               0
literacy_missing           0
dtype: int64

In [117]:
test.shape

(9805, 23)

In [118]:
X.educ_missing.sum()

23

In [119]:
test.educ_missing.sum()

152

In [120]:
test = test.drop(columns = ['literacy_missing', 'educ_missing', 'age_missing', 'educ'])
test = one_hot_encode(test, 'region')
#test = one_hot_encode(test, 'village')

In [121]:
results = random_search.predict_proba(test)

In [122]:
results

array([[0.45333333, 0.54666667],
       [0.56      , 0.44      ],
       [0.32      , 0.68      ],
       ...,
       [0.68666667, 0.31333333],
       [0.33333333, 0.66666667],
       [0.60666667, 0.39333333]])

In [123]:
len(results)

9805

In [124]:
random_search.predict(test)

array([1, 0, 1, ..., 0, 1, 0])

In [125]:
results_dict = {
    'id': test.id,
    'takeup': results[:,1]
}

In [126]:
results_df = pd.DataFrame(results_dict)

In [127]:
results_df.to_csv('submission_rf_ohe_region.csv', header = True, index = False)