In [1]:
import pandas as pd
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss, accuracy_score

In [2]:
pd.set_option('display.max_columns', None)

In [3]:
df = pd.read_csv('insurance_prediction_training.csv')

In [4]:
df.head()

Unnamed: 0,id,region,village,age,agpop,rice_inc,ricearea_2010,general_trust,educ,educ_good,male,disaster_loss,disaster_yes,risk_averse,literacy,age_missing,agpop_missing,rice_inc_missing,ricearea_2010_missing,disaster_loss_missing,educ_missing,male_missing,literacy_missing,takeup
0,1,1,21,54,2,20,2.4,1,2.0,1,1,0,0,0.4,1,0,0,0,0,1,0,0,0,1
1,2,1,21,73,2,100,2.3,1,1.0,0,1,0,0,0.0,1,0,0,0,0,1,0,0,0,1
2,3,1,21,72,10,80,12.0,1,1.0,0,1,0,0,0.0,1,0,0,0,0,1,0,0,0,1
3,4,1,21,43,4,20,4.0,1,2.0,1,1,0,1,0.0,1,0,0,0,0,1,0,0,0,0
4,5,1,21,63,6,90,14.0,1,1.0,0,1,0,1,0.0,1,0,0,0,0,1,0,0,0,0


In [5]:
df.isna().sum()

id                        0
region                    0
village                   0
age                       0
agpop                     0
rice_inc                  0
ricearea_2010             0
general_trust             0
educ                     23
educ_good                 0
male                      0
disaster_loss             0
disaster_yes              0
risk_averse               0
literacy                  0
age_missing               0
agpop_missing             0
rice_inc_missing          0
ricearea_2010_missing     0
disaster_loss_missing     0
educ_missing              0
male_missing              0
literacy_missing          0
takeup                    0
dtype: int64

In [6]:
df = df.dropna()

In [7]:
X = df.drop(['takeup'], axis = 1)
y = df.takeup

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=.2)

In [9]:
def one_hot_encode(data, col):
    encoded_data = pd.get_dummies(data, columns=[col])
    return encoded_data

In [10]:
X_train = one_hot_encode(X_train, 'region')

In [11]:
#X_train = one_hot_encode(X_train, 'village')

In [12]:
X_train.head()

Unnamed: 0,id,village,age,agpop,rice_inc,ricearea_2010,general_trust,educ,educ_good,male,disaster_loss,disaster_yes,risk_averse,literacy,age_missing,agpop_missing,rice_inc_missing,ricearea_2010_missing,disaster_loss_missing,educ_missing,male_missing,literacy_missing,region_1,region_2,region_3
182,183,36,59,12,50,15.0,1,1.0,0,1,0,0,0.0,1,0,0,0,0,1,0,0,0,1,0,0
3063,3064,24,53,4,100,8.0,1,0.0,0,1,25,1,0.0,0,0,0,0,0,0,0,0,0,0,1,0
1210,1211,45,48,4,95,7.5,1,0.0,0,1,20,1,0.0,0,0,0,0,0,0,0,0,0,0,0,1
4213,4214,21,43,4,30,7.0,1,2.0,1,1,10,1,0.2,1,0,0,0,0,0,0,0,0,1,0,0
2180,2181,36,54,7,60,7.0,1,2.0,1,1,10,1,0.0,1,0,0,0,0,0,0,0,0,1,0,0


In [13]:
X_train.shape

(3903, 25)

In [14]:
#check if sum is 0 for any column to see if it is redundant
for col in X.columns:
    print(col, sum(X[col]))

id 11963005
region 9000
village 128822
age 247331
agpop 24388
rice_inc 360550
ricearea_2010 63102.949999296696
general_trust 4370
educ 5761.0
educ_good 1771
male 4264
disaster_loss 88685
disaster_yes 3159
risk_averse 925.5999934822321
literacy 3690
age_missing 0
agpop_missing 6
rice_inc_missing 73
ricearea_2010_missing 30
disaster_loss_missing 1904
educ_missing 0
male_missing 2
literacy_missing 0


In [15]:
test = pd.read_csv('insurance_prediction_to_predict.csv')
print(sum(test.educ_missing))
print(sum(test.literacy_missing))

152
152


In [16]:
X_train = X_train.drop(columns = ['literacy_missing', 'educ_missing', 'age_missing'])

In [17]:
X_train.corr()

Unnamed: 0,id,village,age,agpop,rice_inc,ricearea_2010,general_trust,educ,educ_good,male,disaster_loss,disaster_yes,risk_averse,literacy,agpop_missing,rice_inc_missing,ricearea_2010_missing,disaster_loss_missing,male_missing,region_1,region_2,region_3
id,1.0,0.013991,-0.00374,-0.000379,0.004391,0.003333,0.018428,0.017973,0.010439,-0.011302,0.017211,0.017141,0.018139,0.016598,0.012862,-0.014995,0.022958,-0.015827,-0.007548,-0.021505,-0.021814,0.046244
village,0.013991,1.0,0.032818,0.067892,-0.051643,-0.011235,-0.029315,0.006379,-0.018612,0.139037,-0.079196,-0.04034,0.03886,0.061253,0.020127,-0.000506,0.022901,0.06048,-0.01146,0.163308,-0.094051,-0.084627
age,-0.00374,0.032818,1.0,0.130371,-0.024669,-0.207277,0.057232,-0.335821,-0.313889,0.104065,-0.06549,-0.164779,-0.096539,-0.300077,0.001125,-0.020787,-0.004477,0.148085,-0.010962,0.164413,-0.196191,0.018979
agpop,-0.000379,0.067892,0.130371,1.0,-0.108312,0.022444,0.015105,-0.018694,-0.011183,-0.009484,-0.001041,-0.007261,-0.011613,-0.043441,0.000201,0.02047,0.029667,0.026547,-0.010404,0.081184,-0.082225,-0.005665
rice_inc,0.004391,-0.051643,-0.024669,-0.108312,1.0,0.241327,-0.012352,-0.127463,-0.083144,-0.077444,0.081336,0.180795,-0.051347,-0.08741,0.008313,0.025793,0.041,-0.163892,0.013013,-0.212549,0.277464,-0.048997
ricearea_2010,0.003333,-0.011235,-0.207277,0.022444,0.241327,1.0,0.029604,0.071868,0.065028,0.002698,0.003689,0.142517,0.089335,0.098269,-0.006256,-0.010733,-0.011727,-0.050209,0.04694,-0.055208,0.171728,-0.115013
general_trust,0.018428,-0.029315,0.057232,0.015105,-0.012352,0.029604,1.0,0.016372,0.005384,-0.069433,0.008374,-0.024387,0.005247,0.00742,0.013644,-0.005864,0.025575,0.020431,-0.065118,-0.075264,0.02846,0.054281
educ,0.017973,0.006379,-0.335821,-0.018694,-0.127463,0.071868,0.016372,1.0,0.852099,0.291524,-0.063426,0.013277,0.125316,0.764122,0.080992,0.054404,-0.039506,0.001009,-0.0048,0.020221,0.02154,-0.044539
educ_good,0.010439,-0.018612,-0.313889,-0.011183,-0.083144,0.065028,0.005384,0.852099,1.0,0.186845,-0.036508,0.033072,0.105286,0.430169,0.051629,0.053781,-0.026818,-0.01732,-0.017209,-0.049134,0.05937,-0.00643
male,-0.011302,0.139037,0.104065,-0.009484,-0.077444,0.002698,-0.069433,0.291524,0.186845,1.0,-0.104611,-0.055661,0.046073,0.337521,0.014868,0.02679,-0.014414,0.082377,0.008579,0.269497,-0.227426,-0.065532


So educ_missing and literacy_missing have zero values in training data so do we remove them? But 152, nonzero values in test

In [18]:
model = RandomForestClassifier(random_state=1)

In [23]:
params = {
    'n_estimators': [150, 175, 200, 225, 250],
    'max_depth': [35, 40, 45, 50, 55],
    'min_samples_split': [2, 5, 10, 15, 20],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt'],
}

In [24]:
# Dataset will be divided into 5 folds (or subsets). 
# During cross-validation, the model will be trained and evaluated 5 times
# Data will be randomly shuffled before splitting into folds.
skf = StratifiedKFold(n_splits=5, shuffle = True, random_state = 1001)

In [25]:
random_search = RandomizedSearchCV(model, param_distributions=params, n_iter=500, scoring='neg_log_loss', n_jobs=4, cv=skf.split(X_train,y_train))

In [None]:
random_search.fit(X_train,y_train)

In [None]:
best_params = random_search.best_params_
best_params

In [24]:
X_test = one_hot_encode(X_test, 'region')
#X_test = one_hot_encode(X_test, 'village')
X_test = X_test.drop(columns = ['literacy_missing', 'educ_missing', 'age_missing'])

In [25]:
y_pred = random_search.predict(X_test)

In [26]:
accuracy = accuracy_score(y_test, y_pred)
accuracy

0.9590163934426229

In [27]:
log_loss_score = log_loss(y_test, random_search.predict_proba(X_test))
log_loss_score #y_test (20% of training dataset given)

0.1760103079070653

In [28]:
test = pd.read_csv('insurance_prediction_to_predict.csv')

In [29]:
X.educ_missing.sum()

0

In [30]:
test.educ_missing.sum()

152

In [31]:
test = one_hot_encode(test, 'region')
#test = one_hot_encode(test, 'village')
test = test.drop(columns = ['literacy_missing', 'educ_missing', 'age_missing'])

In [32]:
results = random_search.predict_proba(test)

In [33]:
results

array([[0.24694234, 0.75305766],
       [0.7792876 , 0.22071244],
       [0.12815303, 0.871847  ],
       ...,
       [0.8592297 , 0.14077035],
       [0.21370673, 0.78629327],
       [0.5662288 , 0.4337712 ]], dtype=float32)

In [34]:
len(results)

9805

In [35]:
random_search.predict(test)

array([1, 0, 1, ..., 0, 1, 0])

In [36]:
results_dict = {
    'id': test.id,
    'takeup': results[:,1]
}

In [37]:
results_df = pd.DataFrame(results_dict)

In [38]:
results_df.to_csv('submission_xgb_ohe_region.csv', header = True, index = False)