In [1]:
import pandas as pd
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss, accuracy_score

In [2]:
# Show every column when a DataFrame is displayed (no column truncation).
pd.options.display.max_columns = None

In [3]:
# Load the labelled training data from a CSV file given by relative path.
df = pd.read_csv(filepath_or_buffer='insurance_prediction_training.csv')

In [4]:
# Preview the first five rows of the training frame.
df.head(n=5)

Unnamed: 0,id,region,village,age,agpop,rice_inc,ricearea_2010,general_trust,educ,educ_good,male,disaster_loss,disaster_yes,risk_averse,literacy,age_missing,agpop_missing,rice_inc_missing,ricearea_2010_missing,disaster_loss_missing,educ_missing,male_missing,literacy_missing,takeup
0,1,1,21,54,2,20,2.4,1,2.0,1,1,0,0,0.4,1,0,0,0,0,1,0,0,0,1
1,2,1,21,73,2,100,2.3,1,1.0,0,1,0,0,0.0,1,0,0,0,0,1,0,0,0,1
2,3,1,21,72,10,80,12.0,1,1.0,0,1,0,0,0.0,1,0,0,0,0,1,0,0,0,1
3,4,1,21,43,4,20,4.0,1,2.0,1,1,0,1,0.0,1,0,0,0,0,1,0,0,0,0
4,5,1,21,63,6,90,14.0,1,1.0,0,1,0,1,0.0,1,0,0,0,0,1,0,0,0,0


In [5]:
# Number of missing values in each column.
df.isnull().sum()

id                        0
region                    0
village                   0
age                       0
agpop                     0
rice_inc                  0
ricearea_2010             0
general_trust             0
educ                     23
educ_good                 0
male                      0
disaster_loss             0
disaster_yes              0
risk_averse               0
literacy                  0
age_missing               0
agpop_missing             0
rice_inc_missing          0
ricearea_2010_missing     0
disaster_loss_missing     0
educ_missing              0
male_missing              0
literacy_missing          0
takeup                    0
dtype: int64

In [6]:
# Discard every row that contains at least one missing value.
df = df.dropna(axis=0, how='any')

In [7]:
# `takeup` is the label; every other column is kept as a feature.
y = df['takeup']
X = df.drop(columns=['takeup'])

In [8]:
# Hold out 20% of the rows for evaluation.
# random_state pins the shuffle so the split (and every downstream score) is
# reproducible after a kernel restart; stratify=y keeps the takeup class ratio
# the same in the training and held-out partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1, stratify=y
)

In [9]:
def one_hot_encode(data, col, expected_columns=None):
    """One-hot encode a single column of a DataFrame.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing ``col``. It is not modified; a new frame is returned.
    col : str
        Name of the column to expand into indicator columns named
        ``<col>_<value>``. The original column is removed from the result.
    expected_columns : sequence of str, optional
        When given, the encoded frame is re-indexed to exactly these columns,
        in this order. Columns absent from the encoded frame are added and
        filled with 0; columns not listed are dropped. Pass the training
        frame's columns here when encoding a held-out or prediction frame, so
        a category missing from that frame cannot change the feature layout.
        When omitted, the result is exactly ``pd.get_dummies`` output.

    Returns
    -------
    pandas.DataFrame
        The encoded frame.
    """
    encoded_data = pd.get_dummies(data, columns=[col])
    if expected_columns is not None:
        # Separate frames can contain different category values, so force a
        # shared column layout rather than trusting get_dummies to agree.
        encoded_data = encoded_data.reindex(
            columns=list(expected_columns), fill_value=0
        )
    return encoded_data

In [10]:
# Expand `region` into indicator columns on the training features.
X_train = one_hot_encode(data=X_train, col='region')

In [11]:
#X_train = one_hot_encode(X_train, 'village')

In [12]:
# Preview the first five rows of the encoded training features.
X_train.head(n=5)

Unnamed: 0,id,village,age,agpop,rice_inc,ricearea_2010,general_trust,educ,educ_good,male,disaster_loss,disaster_yes,risk_averse,literacy,age_missing,agpop_missing,rice_inc_missing,ricearea_2010_missing,disaster_loss_missing,educ_missing,male_missing,literacy_missing,region_1,region_2,region_3
2781,2782,21,41,4,40,7.0,1,2.0,1,1,0,0,0.0,1,0,0,0,0,1,0,0,0,1,0,0
2524,2525,45,35,6,40,5.0,1,2.0,1,1,0,0,1.0,1,0,0,0,0,1,0,0,0,0,0,1
2223,2224,24,54,6,100,15.0,1,0.0,0,1,30,1,0.0,0,0,0,0,0,0,0,0,0,0,1,0
2736,2737,20,40,4,100,22.0,1,1.0,0,1,5,1,0.0,1,0,0,0,0,0,0,0,0,1,0,0
1625,1626,38,38,5,100,4.8,1,2.0,1,1,0,0,0.0,1,0,0,0,0,1,0,0,0,0,0,1


In [13]:
# (rows, columns) of the training features after `region` was one-hot encoded.
X_train.shape

(3903, 25)

In [14]:
# Check whether any feature column sums to 0; such a column is a candidate
# for removal as redundant.
# Series.sum() is vectorized; the Python builtin sum() would iterate the
# column one element at a time.
for col in X.columns:
    print(col, X[col].sum())

id 11963005
region 9000
village 128822
age 247331
agpop 24388
rice_inc 360550
ricearea_2010 63102.949999296696
general_trust 4370
educ 5761.0
educ_good 1771
male 4264
disaster_loss 88685
disaster_yes 3159
risk_averse 925.5999934822321
literacy 3690
age_missing 0
agpop_missing 6
rice_inc_missing 73
ricearea_2010_missing 30
disaster_loss_missing 1904
educ_missing 0
male_missing 2
literacy_missing 0


In [15]:
# Load the rows to be predicted and total the two missing-value flags that
# sum to 0 in the training features.
test = pd.read_csv('insurance_prediction_to_predict.csv')
for flag in ('educ_missing', 'literacy_missing'):
    print(sum(test[flag]))

152
152


In [16]:
# Remove the three missing-value flag columns from the training features.
X_train = X_train.drop(['literacy_missing', 'educ_missing', 'age_missing'], axis=1)

In [17]:
# Pairwise Pearson correlation between the remaining training features.
X_train.corr(method='pearson')

Unnamed: 0,id,village,age,agpop,rice_inc,ricearea_2010,general_trust,educ,educ_good,male,disaster_loss,disaster_yes,risk_averse,literacy,agpop_missing,rice_inc_missing,ricearea_2010_missing,disaster_loss_missing,male_missing,region_1,region_2,region_3
id,1.0,0.011228,-0.00632,7.1e-05,0.010862,-0.003372,0.024282,0.016183,0.01792,-0.025295,0.025651,0.019978,0.001053,0.003815,0.026374,0.008142,0.014129,-0.028034,-0.007613,-0.042153,-0.003863,0.050976
village,0.011228,1.0,0.0392,0.06308,-0.065339,-0.023621,-0.024569,0.00345,-0.021062,0.135781,-0.085689,-0.062482,0.03805,0.051359,0.016896,-0.012691,0.00721,0.072934,-0.011295,0.164283,-0.100414,-0.078605
age,-0.00632,0.0392,1.0,0.123659,-0.027331,-0.191429,0.051933,-0.34113,-0.320046,0.107932,-0.061797,-0.166481,-0.107621,-0.305862,0.001029,-0.018379,0.006321,0.14256,-0.010747,0.166642,-0.197348,0.019557
agpop,7.1e-05,0.06308,0.123659,1.0,-0.115065,0.018305,0.00106,-0.008838,-0.000862,0.002999,-0.003274,-0.008805,-0.008444,-0.042397,8e-05,0.024836,0.046677,0.020745,-0.010445,0.069952,-0.081041,0.006337
rice_inc,0.010862,-0.065339,-0.027331,-0.115065,1.0,0.216766,-0.013561,-0.13032,-0.083376,-0.083384,0.08991,0.179964,-0.068066,-0.091567,0.007034,0.028131,0.049459,-0.165899,0.013174,-0.209727,0.276966,-0.054343
ricearea_2010,-0.003372,-0.023621,-0.191429,0.018305,0.216766,1.0,0.024066,0.066268,0.05408,0.017173,0.016307,0.1293,0.088081,0.093393,-0.004581,-0.012429,-0.012159,-0.049694,0.041248,-0.038524,0.134432,-0.096862
general_trust,0.024282,-0.024569,0.051933,0.00106,-0.013561,0.024066,1.0,0.026706,0.007548,-0.060959,0.011905,-0.022491,0.016051,0.022229,0.011078,-0.001771,0.0294,0.012865,-0.065467,-0.077687,0.014378,0.071593
educ,0.016183,0.00345,-0.34113,-0.008838,-0.13032,0.066268,0.026706,1.0,0.850171,0.286626,-0.061822,0.009535,0.125468,0.761483,0.066504,0.066687,-0.044928,0.009196,-0.004595,0.016235,0.025749,-0.044859
educ_good,0.01792,-0.021062,-0.320046,-0.000862,-0.083376,0.05408,0.007548,0.850171,1.0,0.184681,-0.028445,0.025785,0.103665,0.423721,0.042875,0.069688,-0.025512,-0.010269,-0.016915,-0.0513,0.061281,-0.00657
male,-0.025295,0.135781,0.107932,0.002999,-0.083384,0.017173,-0.060959,0.286626,0.184681,1.0,-0.109665,-0.06445,0.057243,0.331074,0.012136,0.017852,-0.031929,0.092188,0.008579,0.27017,-0.225662,-0.066335


`educ_missing` and `literacy_missing` are all zeros in the training data, so they carry no information there. Should we remove them? Note, however, that each of them has 152 nonzero values in the test data.

In [18]:
# Base estimator handed to the hyperparameter search; only the seed is fixed
# here, every other setting is left at the library default.
model = XGBClassifier(random_state=1)

In [19]:
# Candidate values sampled by the randomized search.
params = {
    # A learning rate of 0 scales every tree's contribution to zero, so those
    # candidates would spend all their fits without learning anything.
    # The smallest candidate is therefore a small positive value.
    'learning_rate': [0.01, 0.02, 0.05, 0.07, 0.1],
    'n_estimators': [150, 175, 200, 225, 250],
    'gamma': [0.4, 0.5, 0.6, 0.7],
    'subsample': [0.9, 0.925, 0.95, 0.975],
    'colsample_bytree': [0.9, 0.925, 0.95, 0.975],
    # NOTE(review): these depths are very large for a training set of a few
    # thousand rows; confirm they are intentional.
    'max_depth': [35, 40, 45, 50, 55]
}

In [20]:
# Stratified 5-fold cross-validation: the data is cut into 5 folds, and the
# model is trained and evaluated 5 times, once per held-out fold.
# Rows are shuffled before the folds are formed; the seed fixes that shuffle.
skf = StratifiedKFold(5, shuffle=True, random_state=1001)

In [21]:
# Randomized hyperparameter search over `params`, scored by negative log loss.
# - cv=skf passes the splitter itself. `skf.split(X_train, y_train)` is a
#   one-shot generator fixed when this cell runs, so re-running fit would find
#   it already consumed; the splitter derives folds from the data given to fit().
# - random_state fixes which parameter combinations are sampled, so the search
#   is reproducible.
random_search = RandomizedSearchCV(
    estimator=model,
    param_distributions=params,
    n_iter=500,
    scoring='neg_log_loss',
    n_jobs=4,
    cv=skf,
    random_state=1001,
)

In [None]:
# Run the hyperparameter search on the training partition.
random_search.fit(X=X_train, y=y_train)

In [None]:
# Parameter combination selected by the search (displayed as the cell output).
best_params = random_search.best_params_
best_params

In [None]:
# Apply the same preprocessing to the held-out features as to X_train.
X_test = one_hot_encode(X_test, 'region')
#X_test = one_hot_encode(X_test, 'village')
X_test = X_test.drop(columns = ['literacy_missing', 'educ_missing', 'age_missing'])
# The held-out frame was encoded independently of the training frame, so force
# it into the exact training column layout: an indicator column absent here is
# added as all zeros, and the column order matches what the model was fit on.
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

In [None]:
# Class predictions for the held-out features.
y_pred = random_search.predict(X=X_test)

In [None]:
# Fraction of held-out rows whose predicted class matches the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
accuracy

In [None]:
# Log loss on the held-out split (y_test is the 20% of the given training
# dataset that was set aside).
test_probabilities = random_search.predict_proba(X_test)
log_loss_score = log_loss(y_true=y_test, y_pred=test_probabilities)
log_loss_score

In [None]:
# Load the rows to be predicted from a CSV file given by relative path.
test = pd.read_csv(filepath_or_buffer='insurance_prediction_to_predict.csv')

In [None]:
# Total of the educ_missing flag in the training features.
X['educ_missing'].sum()

In [None]:
# Total of the educ_missing flag in the rows to be predicted.
test['educ_missing'].sum()

In [None]:
# Apply the same preprocessing to the rows to be predicted as to X_train.
test = one_hot_encode(test, 'region')
#test = one_hot_encode(test, 'village')
test = test.drop(columns = ['literacy_missing', 'educ_missing', 'age_missing'])
# This file was encoded independently of the training frame, so force it into
# the exact training column layout: missing indicator columns are added as all
# zeros, columns the model never saw are dropped, and the order matches what
# the model was fit on. `id` is one of the training columns, so it is kept.
test = test.reindex(columns=X_train.columns, fill_value=0)

In [None]:
# Per-class predicted probabilities for the rows to be predicted.
results = random_search.predict_proba(X=test)

In [None]:
# Display the predicted probabilities.
results

In [None]:
# Number of rows that received a prediction.
results.shape[0]

In [None]:
# Class predictions for the rows to be predicted (displayed only, not stored).
random_search.predict(X=test)

In [None]:
# Submission payload: each row id paired with the value from the second
# predict_proba column (index 1).
results_dict = dict(id=test['id'], takeup=results[:, 1])

In [None]:
# Two-column frame (id, takeup) built from the submission payload.
results_df = pd.DataFrame(data=results_dict)

In [None]:
# Write the submission file with a header row and without the index column.
results_df.to_csv(path_or_buf='submission_xgb_ohe_region.csv', index=False, header=True)