In [24]:
import pandas as pd
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss, accuracy_score

In [2]:
df = pd.read_csv('insurance_prediction_training.csv')

In [3]:
df.head()

Unnamed: 0,id,region,village,age,agpop,rice_inc,ricearea_2010,general_trust,educ,educ_good,...,literacy,age_missing,agpop_missing,rice_inc_missing,ricearea_2010_missing,disaster_loss_missing,educ_missing,male_missing,literacy_missing,takeup
0,1,1,21,54,2,20,2.4,1,2.0,1,...,1,0,0,0,0,1,0,0,0,1
1,2,1,21,73,2,100,2.3,1,1.0,0,...,1,0,0,0,0,1,0,0,0,1
2,3,1,21,72,10,80,12.0,1,1.0,0,...,1,0,0,0,0,1,0,0,0,1
3,4,1,21,43,4,20,4.0,1,2.0,1,...,1,0,0,0,0,1,0,0,0,0
4,5,1,21,63,6,90,14.0,1,1.0,0,...,1,0,0,0,0,1,0,0,0,0


In [4]:
X = df.drop(['takeup'], axis = 1)
y = df.takeup

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=.2)

In [8]:
model = XGBClassifier()

In [9]:
params = {
        'min_child_weight': [1, 5, 10],
        'gamma': [0.5, 1, 1.5, 2, 5],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'max_depth': [3, 4, 5]
        }

In [18]:
skf = StratifiedKFold(n_splits=5, shuffle = True, random_state = 1001)

random_search = RandomizedSearchCV(model, param_distributions=params, n_iter=50, scoring='neg_log_loss', n_jobs=4, cv=skf.split(X_train,y_train), random_state=1)

In [19]:
random_search.fit(X_train,y_train)

In [21]:
y_pred = random_search.predict(X_test)

In [25]:
accuracy = accuracy_score(y_test, y_pred)
accuracy

0.872579001019368

In [26]:
log_loss_score = log_loss(y_test, random_search.predict_proba(X_test))
log_loss_score

0.33559961213847134

In [27]:
test = pd.read_csv('insurance_prediction_to_predict.csv')

In [36]:
results = random_search.predict_proba(test)

In [37]:
results

array([[0.3502491 , 0.6497509 ],
       [0.43207365, 0.56792635],
       [0.12269813, 0.8773019 ],
       ...,
       [0.955124  , 0.04487601],
       [0.2607214 , 0.7392786 ],
       [0.76832294, 0.23167703]], dtype=float32)

In [39]:
len(results)

9805

In [40]:
random_search.predict(test)

array([1, 1, 1, ..., 0, 1, 0])

In [41]:
results_dict = {
    'id': test.id,
    'takeup': results[:,1]
}

In [43]:
results_df = pd.DataFrame(results_dict)

In [45]:
results_df.to_csv('submission_xgb_random_search_predprob.csv', header = True, index = False)