In [1]:
import pandas as pd
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss, accuracy_score

In [2]:
pd.set_option('display.max_columns', None)

In [3]:
df = pd.read_csv('insurance_prediction_training.csv')

In [4]:
df.head()

Unnamed: 0,id,region,village,age,agpop,rice_inc,ricearea_2010,general_trust,educ,educ_good,male,disaster_loss,disaster_yes,risk_averse,literacy,age_missing,agpop_missing,rice_inc_missing,ricearea_2010_missing,disaster_loss_missing,educ_missing,male_missing,literacy_missing,takeup
0,1,1,21,54,2,20,2.4,1,2.0,1,1,0,0,0.4,1,0,0,0,0,1,0,0,0,1
1,2,1,21,73,2,100,2.3,1,1.0,0,1,0,0,0.0,1,0,0,0,0,1,0,0,0,1
2,3,1,21,72,10,80,12.0,1,1.0,0,1,0,0,0.0,1,0,0,0,0,1,0,0,0,1
3,4,1,21,43,4,20,4.0,1,2.0,1,1,0,1,0.0,1,0,0,0,0,1,0,0,0,0
4,5,1,21,63,6,90,14.0,1,1.0,0,1,0,1,0.0,1,0,0,0,0,1,0,0,0,0


In [5]:
df.isna().sum()

id                        0
region                    0
village                   0
age                       0
agpop                     0
rice_inc                  0
ricearea_2010             0
general_trust             0
educ                     23
educ_good                 0
male                      0
disaster_loss             0
disaster_yes              0
risk_averse               0
literacy                  0
age_missing               0
agpop_missing             0
rice_inc_missing          0
ricearea_2010_missing     0
disaster_loss_missing     0
educ_missing              0
male_missing              0
literacy_missing          0
takeup                    0
dtype: int64

In [6]:
df = df.drop(['educ'], axis = 1) #educ has missing values in test and train dataset

In [7]:
X = df.drop(['takeup'], axis = 1)
y = df.takeup

In [60]:
y.value_counts()

0    2462
1    2440
Name: takeup, dtype: int64

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=.2)

In [61]:
y_train.value_counts()

0    1972
1    1949
Name: takeup, dtype: int64

In [9]:
def one_hot_encode(data, col):
    encoded_data = pd.get_dummies(data, columns=[col])
    return encoded_data

In [10]:
X_train = one_hot_encode(X_train, 'region')

In [11]:
#X_train = one_hot_encode(X_train, 'village')

In [12]:
X_train.head()

Unnamed: 0,id,village,age,agpop,rice_inc,ricearea_2010,general_trust,educ_good,male,disaster_loss,disaster_yes,risk_averse,literacy,age_missing,agpop_missing,rice_inc_missing,ricearea_2010_missing,disaster_loss_missing,educ_missing,male_missing,literacy_missing,region_1,region_2,region_3
1293,1294,7,38,6,100,17.0,1,1,1,0,1,0.0,1,0,0,0,0,1,0,0,0,0,0,1
3234,3235,31,68,6,60,10.0,1,0,1,0,0,0.2,0,0,0,0,0,1,0,0,0,0,1,0
2375,2376,20,63,4,100,13.0,1,0,1,30,1,1.0,1,0,0,0,0,0,0,0,0,1,0,0
2806,2807,20,70,6,70,12.0,1,0,1,50,1,0.0,0,0,0,0,0,0,0,0,0,1,0,0
4203,4204,7,57,4,100,7.0,1,0,1,70,1,0.0,0,0,0,0,0,0,0,0,0,0,0,1


In [13]:
X_train.shape

(3921, 24)

In [14]:
#check if sum is 0 for any column to see if it is redundant
for col in X.columns:
    print(col, sum(X[col]))

id 12017253
region 9025
village 129568
age 248616
agpop 24496
rice_inc 361926
ricearea_2010 63402.9499992967
general_trust 4389
educ_good 1771
male 4287
disaster_loss 88985
disaster_yes 3174
risk_averse 933.199993506074
literacy 3713
age_missing 1
agpop_missing 7
rice_inc_missing 74
ricearea_2010_missing 31
disaster_loss_missing 1912
educ_missing 23
male_missing 2
literacy_missing 23


In [15]:
X_train.isna().sum()

id                       0
village                  0
age                      0
agpop                    0
rice_inc                 0
ricearea_2010            0
general_trust            0
educ_good                0
male                     0
disaster_loss            0
disaster_yes             0
risk_averse              0
literacy                 0
age_missing              0
agpop_missing            0
rice_inc_missing         0
ricearea_2010_missing    0
disaster_loss_missing    0
educ_missing             0
male_missing             0
literacy_missing         0
region_1                 0
region_2                 0
region_3                 0
dtype: int64

In [16]:
test = pd.read_csv('insurance_prediction_to_predict.csv')
print(sum(test.educ_missing))
print(sum(test.literacy_missing))
print(sum(test.age_missing))

152
152
23


In [17]:
X_train = X_train.drop(columns = ['literacy_missing', 'educ_missing', 'age_missing'])

In [18]:
X_train.corr()

Unnamed: 0,id,village,age,agpop,rice_inc,ricearea_2010,general_trust,educ_good,male,disaster_loss,disaster_yes,risk_averse,literacy,agpop_missing,rice_inc_missing,ricearea_2010_missing,disaster_loss_missing,male_missing,region_1,region_2,region_3
id,1.0,0.022227,0.004549,0.003689,0.011718,0.012616,0.005301,0.016522,-0.033755,0.025671,0.019435,0.009036,-0.006194,0.0079,0.003247,0.024112,-0.018293,-0.007202,-0.043692,-0.018858,0.067532
village,0.022227,1.0,0.039547,0.063797,-0.045589,-0.016918,-0.030965,-0.019766,0.14483,-0.08975,-0.052455,0.037531,0.074756,0.009422,-0.008719,-0.001106,0.069656,-0.01133,0.175323,-0.095428,-0.09537
age,0.004549,0.039547,1.0,0.142211,-0.044464,-0.207949,0.054192,-0.322657,0.099042,-0.072829,-0.161088,-0.101309,-0.315175,0.000322,-0.013643,0.002177,0.144017,-0.011138,0.163455,-0.198514,0.023522
agpop,0.003689,0.063797,0.142211,1.0,-0.120155,0.024872,0.013796,-0.017597,0.000977,-0.002711,-0.005468,-0.020819,-0.061956,0.000118,0.022548,0.041104,0.026433,-0.01026,0.070387,-0.073751,-0.001913
rice_inc,0.011718,-0.045589,-0.044464,-0.120155,1.0,0.236306,-0.025109,-0.070619,-0.06177,0.07573,0.182825,-0.048792,-0.091178,0.009266,0.026849,0.047263,-0.156981,0.013125,-0.208456,0.281763,-0.059348
ricearea_2010,0.012616,-0.016918,-0.207949,0.024872,0.236306,1.0,0.022198,0.074205,0.019499,0.003901,0.139682,0.092103,0.101174,-0.006581,-0.008723,-0.012958,-0.048235,0.047934,-0.045665,0.165209,-0.119217
general_trust,0.005301,-0.030965,0.054192,0.013796,-0.025109,0.022198,1.0,-0.010252,-0.06083,0.015354,-0.018448,0.003257,0.012932,0.014569,-0.019527,0.028687,0.012801,-0.065573,-0.092722,0.035934,0.065355
educ_good,0.016522,-0.019766,-0.322657,-0.017597,-0.070619,0.074205,-0.010252,1.0,0.185902,-0.047912,0.016083,0.097223,0.426674,0.043993,0.04942,-0.023635,0.004008,-0.016881,-0.042204,0.063284,-0.018419
male,-0.033755,0.14483,0.099042,0.000977,-0.06177,0.019499,-0.06083,0.185902,1.0,-0.110819,-0.062869,0.053844,0.344685,0.016481,0.010288,-0.031357,0.099325,0.008804,0.281267,-0.233784,-0.07018
disaster_loss,0.025671,-0.08975,-0.072829,-0.002711,0.07573,0.003901,0.015354,-0.047912,-0.110819,1.0,0.637458,-0.084106,-0.077608,0.031707,-0.04703,-0.0377,-0.685436,0.012353,-0.303739,0.115458,0.216406


So educ_missing and literacy_missing have zero values in training data so do we remove them? But 152, nonzero values in test

In [19]:
model = LogisticRegression(random_state=1)

In [28]:
params = {
    'solver': ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga'],
    'class_weight': [None, 'balanced'],
    'penalty':['l1', 'l2', 'elasticnet', None],
    'max_iter':[500, 1000, 2000]
}

In [29]:
# Dataset will be divided into 5 folds (or subsets). 
# During cross-validation, the model will be trained and evaluated 5 times
# Data will be randomly shuffled before splitting into folds.
skf = StratifiedKFold(n_splits=5, shuffle = True, random_state = 1001)

In [30]:
random_search = RandomizedSearchCV(model, param_distributions=params, n_iter=375, scoring='neg_log_loss', n_jobs=4, cv=skf.split(X_train,y_train))

In [31]:
random_search.fit(X_train,y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Further options are to use another solver or to avoid such situation in the first place. Possible remedies are removing collinear features of X or increasing the penalization strengths.
The original Linear Algebra message was:
Ill-conditioned matrix (rcond=1.03498e-21): result may not be accurate.
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res)
Further options are to use another solver or to avoid such situation in the first place. Possible remedies are removing collinear features of X or increasing the penalization strengths.
The original Linear Algebra message was:
Ill-conditioned matrix (rcond=1.02633e-21): result may not be accurate.
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/m

Further options are to use another solver or to avoid such situation in the first place. Possible remedies are removing collinear features of X or increasing the penalization strengths.
The original Linear Algebra message was:
Ill-conditioned matrix (rcond=2.11736e-22): result may not be accurate.
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res)
Further options are to use another solver or to avoid such situation in the first place. Possible remedies are removing collinear features of X or increasing the penalization strengths.
The original Linear Algebra message was:
Ill-conditioned matrix (rcond=1.01661e-21): result may not be accurate.
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/m

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res)
Further options are to use another solver or to avoid such situation in the first place. Possible remedies are removing collinear features of X or increasing the penalization strengths.
The original Linear Algebra message was:
Ill-conditioned matrix (rcond=1.02633e-21): result may not be accurate.
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res)
Further options are to use another solver or to avoid such situation in the first place. Possible remedies are removing collinear features of X or increasing the penalization strengths.
The original Linear Algebra m

Further options are to use another solver or to avoid such situation in the first place. Possible remedies are removing collinear features of X or increasing the penalization strengths.
The original Linear Algebra message was:
Ill-conditioned matrix (rcond=2.11736e-22): result may not be accurate.
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res)
Further options are to use another solver or to avoid such situation in the first place. Possible remedies are removing collinear features of X or increasing the penalization strengths.
The original Linear Algebra message was:
Ill-conditioned matrix (rcond=1.02877e-21): result may not be accurate.
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/m

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
Further options are to use another solver or to avoid such situation in the first place. Possible remedies are removing collinear features of X or increasing the penalization strengths.
The original Linear Algebra message was:
Ill-conditioned matrix (rcond=7.0442e-22): result may not be accurate.
Further options are to use another solver or to avoid such situation in the first place. Possible remedies are removing collinear features of X or increasing the penalization strengths.
The original Linear Algebra message was:
Ill-conditioned matrix (rcond=2.52183e-22): result may not be accurate.
STOP: TOTAL NO. of ITERATION

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
Further options are to use another solver or to avoid such situation in the first place. Possible remedies are removing collinear features of X or increasing the penalization strengths.
The original Linear Algebra message was:
Ill-conditioned matrix 

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res)
Further options are to use another solver or to avoid such situation in the first place. Possible remedies are removing collinear features of X or increasing the penalization strengths.
The original Linear Algebra message was:
Ill-conditioned matrix (rcond=2.52183e-22): result may not be accurate.
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res)
Further options are to use another solver or to avoid such situation in the first place. Possible remedies are removing collinear features of X or increasing the penalization strengths.
The original Linear Algebra m

Further options are to use another solver or to avoid such situation in the first place. Possible remedies are removing collinear features of X or increasing the penalization strengths.
The original Linear Algebra message was:
Ill-conditioned matrix (rcond=2.52183e-22): result may not be accurate.
Further options are to use another solver or to avoid such situation in the first place. Possible remedies are removing collinear features of X or increasing the penalization strengths.
The original Linear Algebra message was:
Ill-conditioned matrix (rcond=7.0442e-22): result may not be accurate.
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res)
Further options are to use another solver or to avoid such situation in the first place. Possible remedies are removing collinear features of X or increasing th

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res)
Further options are to use another solver or to avoid such situation in the first place. Possible remedies are removing collinear features of X or increasing the penalization strengths.
The original Linear Algebra message was:
Ill-conditioned matrix (rcond=9.79997e-22): result may not be accurate.
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimi



In [32]:
best_params = random_search.best_params_
best_params

{'solver': 'newton-cg', 'penalty': None, 'max_iter': 500, 'class_weight': None}

In [33]:
X_test = one_hot_encode(X_test, 'region')
#X_test = one_hot_encode(X_test, 'village')
X_test = X_test.drop(columns = ['literacy_missing', 'educ_missing', 'age_missing'])

In [34]:
y_pred = random_search.predict(X_test)

In [35]:
accuracy = accuracy_score(y_test, y_pred)
accuracy

0.6034658511722731

In [36]:
log_loss_score = log_loss(y_test, random_search.predict_proba(X_test))
log_loss_score #y_test (20% of training dataset given)

0.6670734496761109

In [37]:
model = LogisticRegression(random_state=1)
model.fit(X_train,y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [38]:
y_pred = model.predict(X_test)

In [41]:
accuracy = accuracy_score(y_test, y_pred)
accuracy

0.5667686034658511

In [42]:
log_loss_score = log_loss(y_test, random_search.predict_proba(X_test))
log_loss_score #y_test (20% of training dataset given)

0.6670734496761109

In [43]:
test = pd.read_csv('insurance_prediction_to_predict.csv')

In [44]:
test.isna().sum()

id                         0
region                     0
village                    0
age                        0
agpop                      0
rice_inc                   0
ricearea_2010              0
general_trust              0
educ                     152
educ_good                  0
male                       0
disaster_loss              0
disaster_yes               0
risk_averse                0
literacy                   0
age_missing                0
agpop_missing              0
rice_inc_missing           0
ricearea_2010_missing      0
disaster_loss_missing      0
educ_missing               0
male_missing               0
literacy_missing           0
dtype: int64

In [45]:
test.shape

(9805, 23)

In [46]:
X.educ_missing.sum()

23

In [47]:
test.educ_missing.sum()

152

In [48]:
test = test.drop(columns = ['literacy_missing', 'educ_missing', 'age_missing', 'educ'])
test = one_hot_encode(test, 'region')
#test = one_hot_encode(test, 'village')

In [52]:
results = model.predict_proba(test)

In [53]:
results

array([[0.61093998, 0.38906002],
       [0.56690202, 0.43309798],
       [0.51565936, 0.48434064],
       ...,
       [0.63498353, 0.36501647],
       [0.58841089, 0.41158911],
       [0.57027105, 0.42972895]])

In [54]:
len(results)

9805

In [56]:
model.predict(test)

array([0, 0, 0, ..., 0, 0, 0])

In [58]:
sum(model.predict(test))

1443

In [59]:
results_dict = {
    'id': test.id,
    'takeup': results[:,1]
}

In [126]:
results_df = pd.DataFrame(results_dict)

In [127]:
results_df.to_csv('submission_rf_ohe_region.csv', header = True, index = False)