In [379]:
import pandas as pd
import numpy as np
# Lift the column display limit so wide frames are shown in full rather than elided.
pd.set_option('display.max_columns', None)


In [380]:
# Load the pre-cleaned feature tables and the training labels.
train = pd.read_csv('train_cleaned.csv')
test = pd.read_csv('test_cleaned.csv')
target = pd.read_csv('training_set_labels.csv')

# 'Unnamed: 0' is presumably an index column written out by an earlier to_csv();
# drop it without mutating in place so the cell is safe to read top-down.
train = train.drop(columns='Unnamed: 0')
test = test.drop(columns='Unnamed: 0')

# Stack train on top of test so that imputation and encoding are applied to both
# consistently. DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat(..., ignore_index=True) produces the same frame on every version.
total = pd.concat([train, test], ignore_index=True)

In [381]:
# Peek at the first label row to see which columns the label file carries.
target.head(1)

Unnamed: 0,id,status_group
0,69572,functional


In [382]:
# Hold on to the test ids in their own variable: the submission written at the
# end of the notebook needs them next to the predictions.
test_ids = test['id']

# The row id is only an identifier, not a feature, so remove it from every frame
# (features, labels and the combined train+test table).
train = train.drop(columns='id')
test = test.drop(columns='id')
target = target.drop(columns='id')
total = total.drop(columns='id')

In [383]:
# Absolute number of training rows per target class.
target['status_group'].value_counts()

functional                 32259
non functional             22824
functional needs repair     4317
Name: status_group, dtype: int64

In [384]:
# Share of each target class among the training labels (count / total count).
class_counts = target['status_group'].value_counts()
proportion = class_counts / class_counts.sum()
proportion

functional                 0.543081
non functional             0.384242
functional needs repair    0.072677
Name: status_group, dtype: float64

In [385]:
# Dtypes and non-null counts of the combined train+test frame, to see which columns have gaps.
total.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74250 entries, 0 to 74249
Data columns (total 29 columns):
amount_tsh           74250 non-null float64
funder               69746 non-null object
gps_height           74250 non-null float64
installer            69718 non-null object
longitude            74250 non-null float64
latitude             74250 non-null float64
basin                74250 non-null object
subvillage           73780 non-null object
region               74250 non-null object
region_code          74250 non-null int64
district_code        74250 non-null int64
lga                  74250 non-null object
ward                 74250 non-null object
population           74250 non-null float64
public_meeting       70095 non-null object
scheme_management    69404 non-null object
scheme_name          38992 non-null object
permit               70457 non-null object
construction_year    74250 non-null float64
extraction_type      74250 non-null object
management           74250 

In [386]:
# Full list of column names in the combined frame.
total.columns

Index(['amount_tsh', 'funder', 'gps_height', 'installer', 'longitude',
       'latitude', 'basin', 'subvillage', 'region', 'region_code',
       'district_code', 'lga', 'ward', 'population', 'public_meeting',
       'scheme_management', 'scheme_name', 'permit', 'construction_year',
       'extraction_type', 'management', 'management_group', 'payment',
       'payment_type', 'water_quality', 'quantity', 'source',
       'waterpoint_type', 'operation_years'],
      dtype='object')

In [387]:
# Features that are not passed on to the model below.
# NOTE(review): the reason each one is excluded is not recorded here - worth documenting.
unused_columns = [
    'funder', 'payment_type', 'scheme_name', 'subvillage',
    'region', 'region_code', 'district_code', 'lga', 'ward',
    'public_meeting', 'water_quality',
]
total = total.drop(columns=unused_columns)

In [388]:
# Missing population values take the column median.
total['population'] = total['population'].fillna(total['population'].median())

# Missing construction years take the most frequent year; when the most frequent
# value is 0, the runner-up is used instead.
year_counts = total['construction_year'].value_counts()
fill_year = year_counts.index[1] if year_counts.index[0] == 0 else year_counts.index[0]
total['construction_year'] = total['construction_year'].fillna(fill_year)

In [389]:
# Fill gaps in the categorical columns with each column's most frequent value.
# When the most frequent value is a literal 0 it is skipped in favour of the
# runner-up, as in the numeric imputation.
cols = ['installer', 'scheme_management', 'permit']
for col in cols:
    counts = total[col].value_counts()
    most_common = counts.index[0]
    # Booleans are excluded explicitly: in Python `False == 0` is True, so a
    # boolean column (e.g. permit) whose mode is False would otherwise be
    # treated as a 0 placeholder and filled with its *least* common value.
    is_zero_placeholder = (
        not isinstance(most_common, (bool, np.bool_)) and most_common == 0
    )
    fill_value = counts.index[1] if is_zero_placeholder else most_common
    total[col] = total[col].fillna(fill_value)

In [390]:
# Re-check dtypes and non-null counts after dropping columns and imputing.
total.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74250 entries, 0 to 74249
Data columns (total 18 columns):
amount_tsh           74250 non-null float64
gps_height           74250 non-null float64
installer            74250 non-null object
longitude            74250 non-null float64
latitude             74250 non-null float64
basin                74250 non-null object
population           74250 non-null float64
scheme_management    74250 non-null object
permit               74250 non-null bool
construction_year    74250 non-null float64
extraction_type      74250 non-null object
management           74250 non-null object
management_group     74250 non-null object
payment              74250 non-null object
quantity             74250 non-null object
source               74250 non-null object
waterpoint_type      74250 non-null object
operation_years      74250 non-null int64
dtypes: bool(1), float64(6), int64(1), object(10)
memory usage: 9.7+ MB


In [391]:
# Divide the combined frame back into its train and test parts.
# The first len(target) rows are the training rows (one label per training row),
# so the boundary is derived from the labels instead of a hard-coded row count.
n_train = len(target)

# .copy() makes both parts independent frames; new columns are assigned to them
# later, which on bare slices raises SettingWithCopyWarning.
train = total.iloc[:n_train, :].copy()
test = total.iloc[n_train:, :].copy()

In [392]:
# Pair every training row's installer with its label, then replace the label
# column by one indicator column per status class.
installer_with_label = pd.concat([train['installer'], target], axis=1)
target_dummies = pd.get_dummies(installer_with_label.pop('status_group'))
train_installer_target = installer_with_label.join(target_dummies)
train_installer_target.head(2)

Unnamed: 0,installer,functional,functional needs repair,non functional
0,Roman,1,0,0
1,GRUMETI,1,0,0


In [393]:
# Per installer: the share of its pumps in each status class (mean of the
# indicator columns) plus the number of pumps it installed.
by_installer = train_installer_target.groupby('installer')
class_shares = by_installer.mean()
# Every indicator column has the same non-null count, so the first one suffices.
pump_counts = by_installer.count().iloc[:, 0]

train_installer_target_prop = pd.concat([class_shares, pump_counts], axis=1)
train_installer_target_prop.columns = ['functional', 'functional needs repair', 'non functional', 'count']
train_installer_target_prop.sort_values(by='count', ascending=False).head()

Unnamed: 0_level_0,functional,functional needs repair,non functional,count
installer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
DWE,0.542955,0.097925,0.35912,21057
Government,0.293151,0.140274,0.566575,1825
RWE,0.252073,0.113599,0.634328,1206
Commu,0.683019,0.030189,0.286792,1060
DANIDA,0.51619,0.079048,0.404762,1050


In [394]:
# Label each installer by comparing its class shares with fixed thresholds.
# 0.54 and 0.38 are roughly the overall 'functional' / 'non functional' shares
# of the training labels; 0.05 sits below the overall 'functional needs repair'
# share of about 0.07.
is_functional = train_installer_target_prop['functional'] > 0.54
is_non_functional = train_installer_target_prop['non functional'] > 0.38
needs_repair = train_installer_target_prop['functional needs repair'] > 0.05

# Start from an object column (not float zeros) so string labels fit its dtype.
train_installer_target_prop['category'] = None

# Assign through a single .loc on the frame. The previous chained form
# (frame['category'].loc[mask] = ...) writes to a temporary Series, raises
# SettingWithCopyWarning and has no effect under pandas copy-on-write.
# Later assignments win, so the precedence is:
# 'functional needs repair' > 'non functional' > 'functional'.
train_installer_target_prop.loc[is_functional, 'category'] = 'functional'
train_installer_target_prop.loc[is_non_functional, 'category'] = 'non functional'
train_installer_target_prop.loc[needs_repair, 'category'] = 'functional needs repair'

# The three shares of an installer sum to 1 while the thresholds sum to 0.97,
# so at least one threshold is always exceeded and no installer stays unlabeled.
assert train_installer_target_prop['category'].notna().all()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [395]:
# Translate each row's installer into the installer category derived from the
# training labels.
installer_to_category = train_installer_target_prop['category']

# .assign returns a new frame instead of writing a column into what may be a
# slice of `total`, which is what raised SettingWithCopyWarning here.
train = train.assign(installer_category=train['installer'].map(installer_to_category))

# Installers that occur only in the test set have no category; they receive the
# most frequent category among the mapped test rows.
test_category = test['installer'].map(installer_to_category)
test = test.assign(
    installer_category=test_category.fillna(test_category.value_counts().index[0])
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [396]:
# Re-stack train and test (now carrying installer_category) for joint encoding.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
total = pd.concat([train, test], ignore_index=True)

In [397]:
# The raw installer name is superseded by installer_category, so remove it.
total = total.drop(columns='installer')

In [398]:
# Summarise the object (string) columns: number of distinct values, most frequent value and its frequency.
total.describe(include = ['O'])

Unnamed: 0,basin,scheme_management,extraction_type,management,management_group,payment,quantity,source,waterpoint_type,installer_category
count,74250,74250,74250,74250,74250,74250,74250,74250,74250,74250
unique,9,12,18,12,5,7,5,10,7,3
top,Lake Victoria,VWC,gravity,vwc,user-group,never pay,enough,spring,communal standpipe,functional needs repair
freq,12871,50763,33263,50624,65538,31712,41522,21216,35628,47078


In [399]:
# Final check of dtypes and non-null counts before encoding.
total.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74250 entries, 0 to 74249
Data columns (total 18 columns):
amount_tsh            74250 non-null float64
gps_height            74250 non-null float64
longitude             74250 non-null float64
latitude              74250 non-null float64
basin                 74250 non-null object
population            74250 non-null float64
scheme_management     74250 non-null object
permit                74250 non-null bool
construction_year     74250 non-null float64
extraction_type       74250 non-null object
management            74250 non-null object
management_group      74250 non-null object
payment               74250 non-null object
quantity              74250 non-null object
source                74250 non-null object
waterpoint_type       74250 non-null object
operation_years       74250 non-null int64
installer_category    74250 non-null object
dtypes: bool(1), float64(6), int64(1), object(10)
memory usage: 9.7+ MB


In [400]:
# One-hot encode the object columns; numeric and boolean columns (e.g. permit) are passed through as they are.
total_dummies = pd.get_dummies(total)

In [401]:
# Summary statistics of the encoded matrix (value ranges and indicator frequencies).
total_dummies.describe()

Unnamed: 0,amount_tsh,gps_height,longitude,latitude,population,construction_year,operation_years,basin_Internal,basin_Lake Nyasa,basin_Lake Rukwa,basin_Lake Tanganyika,basin_Lake Victoria,basin_Pangani,basin_Rufiji,basin_Ruvuma / Southern Coast,basin_Wami / Ruvu,scheme_management_Company,scheme_management_None,scheme_management_Other,scheme_management_Parastatal,scheme_management_Private operator,scheme_management_SWC,scheme_management_Trust,scheme_management_VWC,scheme_management_WUA,scheme_management_WUG,scheme_management_Water Board,scheme_management_Water authority,extraction_type_afridev,extraction_type_cemo,extraction_type_climax,extraction_type_gravity,extraction_type_india mark ii,extraction_type_india mark iii,extraction_type_ksb,extraction_type_mono,extraction_type_nira/tanira,extraction_type_other,extraction_type_other - mkulima/shinyanga,extraction_type_other - play pump,extraction_type_other - rope pump,extraction_type_other - swn 81,extraction_type_submersible,extraction_type_swn 80,extraction_type_walimi,extraction_type_windmill,management_company,management_other,management_other - school,management_parastatal,management_private operator,management_trust,management_unknown,management_vwc,management_water authority,management_water board,management_wua,management_wug,management_group_commercial,management_group_other,management_group_parastatal,management_group_unknown,management_group_user-group,payment_never pay,payment_other,payment_pay annually,payment_pay monthly,payment_pay per bucket,payment_pay when scheme fails,payment_unknown,quantity_dry,quantity_enough,quantity_insufficient,quantity_seasonal,quantity_unknown,source_dam,source_hand dtw,source_lake,source_machine dbh,source_other,source_rainwater harvesting,source_river,source_shallow well,source_spring,source_unknown,waterpoint_type_cattle trough,waterpoint_type_communal standpipe,waterpoint_type_communal standpipe multiple,waterpoint_type_dam,waterpoint_type_hand 
pump,waterpoint_type_improved spring,waterpoint_type_other,installer_category_functional,installer_category_functional needs repair,installer_category_non functional
count,74250.0,74250.0,74250.0,74250.0,74250.0,74250.0,74250.0,74250.0,74250.0,74250.0,74250.0,74250.0,74250.0,74250.0,74250.0,74250.0,74250.0,74250.0,74250.0,74250.0,74250.0,74250.0,74250.0,74250.0,74250.0,74250.0,74250.0,74250.0,74250.0,74250.0,74250.0,74250.0,74250.0,74250.0,74250.0,74250.0,74250.0,74250.0,74250.0,74250.0,74250.0,74250.0,74250.0,74250.0,74250.0,74250.0,74250.0,74250.0,74250.0,74250.0,74250.0,74250.0,74250.0,74250.0,74250.0,74250.0,74250.0,74250.0,74250.0,74250.0,74250.0,74250.0,74250.0,74250.0,74250.0,74250.0,74250.0,74250.0,74250.0,74250.0,74250.0,74250.0,74250.0,74250.0,74250.0,74250.0,74250.0,74250.0,74250.0,74250.0,74250.0,74250.0,74250.0,74250.0,74250.0,74250.0,74250.0,74250.0,74250.0,74250.0,74250.0,74250.0,74250.0,74250.0,74250.0
mean,1241.682018,1078.416653,35.091203,-5.79547,288.755872,1997.635111,14.140929,0.129859,0.085279,0.041199,0.108444,0.173347,0.150074,0.134505,0.075246,0.102047,0.018061,1.3e-05,0.013414,0.028606,0.017859,0.001657,0.001239,0.683677,0.047825,0.087488,0.046626,0.053535,0.029737,0.001455,0.000552,0.447987,0.040795,0.001818,0.024108,0.048862,0.137441,0.109118,2.7e-05,0.00136,0.007704,0.003825,0.080566,0.061791,0.000808,0.002047,0.011569,0.014586,0.001697,0.03002,0.033724,0.001414,0.009199,0.681805,0.015125,0.04967,0.041993,0.109199,0.061832,0.016283,0.03002,0.009199,0.882667,0.427098,0.017697,0.061549,0.140027,0.151731,0.065212,0.136687,0.104808,0.559219,0.254492,0.06835,0.013131,0.011313,0.014923,0.012795,0.186155,0.003515,0.038559,0.161131,0.284714,0.285737,0.001158,0.00202,0.479838,0.102505,0.000108,0.294734,0.012916,0.107879,0.184795,0.634047,0.181158
std,3508.83224,513.919946,2.58926,2.8085,466.182953,10.178555,10.278036,0.33615,0.279299,0.198751,0.310943,0.378549,0.357146,0.341196,0.263789,0.302712,0.133172,0.00367,0.115041,0.166698,0.132438,0.040667,0.035179,0.465044,0.213397,0.282551,0.210838,0.2251,0.169863,0.038111,0.023492,0.497291,0.197815,0.042602,0.153385,0.215581,0.344315,0.311789,0.00519,0.036857,0.087433,0.061728,0.272169,0.240778,0.028415,0.045199,0.106936,0.119889,0.04116,0.170644,0.180519,0.037579,0.095468,0.465779,0.122049,0.217264,0.200575,0.31189,0.240851,0.126562,0.170644,0.095468,0.321819,0.49466,0.131848,0.240336,0.347017,0.358762,0.246901,0.343519,0.306308,0.496484,0.435578,0.252348,0.113838,0.105761,0.121244,0.112388,0.389234,0.059185,0.192543,0.367655,0.451281,0.451768,0.034014,0.044902,0.499597,0.303313,0.010379,0.455926,0.112912,0.310229,0.388133,0.4817,0.385152
min,0.2,-90.0,29.607122,-11.64944,1.0,1960.0,-7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,200.0,833.0,33.23447,-8.525675,80.0,1995.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,1000.0,1192.0,34.907475,-5.02654,200.0,1999.5,11.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
75%,1241.682018,1337.0,37.181685,-3.352929,400.0,2004.0,16.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
max,350000.0,2777.0,40.345193,-0.998464,30500.0,2013.0,53.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [402]:
# Leave construction_year out of the feature matrix.
# NOTE(review): presumably because operation_years already encodes the pump's age - confirm.
total_dummies = total_dummies.drop(columns='construction_year')
total_dummies.shape

(74250, 95)

In [403]:
# Split the encoded matrix back into train and test. The training rows come
# first and there is exactly one label per training row, so the boundary is
# taken from the label frame instead of a hard-coded row count.
n_train = len(target)
train = total_dummies.iloc[:n_train, :]
test = total_dummies.iloc[n_train:, :]

In [404]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score

In [405]:
# Random forest with 500 trees; the fixed random_state keeps repeated runs comparable.
classifier = RandomForestClassifier(n_estimators=500, random_state = 8)


In [406]:
# Mean accuracy over 5 cross-validation folds.
# NOTE(review): installer_category was derived from the labels of the whole
# training set, so every validation fold has indirectly seen its own labels;
# this estimate is likely optimistic - consider computing it inside each fold.
fold_scores = cross_val_score(classifier, train, np.ravel(target), scoring='accuracy', cv=5)
score = fold_scores.mean()
print("Score: {}".format(score))

Score: 0.804158240399234


In [407]:

# First rows of the encoded feature matrix that is fed to the model.
total_dummies.head()

Unnamed: 0,amount_tsh,gps_height,longitude,latitude,population,permit,operation_years,basin_Internal,basin_Lake Nyasa,basin_Lake Rukwa,basin_Lake Tanganyika,basin_Lake Victoria,basin_Pangani,basin_Rufiji,basin_Ruvuma / Southern Coast,basin_Wami / Ruvu,scheme_management_Company,scheme_management_None,scheme_management_Other,scheme_management_Parastatal,scheme_management_Private operator,scheme_management_SWC,scheme_management_Trust,scheme_management_VWC,scheme_management_WUA,scheme_management_WUG,scheme_management_Water Board,scheme_management_Water authority,extraction_type_afridev,extraction_type_cemo,extraction_type_climax,extraction_type_gravity,extraction_type_india mark ii,extraction_type_india mark iii,extraction_type_ksb,extraction_type_mono,extraction_type_nira/tanira,extraction_type_other,extraction_type_other - mkulima/shinyanga,extraction_type_other - play pump,extraction_type_other - rope pump,extraction_type_other - swn 81,extraction_type_submersible,extraction_type_swn 80,extraction_type_walimi,extraction_type_windmill,management_company,management_other,management_other - school,management_parastatal,management_private operator,management_trust,management_unknown,management_vwc,management_water authority,management_water board,management_wua,management_wug,management_group_commercial,management_group_other,management_group_parastatal,management_group_unknown,management_group_user-group,payment_never pay,payment_other,payment_pay annually,payment_pay monthly,payment_pay per bucket,payment_pay when scheme fails,payment_unknown,quantity_dry,quantity_enough,quantity_insufficient,quantity_seasonal,quantity_unknown,source_dam,source_hand dtw,source_lake,source_machine dbh,source_other,source_rainwater harvesting,source_river,source_shallow well,source_spring,source_unknown,waterpoint_type_cattle trough,waterpoint_type_communal standpipe,waterpoint_type_communal standpipe multiple,waterpoint_type_dam,waterpoint_type_hand pump,waterpoint_type_improved 
spring,waterpoint_type_other,installer_category_functional,installer_category_functional needs repair,installer_category_non functional
0,6000.0,1390.0,34.938093,-9.856322,109.0,False,12,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0
1,537.5,1399.0,34.698766,-2.147466,280.0,True,3,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0
2,25.0,686.0,37.460664,-3.821329,250.0,True,4,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0
3,321.428571,263.0,38.486161,-11.155298,58.0,True,27,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1
4,1241.682018,1192.0,31.130847,-1.825359,200.0,True,11,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1


In [408]:
# Fit the final forest on all training rows, with the same settings as the
# cross-validated classifier.
train_labels = np.ravel(target)  # flatten the single-column label frame to 1-D
model = RandomForestClassifier(n_estimators=500, random_state=8)
model.fit(train, train_labels)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=None,
            oob_score=False, random_state=8, verbose=0, warm_start=False)

In [409]:
# Predict a status_group label for every test row with the fitted forest.
test_predictions = model.predict(test)

In [410]:
# Build the submission table: one row per test pump with its id and predicted status.
# The ids are passed as a plain array so they are paired with the predictions by
# position; inserting the Series would instead align on its index labels and
# yield NaN ids wherever they differ from the new frame's 0..n-1 index.
submission = pd.DataFrame({
    'id': np.asarray(test_ids),
    'status_group': test_predictions,
})

# (The former bare `submission.reset_index()` discarded its result and had no effect.)
submission.to_csv('submission.csv', index = False)