In [1]:
import pandas as pd
import numpy as np

import pickle

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

In [2]:
data = pd.read_csv('test set values.csv')

In [3]:
data.columns

Index([u'id', u'amount_tsh', u'date_recorded', u'funder', u'gps_height',
       u'installer', u'longitude', u'latitude', u'wpt_name', u'num_private',
       u'basin', u'subvillage', u'region', u'region_code', u'district_code',
       u'lga', u'ward', u'population', u'public_meeting', u'recorded_by',
       u'scheme_management', u'scheme_name', u'permit', u'construction_year',
       u'extraction_type', u'extraction_type_group', u'extraction_type_class',
       u'management', u'management_group', u'payment', u'payment_type',
       u'water_quality', u'quality_group', u'quantity', u'quantity_group',
       u'source', u'source_type', u'source_class', u'waterpoint_type',
       u'waterpoint_type_group'],
      dtype='object')

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14850 entries, 0 to 14849
Data columns (total 40 columns):
id                       14850 non-null int64
amount_tsh               14850 non-null float64
date_recorded            14850 non-null object
funder                   13981 non-null object
gps_height               14850 non-null int64
installer                13973 non-null object
longitude                14850 non-null float64
latitude                 14850 non-null float64
wpt_name                 14850 non-null object
num_private              14850 non-null int64
basin                    14850 non-null object
subvillage               14751 non-null object
region                   14850 non-null object
region_code              14850 non-null int64
district_code            14850 non-null int64
lga                      14850 non-null object
ward                     14850 non-null object
population               14850 non-null int64
public_meeting           14029 non-null object
r

In [5]:
data.head()

Unnamed: 0,id,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,...,payment_type,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group
0,50785,0.0,2013-02-04,Dmdd,1996,DMDD,35.290799,-4.059696,Dinamu Secondary School,0,...,never pay,soft,good,seasonal,seasonal,rainwater harvesting,rainwater harvesting,surface,other,other
1,51630,0.0,2013-02-04,Government Of Tanzania,1569,DWE,36.656709,-3.309214,Kimnyak,0,...,never pay,soft,good,insufficient,insufficient,spring,spring,groundwater,communal standpipe,communal standpipe
2,17168,0.0,2013-02-01,,1567,,34.767863,-5.004344,Puma Secondary,0,...,never pay,soft,good,insufficient,insufficient,rainwater harvesting,rainwater harvesting,surface,other,other
3,45559,0.0,2013-01-22,Finn Water,267,FINN WATER,38.058046,-9.418672,Kwa Mzee Pange,0,...,unknown,soft,good,dry,dry,shallow well,shallow well,groundwater,other,other
4,49871,500.0,2013-03-27,Bruder,1260,BRUDER,35.006123,-10.950412,Kwa Mzee Turuka,0,...,monthly,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe


# Feature Transformation/Engineering

# Date Features: date_recorded, construction_year

In [6]:
# Convert dates from strings
data['date_recorded'] = pd.to_datetime(data['date_recorded'])
data['construction_year'] = pd.to_datetime(data['construction_year'])

In [7]:
data['age_of_pump_mnths'] = (data['date_recorded']-data['construction_year'])/np.timedelta64(1, 'Y')

In [8]:
data.drop('date_recorded',axis = 1, inplace = True)
data.drop('construction_year',axis =1, inplace = True)

# Geographical Features: longitude, latitude, gps_height,basin, subvillage, region, region_code,district_code,lga,ward.

In [9]:
geo_data = data[['longitude','latitude','gps_height','basin','subvillage',
                 'region','region_code','district_code','lga','ward']]

In [10]:
geo_data[['longitude','latitude','gps_height']].hist(figsize = (20,10))

array([[<matplotlib.axes.AxesSubplot object at 0x10af0b150>,
        <matplotlib.axes.AxesSubplot object at 0x10d750a50>],
       [<matplotlib.axes.AxesSubplot object at 0x10d7d7910>,
        <matplotlib.axes.AxesSubplot object at 0x10d83bed0>]], dtype=object)

In [11]:
geo_data.groupby(['region','region_code','district_code','basin','subvillage','lga','ward']).size()

region  region_code  district_code  basin        subvillage           lga      ward       
Arusha  2            1              Internal     Arkaria              Monduli  Engutoto       1
                                                 Emugurunanyoki       Monduli  Sepeko         1
                                                 Engasiti             Monduli  Makuyuni       1
                                                 Jangwani             Monduli  Esilalei       1
                                                 Lapalasek            Monduli  Engaruka       1
                                                 Larmaroro            Monduli  Monduli Juu    1
                                                 Losikito             Monduli  Sepeko         1
                                                 Losirwa              Monduli  Esilalei       1
                                                 Madukani             Monduli  Engaruka       1
                                             

In [12]:
for i in geo_data.columns:
    print i,len(geo_data[i].unique()),geo_data[i].unique()
    print ""

longitude 14390 [ 35.2907992   36.65670893  34.76786315 ...,  34.7398045   35.43273168
  34.76505448]

latitude 14390 [ -4.05969643  -3.30921425  -5.00434437 ...,  -4.58558667 -10.58415869
 -11.22601197]

gps_height 2157 [1996 1569 1567 ..., 1909 2202  640]

basin 9 ['Internal' 'Pangani' 'Ruvuma / Southern Coast' 'Rufiji' 'Lake Victoria'
 'Lake Tanganyika' 'Wami / Ruvu' 'Lake Rukwa' 'Lake Nyasa']

subvillage 8444 ['Magoma' 'Kimnyak' 'Msatu' ..., 'Kipengele' 'Kosoro' 'Kamba']

region 21 ['Manyara' 'Arusha' 'Singida' 'Lindi' 'Ruvuma' 'Iringa' 'Mtwara'
 'Kilimanjaro' 'Tabora' 'Mwanza' 'Pwani' 'Tanga' 'Shinyanga'
 'Dar es Salaam' 'Kigoma' 'Dodoma' 'Morogoro' 'Mbeya' 'Kagera' 'Mara'
 'Rukwa']

region_code 26 [21  2 13 80 10 11  9 90  3 14 19  6  4 17  7 16  1 24  5 12 99 18 20 15  8
 60]

district_code 20 [ 3  2 43  7  4 33  5  1  6 30  8 13 63 53 62 23 60 67  0 80]

lga 125 ['Mbulu' 'Arusha Rural' 'Singida Rural' 'Liwale' 'Mbinga' 'Kilolo'
 'Tandahimba' 'Newala' 'Siha' 'Hai' 'Simanjiro' 'I

In [13]:
geo_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14850 entries, 0 to 14849
Data columns (total 10 columns):
longitude        14850 non-null float64
latitude         14850 non-null float64
gps_height       14850 non-null int64
basin            14850 non-null object
subvillage       14751 non-null object
region           14850 non-null object
region_code      14850 non-null int64
district_code    14850 non-null int64
lga              14850 non-null object
ward             14850 non-null object
dtypes: float64(2), int64(3), object(5)
memory usage: 1.1+ MB


Looking at the data above, for initial attempt we will;
1. Impute Longitude,Latitude and GPS Height by mean
2. For remaining features will encode the values individually.

In [14]:
mean_imputation = ['longitude','latitude','gps_height']
label_encode = ['basin','subvillage','region','region_code','district_code','lga','ward']

In [15]:
def geo_feat_eng(df):
    le = LabelEncoder()
    for i in df.columns:
        print ""
        print "Picked up Feature: "+i
        if i in mean_imputation:
            df[i].replace(0, np.nan, inplace=True)
            df[i].fillna(df[i].mean(),inplace = True)
            print "Imputed mean for Feature: "+i
        elif i in label_encode:
            le.fit(df[i])
            print "Fitting LabelEncoder on Feature: "+i
            df[i] = le.transform(df[i])
            print "Transformed Feature: "+i
        else:
            print "Ignored, Out of scope"
    return df.dtypes

In [16]:
geo_feat_eng(data)


Picked up Feature: id
Ignored, Out of scope

Picked up Feature: amount_tsh
Ignored, Out of scope

Picked up Feature: funder
Ignored, Out of scope

Picked up Feature: gps_height
Imputed mean for Feature: gps_height

Picked up Feature: installer
Ignored, Out of scope

Picked up Feature: longitude
Imputed mean for Feature: longitude

Picked up Feature: latitude
Imputed mean for Feature: latitude

Picked up Feature: wpt_name
Ignored, Out of scope

Picked up Feature: num_private
Ignored, Out of scope

Picked up Feature: basin
Fitting LabelEncoder on Feature: basin
Transformed Feature: basin

Picked up Feature: subvillage
Fitting LabelEncoder on Feature: subvillage
Transformed Feature: subvillage

Picked up Feature: region
Fitting LabelEncoder on Feature: region
Transformed Feature: region

Picked up Feature: region_code
Fitting LabelEncoder on Feature: region_code
Transformed Feature: region_code

Picked up Feature: district_code
Fitting LabelEncoder on Feature: district_code
Transformed F

  flag = np.concatenate(([True], aux[1:] != aux[:-1]))
  return aux[:-1][aux[1:] == aux[:-1]]


id                         int64
amount_tsh               float64
funder                    object
gps_height               float64
installer                 object
longitude                float64
latitude                 float64
wpt_name                  object
num_private                int64
basin                      int64
subvillage                 int64
region                     int64
region_code                int64
district_code              int64
lga                        int64
ward                       int64
population                 int64
public_meeting            object
recorded_by               object
scheme_management         object
scheme_name               object
permit                    object
extraction_type           object
extraction_type_group     object
extraction_type_class     object
management                object
management_group          object
payment                   object
payment_type              object
water_quality             object
quality_gr

# Remaining Features:

In [17]:
other_feats = [(i) for i in data.columns if data[i].dtype not in ['int64','float64']]

In [18]:
remaining_feats = []

In [19]:
print len(other_feats), other_feats

24 ['funder', 'installer', 'wpt_name', 'public_meeting', 'recorded_by', 'scheme_management', 'scheme_name', 'permit', 'extraction_type', 'extraction_type_group', 'extraction_type_class', 'management', 'management_group', 'payment', 'payment_type', 'water_quality', 'quality_group', 'quantity', 'quantity_group', 'source', 'source_type', 'source_class', 'waterpoint_type', 'waterpoint_type_group']


In [20]:
# Lets check out features which seem to be repeating but the data might be at a different level.
def ret_grouped_view(cols):
    lcols = list(cols)
    return data.groupby(lcols).size()

In [21]:
# In scheme_names there were same strings but segregrated in diff groups due to caps, maybe should do this with
# everything before encoding.

#data['scheme_name'] = data['scheme_name'].str.lower()
ret_grouped_view(('scheme_management', 'scheme_name'))
#data['scheme_name'].value_counts()

# Tried to make sence of scheme names but it doesn't seem to have any logic, hence dropping name and adding
# to rem_feat list.

data.drop('scheme_name',axis = 1, inplace = True)
remaining_feats.append('scheme_management')

In [22]:
ret_grouped_view(('extraction_type', 'extraction_type_group','extraction_type_class'))

extraction_type    extraction_type_group  extraction_type_class
afridev            afridev                handpump                  438
cemo               other motorpump        motorpump                  18
climax             other motorpump        motorpump                   9
gravity            gravity                gravity                  6483
india mark ii      india mark ii          handpump                  629
india mark iii     india mark iii         handpump                   37
ksb                submersible            submersible               375
mono               mono                   motorpump                 763
nira/tanira        nira/tanira            handpump                 2051
other              other                  other                    1672
other - play pump  other handpump         handpump                   16
other - rope pump  rope pump              rope pump                 121
other - swn 81     other handpump         handpump                   55


In [23]:
# Dropping ext type and using other two
data.drop('extraction_type_group',axis = 1,inplace = True)
remaining_feats.append('extraction_type')
remaining_feats.append('extraction_type_class')

In [24]:
ret_grouped_view(('management', 'management_group'))

management        management_group
company           commercial            174
other             other                 239
other - school    other                  27
parastatal        parastatal            461
private operator  commercial            533
trust             commercial             27
unknown           unknown               122
vwc               user-group          10117
water authority   commercial            219
water board       user-group            755
wua               user-group            583
wug               user-group           1593
dtype: int64

In [25]:
# Dropping mgmt grp
data.drop('management_group',axis = 1, inplace=True)
remaining_feats.append('management')

In [26]:
ret_grouped_view(('water_quality', 'quality_group'))

water_quality       quality_group
coloured            colored            133
fluoride            fluoride            44
fluoride abandoned  fluoride             6
milky               milky              201
salty               salty             1226
salty abandoned     salty               84
soft                good             12687
unknown             unknown            469
dtype: int64

In [27]:
# Dropping quality_grp
data.drop('quality_group',axis = 1, inplace=True)
remaining_feats.append('water_quality')

In [28]:
ret_grouped_view(('quantity', 'quantity_group'))

quantity      quantity_group
dry           dry               1536
enough        enough            8336
insufficient  insufficient      3767
seasonal      seasonal          1025
unknown       unknown            186
dtype: int64

In [29]:
# Dropping quantity_group
data.drop('quantity_group',axis = 1, inplace=True)
remaining_feats.append('quantity')

In [30]:
ret_grouped_view(('source', 'source_type', 'source_class'))

source                source_type           source_class
dam                   dam                   surface          184
hand dtw              borehole              groundwater      234
lake                  river/lake            surface          185
machine dbh           borehole              groundwater     2747
other                 other                 unknown           49
rainwater harvesting  rainwater harvesting  surface          568
river                 river/lake            surface         2352
shallow well          shallow well          groundwater     4316
spring                spring                groundwater     4195
unknown               other                 unknown           20
dtype: int64

In [31]:
#As source seems to be cleanest using source dropping other two.
data.drop('source',inplace=True,axis=1)
data.drop('source_type',inplace=True,axis=1)
remaining_feats.append('source_class')

In [32]:
ret_grouped_view(('waterpoint_type', 'waterpoint_type_group'))

waterpoint_type              waterpoint_type_group
cattle trough                cattle trough              34
communal standpipe           communal standpipe       7106
communal standpipe multiple  communal standpipe       1508
dam                          dam                         1
hand pump                    hand pump                4396
improved spring              improved spring           175
other                        other                    1630
dtype: int64

In [33]:
data.drop('waterpoint_type_group',inplace=True,axis=1)
remaining_feats.append('waterpoint_type')

In [34]:
ret_grouped_view(('payment','payment_type'))

payment                payment_type
never pay              never pay       6364
other                  other            260
pay annually           annually         928
pay monthly            monthly         2097
pay per bucket         per bucket      2281
pay when scheme fails  on failure       928
unknown                unknown         1992
dtype: int64

In [35]:
data.drop('payment',inplace=True,axis=1)
remaining_feats.append('payment_type')

In [36]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14850 entries, 0 to 14849
Data columns (total 30 columns):
id                       14850 non-null int64
amount_tsh               14850 non-null float64
funder                   13981 non-null object
gps_height               14850 non-null float64
installer                13973 non-null object
longitude                14850 non-null float64
latitude                 14850 non-null float64
wpt_name                 14850 non-null object
num_private              14850 non-null int64
basin                    14850 non-null int64
subvillage               14850 non-null int64
region                   14850 non-null int64
region_code              14850 non-null int64
district_code            14850 non-null int64
lga                      14850 non-null int64
ward                     14850 non-null int64
population               14850 non-null int64
public_meeting           14029 non-null object
recorded_by              14850 non-null object
sche

In [37]:
# There were non repeating objects which we need to account for:
for i in data.columns:
    if data[i].dtype not in ['int64','float64']:
        if i not in remaining_feats:
            print i

funder
installer
wpt_name
public_meeting
recorded_by
permit


In [38]:
#Also dropping pub meeting, rec by
data.drop('public_meeting',inplace=True,axis=1)
data.drop('recorded_by',inplace=True,axis=1)

remaining_feats.append('funder')
remaining_feats.append('installer')
remaining_feats.append('wpt_name')

In [39]:
remaining_feats

['scheme_management',
 'extraction_type',
 'extraction_type_class',
 'management',
 'water_quality',
 'quantity',
 'source_class',
 'waterpoint_type',
 'payment_type',
 'funder',
 'installer',
 'wpt_name']

In [40]:
for i in remaining_feats:
    print i, data[i].dtypes

scheme_management object
extraction_type object
extraction_type_class object
management object
water_quality object
quantity object
source_class object
waterpoint_type object
payment_type object
funder object
installer object
wpt_name object


In [41]:
def feat_eng(df):
    le = LabelEncoder()
    for i in df.columns:
        print ""
        print "Picked up Feature: "+i
        if i in remaining_feats:
            df[i] = df[i].str.lower()
            le.fit(df[i])
            print "Fitting LabelEncoder on Feature: "+i
            df[i] = le.transform(df[i])
            print "Transformed Feature: "+i
        else:
            print "Ignored, Out of scope"
    return df.dtypes

In [42]:
feat_eng(data)


Picked up Feature: id
Ignored, Out of scope

Picked up Feature: amount_tsh
Ignored, Out of scope

Picked up Feature: funder
Fitting LabelEncoder on Feature: funder
Transformed Feature: funder

Picked up Feature: gps_height
Ignored, Out of scope

Picked up Feature: installer
Fitting LabelEncoder on Feature: installer
Transformed Feature: installer

Picked up Feature: longitude
Ignored, Out of scope

Picked up Feature: latitude
Ignored, Out of scope

Picked up Feature: wpt_name
Fitting LabelEncoder on Feature: wpt_name
Transformed Feature: wpt_name

Picked up Feature: num_private
Ignored, Out of scope

Picked up Feature: basin
Ignored, Out of scope

Picked up Feature: subvillage
Ignored, Out of scope

Picked up Feature: region
Ignored, Out of scope

Picked up Feature: region_code
Ignored, Out of scope

Picked up Feature: district_code
Ignored, Out of scope

Picked up Feature: lga
Ignored, Out of scope

Picked up Feature: ward
Ignored, Out of scope

Picked up Feature: population
Ignored,

id                         int64
amount_tsh               float64
funder                     int64
gps_height               float64
installer                  int64
longitude                float64
latitude                 float64
wpt_name                   int64
num_private                int64
basin                      int64
subvillage                 int64
region                     int64
region_code                int64
district_code              int64
lga                        int64
ward                       int64
population                 int64
scheme_management          int64
permit                    object
extraction_type            int64
extraction_type_class      int64
management                 int64
payment_type               int64
water_quality              int64
quantity                   int64
source_class               int64
waterpoint_type            int64
age_of_pump_mnths        float64
dtype: object

In [43]:
data.dtypes

id                         int64
amount_tsh               float64
funder                     int64
gps_height               float64
installer                  int64
longitude                float64
latitude                 float64
wpt_name                   int64
num_private                int64
basin                      int64
subvillage                 int64
region                     int64
region_code                int64
district_code              int64
lga                        int64
ward                       int64
population                 int64
scheme_management          int64
permit                    object
extraction_type            int64
extraction_type_class      int64
management                 int64
payment_type               int64
water_quality              int64
quantity                   int64
source_class               int64
waterpoint_type            int64
age_of_pump_mnths        float64
dtype: object

In [44]:
# Handle the last obj
data['permit'].fillna(False,inplace = True)
data['permit'] = data.permit.map({True:0, False:1})

In [45]:
len(data.columns)

28

Creating a new DF so that all prev changes to data remains intact.

In [46]:
cleaned_data = pd.DataFrame(data)

In [47]:
cleaned_data.head()

Unnamed: 0,id,amount_tsh,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,basin,...,permit,extraction_type,extraction_type_class,management,payment_type,water_quality,quantity,source_class,waterpoint_type,age_of_pump_mnths
0,50785,0.0,176,1996.0,208,35.290799,-4.059696,656,0,0,...,0,9,3,3,2,6,3,1,6,43.094656
1,51630,0.0,251,1569.0,220,36.656709,-3.309214,1780,0,5,...,0,3,0,7,2,6,2,0,1,43.094656
2,17168,0.0,0,1567.0,0,34.767863,-5.004344,9674,0,0,...,1,9,3,7,2,6,2,1,6,43.086443
3,45559,0.0,223,267.0,243,38.058046,-9.418672,5595,0,7,...,0,9,3,7,6,6,0,0,6,43.059063
4,49871,500.0,74,1260.0,84,35.006123,-10.950412,5706,0,7,...,0,3,0,9,1,6,1,0,1,43.23429


In [48]:
cleaned_data.drop('id',inplace=True,axis=1)

In [49]:
features = cleaned_data

In [50]:
features.head()

Unnamed: 0,amount_tsh,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,basin,subvillage,...,permit,extraction_type,extraction_type_class,management,payment_type,water_quality,quantity,source_class,waterpoint_type,age_of_pump_mnths
0,0.0,176,1996.0,208,35.290799,-4.059696,656,0,0,3917,...,0,9,3,3,2,6,3,1,6,43.094656
1,0.0,251,1569.0,220,36.656709,-3.309214,1780,0,5,2718,...,0,3,0,7,2,6,2,0,1,43.094656
2,0.0,0,1567.0,0,34.767863,-5.004344,9674,0,0,5399,...,1,9,3,7,2,6,2,1,6,43.086443
3,0.0,223,267.0,243,38.058046,-9.418672,5595,0,7,2796,...,0,9,3,7,6,6,0,0,6,43.059063
4,500.0,74,1260.0,84,35.006123,-10.950412,5706,0,7,3545,...,0,3,0,9,1,6,1,0,1,43.23429


In [51]:
sc = StandardScaler()
sc.fit(features)
std_features = sc.fit_transform(features)
std_features = pd.DataFrame(std_features,columns = features.columns)
std_features.head()

Unnamed: 0,amount_tsh,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,basin,subvillage,...,permit,extraction_type,extraction_type_class,management,payment_type,water_quality,quantity,source_class,waterpoint_type,age_of_pump_mnths
0,-0.128571,-0.788992,1.990258,-0.50988,0.057368,0.552598,-1.603863,-0.050821,-1.673368,-0.111603,...,-0.722809,0.701073,1.031309,-2.048393,-0.451705,0.274411,2.093099,1.779726,1.911787,0.934685
1,-0.128571,-0.5409,1.128935,-0.465209,0.587954,0.807803,-1.266739,-0.050821,0.354568,-0.603111,...,-0.722809,-0.828447,-0.796322,-0.112257,-0.451705,0.274411,0.842847,-0.533933,-0.886033,0.934685
2,-0.128571,-1.371182,1.124901,-1.284194,-0.145766,0.231366,1.100929,-0.050821,-1.673368,0.495915,...,1.383492,0.701073,1.031309,-0.112257,-0.451705,0.274411,0.842847,1.779726,1.911787,0.925825
3,-0.128571,-0.633521,-1.497392,-0.379587,1.132301,-1.269746,-0.122496,-0.050821,1.165742,-0.571136,...,-0.722809,0.701073,1.031309,-0.112257,1.728636,0.274411,-1.657657,-0.533933,1.911787,0.896291
4,0.070562,-1.126398,0.505636,-0.971491,-0.053214,-1.790621,-0.089203,-0.050821,1.165742,-0.264098,...,-0.722809,-0.828447,-0.796322,0.855811,-0.99679,0.274411,-0.407405,-0.533933,-0.886033,1.085308


In [52]:
std_features.to_csv('test_cleaned_data',index=False)

In [53]:
X = std_features

# RandomForestClassifier

In [54]:
rcp_in = open('RFC.pickle','rb')
rfc = pickle.load(rcp_in)

In [55]:
rfc_pred = rfc.predict(X)

In [56]:
rfc_pred

array([2, 0, 0, ..., 0, 0, 2])

In [57]:
rfc_submission = pd.DataFrame()

In [58]:
rfc_submission['id'] = data['id']

In [59]:
rfc_submission['status_group'] = rfc_pred

In [60]:
rfc_submission['status_group'] = rfc_submission.status_group.map({0:'functional', 1: 'functional needs repair', 2:'non functional'})

In [61]:
rfc_submission.head()

Unnamed: 0,id,status_group
0,50785,non functional
1,51630,functional
2,17168,functional
3,45559,non functional
4,49871,functional


In [62]:
rfc_submission.to_csv('rfc_submission',index=False)