### Imports

In [1]:
import pandas as pd
import numpy as np
import math
pd.set_option('display.max_columns', None)

### Loading and viewing the data

In [2]:
train_raw = pd.read_csv('./Data/training_set_values.csv')
test_raw = pd.read_csv('./Data/test_set_values.csv')

train_raw['train'] = 1
test_raw['train'] = 0

data = pd.concat([train_raw, test_raw])


In [3]:
# Overview

data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 74250 entries, 0 to 14849
Data columns (total 41 columns):
id                       74250 non-null int64
amount_tsh               74250 non-null float64
date_recorded            74250 non-null object
funder                   69746 non-null object
gps_height               74250 non-null int64
installer                69718 non-null object
longitude                74250 non-null float64
latitude                 74250 non-null float64
wpt_name                 74250 non-null object
num_private              74250 non-null int64
basin                    74250 non-null object
subvillage               73780 non-null object
region                   74250 non-null object
region_code              74250 non-null int64
district_code            74250 non-null int64
lga                      74250 non-null object
ward                     74250 non-null object
population               74250 non-null int64
public_meeting           70095 non-null object
r

### Global variables

In [37]:
data_version = '0.5'

# Selected features based on FeatureSelection (on base model with all features and standard parameters)
feature_selection = 'id train amount_tsh funder installer longitude latitude basin region region_code district_code lga ward population public_meeting scheme_management scheme_name permit construction_year extraction_type management payment_type water_quality quantity source waterpoint_type longitude_imp_normal longitude_imp_random_choice latitude_imp_normal latitude_imp_random_choice operation_years extraction_type_group extraction_type_class quality_group source_class source_type management_group'.split()

# Drop the following features for intuitive feature selection
intuitive_features_drop = ['extraction_type_group','extraction_type_class','payment','quality_group','source_class','source_type','waterpoint_type_group','management_group','quantity_group','date_recorded','wpt_name']

int_var = ['population','gps_height', 'construction_year']
float_var = ['amount_tsh','longitude']

features_to_drop = ['num_private','recorded_by']

null_features = ['longitude','latitude','gps_height','population','construction_year','amount_tsh']
#no calculations for num_private since they are dropped later (too many missing values)

divisions = ['region', 'ward']
divisions_total = ['ward', 'region', 'overall']

# These will be scaled
num_features = ['latitude','longitude','operation_years','amount_tsh', 'gps_height', 'population']

# These will be factorized
cat_features = 'id date_recorded funder installer wpt_name basin subvillage region lga ward public_meeting scheme_management scheme_name permit extraction_type extraction_type_group extraction_type_class management management_group payment payment_type water_quality quality_group quantity quantity_group source source_type source_class waterpoint_type waterpoint_type_group'.split()

In [None]:
data.head()

### Data preparation

In [None]:
# Identify missing values in numerical data

for var in int_var:
    print('{}:'.format(var))
    display(data[var].min())
    display(len(data[data[var] == 0]))

for var in float_var:
    print('{}:'.format(var))
    display(data[var].min())
    display(len(data[data[var] == 0.0]))

print('latitude:')
display(data['latitude'].min())
display(len(data[(data['latitude'] > -0.001) & (data['latitude'] < 0.001)]))

In [None]:
# Replace zeros by NaN

for var in int_var:
    data[var].replace(0, np.nan, inplace=True)
    
for var in float_var:
    data[var].replace(0.0, np.nan, inplace=True)

data['latitude'].where((data['latitude'] < -0.001) | (data['latitude'] > 0.001), other= np.nan, inplace=True,axis=0)

In [None]:
# Logarithmic scaling of amount_tsh and population

data['amount_tsh']=data.apply(lambda row: np.log1p(row['amount_tsh']),axis=1)
data['population']=data.apply(lambda row: np.log1p(row['population']),axis=1)

In [None]:
# Split train and test data

train = data[data['train'] == 1]
test = data[data['train'] == 0]

### Imputation of missing values in numerical features

In [None]:
# Duplicate critical columnns for imputation based on normal distribution and random choice

for null_feature in null_features:
    #data['_'.join([null_feature, 'imp_mean-median'])] = data[null_feature]
    data['_'.join([null_feature, 'imp_normal'])] = data[null_feature]
    data['_'.join([null_feature, 'imp_random_choice'])] = data[null_feature]

In [None]:
data.head()

#### Imputation of numerical features by normal distribution

In [None]:
# Add columns for mean and standard deviation of critical features based on 'region', 'ward' and 'overall'

for null_feature in null_features:
    data['_'.join([null_feature, 'mean', 'overall'])] = train[null_feature].mean()
    data['_'.join([null_feature, 'std', 'overall'])] = train[null_feature].std()
    for division in divisions:
        new_feature_name_mean = '_'.join([null_feature, 'mean', division])
        new_feature_name_std = '_'.join([null_feature, 'std', division])
        
        calcs_mean = train.groupby(division)[null_feature].mean()
        calcs_std = train.groupby(division)[null_feature].std()
        for value in train[division].unique() :
            data.loc[data[division]==value, new_feature_name_mean] = calcs_mean[value]
            data.loc[data[division]==value, new_feature_name_std] = calcs_std[value]

In [None]:
# 1st step: Impute missing values with random numbers generated by normal distribution based on mean, std by 'ward'
# 2nd step (only applied on remaining null values): Impute missing values with random numbers generated by normal distribution based on mean, std by 'region'
# 3rd step (only applied on remaining null values): Impute missing values with random numbers generated by normal distribution based on mean, std by 'overall'

for null_feature in null_features:
    for division in divisions_total:
        data['_'.join([null_feature,'imp_normal'])] = data.apply(lambda row: np.random.normal(loc=row['_'.join([null_feature,'mean',division])], scale=row['_'.join([null_feature,'std',division])]) if math.isnan(row['_'.join([null_feature,'imp_normal'])]) else row['_'.join([null_feature,'imp_normal'])], axis=1)
        display('Missing values after imputation in {} by {}: {}'.format(null_feature, division, data['_'.join([null_feature,'imp_normal'])].isnull().sum()))

In [None]:
data.head()

#### Imputation of numerical features by random choice

In [None]:
# Add columns with list of values in corresponding group of 'region' and 'ward', respectively

for null_feature in null_features:
    overall_list = list(train[null_feature])
    overall_list = [x for x in overall_list if not math.isnan(x)]
    data['_'.join([null_feature, 'list', 'overall'])] = data.apply(lambda row: overall_list, axis=1)
    display(null_feature, 'overall list done')
    for division in divisions:
        feature_name = '_'.join([null_feature, 'list', division])
        lists = train.groupby(division)[null_feature].apply(list)
        data[feature_name] = data.apply(lambda row: list() if row[division] not in train[division].unique() else lists[row[division]], axis=1)
        data[feature_name] = data[feature_name].apply(lambda lst: [x for x in lst if not math.isnan(x)])
        data[feature_name] = data[feature_name].apply(lambda x: np.nan if not x else x)
        display('List for {} by {} created'.format(null_feature, division))
        

In [None]:
# 1st step: Impute missing values with empirical distribution grouped by 'ward'
# 2nd step (only applied on remaining null values): Impute missing values with empirical distribution grouped by 'region'
# 3rd step (only applied on remaining null values): Impute missing values with empirical distribution grouped by 'overall'

for null_feature in null_features:
    for division in divisions_total:        
        #data['_'.join([null_feature,'imp_random_choice'])] = data.apply(lambda row: np.random.choice(a=row['_'.join([null_feature,'list',division])]) if math.isnan(row['_'.join([null_feature,'imp_random_choice'])]) else row['_'.join([null_feature,'imp_random_choice'])], axis=1)
        data['_'.join([null_feature,'imp_random_choice'])] = data.apply(lambda row: row['_'.join([null_feature,'imp_random_choice'])] if not np.isnan(row['_'.join([null_feature,'imp_random_choice'])]).any() else (np.random.choice(a=row['_'.join([null_feature,'list',division])]) if not np.isnan(row['_'.join([null_feature,'list',division])]).any() else np.nan), axis=1)
        display('Missing values after imputation in {} by {}: {}'.format(null_feature, division, data['_'.join([null_feature,'imp_random_choice'])].isnull().sum()))
        

#### Imputation of numerical features by mean/median

In [None]:
# Add columns for median of critical integer features based on 'region', 'ward', 'overall'

float_var.append('latitude')

for var in int_var:
    data['_'.join([var, 'median', 'overall'])] = train[var].median()
    for division in divisions:
        new_feature_name_median = '_'.join([var, 'median', division])
        calcs_median = train.groupby(division)[var].median()
        for value in train[division].unique() :
            data.loc[data[division]==value, new_feature_name_median] = calcs_median[value]

In [None]:
# 1st step: Impute missing values with mean and median by 'ward'
# 2nd step (only applied on remaining null values): Impute missing values with mean and median by 'region'
# 3rd step (only applied on remaining null values): Impute missing values with overall mean and median

for var in float_var:
    for division in divisions_total:
        data[var] = data.apply(lambda row: row['_'.join([var,'mean',division])] if math.isnan(row[var]) else row[var], axis=1)
        display('Missing values after imputation in {} by {}: {}'.format(var, division, data[var].isnull().sum()))

for var in int_var:
    for division in divisions_total:
        data[var] = data.apply(lambda row: row['_'.join([var,'median',division])] if math.isnan(row[var]) else row[var], axis=1)
        display('Missing values after imputation in {} by {}: {}'.format(var, division, data[var].isnull().sum()))
        

In [None]:
data.isnull().sum()

In [None]:
data.to_csv('./Data/data_imputed_num_features.csv', index=False)

#### Load imputed data from previous data files and add mean/median imputation as additional columns
Can be used if only categorical features will be changed

In [60]:
data = pd.read_csv('./Data/data_imputed_num_features_balanced.csv')
display(data.shape)
display(data.head())

(111627, 53)

Unnamed: 0,id,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,basin,subvillage,region,region_code,district_code,lga,ward,population,public_meeting,recorded_by,scheme_management,scheme_name,permit,construction_year,extraction_type,extraction_type_group,extraction_type_class,management,management_group,payment,payment_type,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group,train,longitude_imp_normal,longitude_imp_random_choice,latitude_imp_normal,latitude_imp_random_choice,gps_height_imp_normal,gps_height_imp_random_choice,population_imp_normal,population_imp_random_choice,construction_year_imp_normal,construction_year_imp_random_choice,amount_tsh_imp_normal,amount_tsh_imp_random_choice
0,69572,8.699681,2011-03-14,Roman,1390.0,Roman,34.938093,-9.856322,none,0,Lake Nyasa,Mnyusi B,Iringa,11,5,Ludewa,Mundindi,4.70048,True,GeoData Consultants Ltd,VWC,Roman,False,1999.0,gravity,gravity,gravity,vwc,user-group,pay annually,annually,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe,1,34.938093,34.938093,-9.856322,-9.856322,1390.0,1390.0,4.70048,4.70048,1999.0,1999.0,8.699681,8.699681
1,8776,5.82689,2013-03-06,Grumeti,1399.0,GRUMETI,34.698766,-2.147466,Zahanati,0,Lake Victoria,Nyamara,Mara,20,2,Serengeti,Natta,5.638355,,GeoData Consultants Ltd,Other,,True,2010.0,gravity,gravity,gravity,wug,user-group,never pay,never pay,soft,good,insufficient,insufficient,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe,1,34.698766,34.698766,-2.147466,-2.147466,1399.0,1399.0,5.638355,5.638355,2010.0,2010.0,6.52078,6.908755
2,34310,3.258097,2013-02-25,Lottery Club,686.0,World vision,37.460664,-3.821329,Kwa Mahundi,0,Pangani,Majengo,Manyara,21,4,Simanjiro,Ngorika,5.525453,True,GeoData Consultants Ltd,VWC,Nyumba ya mungu pipe scheme,True,2009.0,gravity,gravity,gravity,vwc,user-group,pay per bucket,per bucket,soft,good,enough,enough,dam,dam,surface,communal standpipe multiple,communal standpipe,1,37.460664,37.460664,-3.821329,-3.821329,686.0,686.0,5.525453,5.525453,2009.0,2009.0,3.258097,3.258097
3,67743,5.42029,2013-01-28,Unicef,263.0,UNICEF,38.486161,-11.155298,Zahanati Ya Nanyumbu,0,Ruvuma / Southern Coast,Mahakamani,Mtwara,90,63,Nanyumbu,Nanyumbu,4.077537,True,GeoData Consultants Ltd,VWC,,True,1986.0,submersible,submersible,submersible,vwc,user-group,never pay,never pay,soft,good,dry,dry,machine dbh,borehole,groundwater,communal standpipe multiple,communal standpipe,1,38.486161,38.486161,-11.155298,-11.155298,263.0,263.0,4.077537,4.077537,1986.0,1986.0,6.906485,3.931826
4,19728,5.30167,2011-07-13,Action In A,1167.0,Artisan,31.130847,-1.825359,Shuleni,0,Lake Victoria,Kyanyamisa,Kagera,18,1,Karagwe,Nyakasimbi,5.01728,True,GeoData Consultants Ltd,,,True,2000.0,gravity,gravity,gravity,other,other,never pay,never pay,soft,good,seasonal,seasonal,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe,1,31.130847,31.130847,-1.825359,-1.825359,1240.292552,455.0,2.575311,0.693147,2016.001834,2006.0,2.826742,2.397895


In [61]:
train = data[data["train"] == 1]
test = data[data["train"] == 0]
display(train.shape)
display(test.shape)

(96777, 53)

(14850, 53)

### Load preprocessed numerical data

### Feature generation

In [41]:
# Create new feature that gives information about operational time

imputation_methods = ['normal', 'random_choice']
data['date_recorded'] = pd.to_datetime(data['date_recorded'])

data['operation_years'] = data['date_recorded'].dt.year - data['construction_year']

for method in imputation_methods:
    data['_'.join(['operation_years_imp', method])] = data['date_recorded'].dt.year - data['_'.join(['construction_year_imp', method])]
    data['_'.join(['operation_years_imp', method])] = data['_'.join(['operation_years_imp', method])].astype(int)
    

In [42]:
data.head()

Unnamed: 0,id,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,basin,subvillage,region,region_code,district_code,lga,ward,population,public_meeting,recorded_by,scheme_management,scheme_name,permit,construction_year,extraction_type,extraction_type_group,extraction_type_class,management,management_group,payment,payment_type,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group,train,longitude_imp_normal,longitude_imp_random_choice,latitude_imp_normal,latitude_imp_random_choice,gps_height_imp_normal,gps_height_imp_random_choice,population_imp_normal,population_imp_random_choice,construction_year_imp_normal,construction_year_imp_random_choice,amount_tsh_imp_normal,amount_tsh_imp_random_choice,operation_years,operation_years_imp_normal,operation_years_imp_random_choice
0,69572,8.699681,2011-03-14,Roman,1390.0,Roman,34.938093,-9.856322,none,0,Lake Nyasa,Mnyusi B,Iringa,11,5,Ludewa,Mundindi,4.70048,True,GeoData Consultants Ltd,VWC,Roman,False,1999.0,gravity,gravity,gravity,vwc,user-group,pay annually,annually,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe,1,34.938093,34.938093,-9.856322,-9.856322,1390.0,1390.0,4.70048,4.70048,1999.0,1999.0,8.699681,8.699681,12.0,12,12
1,8776,5.82689,2013-03-06,Grumeti,1399.0,GRUMETI,34.698766,-2.147466,Zahanati,0,Lake Victoria,Nyamara,Mara,20,2,Serengeti,Natta,5.638355,,GeoData Consultants Ltd,Other,,True,2010.0,gravity,gravity,gravity,wug,user-group,never pay,never pay,soft,good,insufficient,insufficient,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe,1,34.698766,34.698766,-2.147466,-2.147466,1399.0,1399.0,5.638355,5.638355,2010.0,2010.0,6.52078,6.908755,3.0,3,3
2,34310,3.258097,2013-02-25,Lottery Club,686.0,World vision,37.460664,-3.821329,Kwa Mahundi,0,Pangani,Majengo,Manyara,21,4,Simanjiro,Others,5.525453,True,GeoData Consultants Ltd,VWC,Nyumba ya mungu pipe scheme,True,2009.0,gravity,gravity,gravity,vwc,user-group,pay per bucket,per bucket,soft,good,enough,enough,dam,dam,surface,communal standpipe multiple,communal standpipe,1,37.460664,37.460664,-3.821329,-3.821329,686.0,686.0,5.525453,5.525453,2009.0,2009.0,3.258097,3.258097,4.0,4,4
3,67743,5.42029,2013-01-28,Unicef,263.0,UNICEF,38.486161,-11.155298,Others,0,Ruvuma / Southern Coast,Mahakamani,Mtwara,90,63,Nanyumbu,Nanyumbu,4.077537,True,GeoData Consultants Ltd,VWC,,True,1986.0,submersible,submersible,submersible,vwc,user-group,never pay,never pay,soft,good,dry,dry,machine dbh,borehole,groundwater,communal standpipe multiple,communal standpipe,1,38.486161,38.486161,-11.155298,-11.155298,263.0,263.0,4.077537,4.077537,1986.0,1986.0,6.906485,3.931826,27.0,27,27
4,19728,5.30167,2011-07-13,Others,1167.0,Artisan,31.130847,-1.825359,Shuleni,0,Lake Victoria,Kyanyamisa,Kagera,18,1,Karagwe,Nyakasimbi,5.01728,True,GeoData Consultants Ltd,,,True,2000.0,gravity,gravity,gravity,other,other,never pay,never pay,soft,good,seasonal,seasonal,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe,1,31.130847,31.130847,-1.825359,-1.825359,1240.292552,455.0,2.575311,0.693147,2016.001834,2006.0,2.826742,2.397895,11.0,-5,5


### Scaling of numerical features

In [None]:
# Scale numerical features
'''
scaler = MinMaxScaler()

for s in split:
    s[num_features] = scaler.fit_transform(s[num_features])
'''

### Drop irrelevant features

In [None]:
# Drop columns used for imputation and generation of random numbers

drop_columns = list()
measures = 'mean std list'.split()
for null_feature in null_features:
    for division in divisions_total:
        for measure in measures:
            drop_columns.append('_'.join([null_feature, measure, division]))
            
for var in int_var:
    for division in divisions_total:
        for measure in measures:
            drop_columns.append('_'.join([var, 'median', division]))
            
data.drop(columns=drop_columns, inplace=True)

In [43]:
# Drop redundant features and features that do not seem to have an impact

data.drop(columns=features_to_drop,axis=1,inplace=True)
data.head()

Unnamed: 0,id,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,basin,subvillage,region,region_code,district_code,lga,ward,population,public_meeting,scheme_management,scheme_name,permit,construction_year,extraction_type,extraction_type_group,extraction_type_class,management,management_group,payment,payment_type,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group,train,longitude_imp_normal,longitude_imp_random_choice,latitude_imp_normal,latitude_imp_random_choice,gps_height_imp_normal,gps_height_imp_random_choice,population_imp_normal,population_imp_random_choice,construction_year_imp_normal,construction_year_imp_random_choice,amount_tsh_imp_normal,amount_tsh_imp_random_choice,operation_years,operation_years_imp_normal,operation_years_imp_random_choice
0,69572,8.699681,2011-03-14,Roman,1390.0,Roman,34.938093,-9.856322,none,Lake Nyasa,Mnyusi B,Iringa,11,5,Ludewa,Mundindi,4.70048,True,VWC,Roman,False,1999.0,gravity,gravity,gravity,vwc,user-group,pay annually,annually,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe,1,34.938093,34.938093,-9.856322,-9.856322,1390.0,1390.0,4.70048,4.70048,1999.0,1999.0,8.699681,8.699681,12.0,12,12
1,8776,5.82689,2013-03-06,Grumeti,1399.0,GRUMETI,34.698766,-2.147466,Zahanati,Lake Victoria,Nyamara,Mara,20,2,Serengeti,Natta,5.638355,,Other,,True,2010.0,gravity,gravity,gravity,wug,user-group,never pay,never pay,soft,good,insufficient,insufficient,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe,1,34.698766,34.698766,-2.147466,-2.147466,1399.0,1399.0,5.638355,5.638355,2010.0,2010.0,6.52078,6.908755,3.0,3,3
2,34310,3.258097,2013-02-25,Lottery Club,686.0,World vision,37.460664,-3.821329,Kwa Mahundi,Pangani,Majengo,Manyara,21,4,Simanjiro,Others,5.525453,True,VWC,Nyumba ya mungu pipe scheme,True,2009.0,gravity,gravity,gravity,vwc,user-group,pay per bucket,per bucket,soft,good,enough,enough,dam,dam,surface,communal standpipe multiple,communal standpipe,1,37.460664,37.460664,-3.821329,-3.821329,686.0,686.0,5.525453,5.525453,2009.0,2009.0,3.258097,3.258097,4.0,4,4
3,67743,5.42029,2013-01-28,Unicef,263.0,UNICEF,38.486161,-11.155298,Others,Ruvuma / Southern Coast,Mahakamani,Mtwara,90,63,Nanyumbu,Nanyumbu,4.077537,True,VWC,,True,1986.0,submersible,submersible,submersible,vwc,user-group,never pay,never pay,soft,good,dry,dry,machine dbh,borehole,groundwater,communal standpipe multiple,communal standpipe,1,38.486161,38.486161,-11.155298,-11.155298,263.0,263.0,4.077537,4.077537,1986.0,1986.0,6.906485,3.931826,27.0,27,27
4,19728,5.30167,2011-07-13,Others,1167.0,Artisan,31.130847,-1.825359,Shuleni,Lake Victoria,Kyanyamisa,Kagera,18,1,Karagwe,Nyakasimbi,5.01728,True,,,True,2000.0,gravity,gravity,gravity,other,other,never pay,never pay,soft,good,seasonal,seasonal,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe,1,31.130847,31.130847,-1.825359,-1.825359,1240.292552,455.0,2.575311,0.693147,2016.001834,2006.0,2.826742,2.397895,11.0,-5,5


In [44]:
data.shape

(74250, 54)

### Preparation of categorical features

No grouping and factorizing of categorical features in data version 0.1; missing values replaced by 'Missing' only

#### Replace unique values that only appear in the test data set 
Either by 'Others' or by the most frequent value in the corresponding column of the train data set

In [45]:
data.head()

Unnamed: 0,id,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,basin,subvillage,region,region_code,district_code,lga,ward,population,public_meeting,scheme_management,scheme_name,permit,construction_year,extraction_type,extraction_type_group,extraction_type_class,management,management_group,payment,payment_type,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group,train,longitude_imp_normal,longitude_imp_random_choice,latitude_imp_normal,latitude_imp_random_choice,gps_height_imp_normal,gps_height_imp_random_choice,population_imp_normal,population_imp_random_choice,construction_year_imp_normal,construction_year_imp_random_choice,amount_tsh_imp_normal,amount_tsh_imp_random_choice,operation_years,operation_years_imp_normal,operation_years_imp_random_choice
0,69572,8.699681,2011-03-14,Roman,1390.0,Roman,34.938093,-9.856322,none,Lake Nyasa,Mnyusi B,Iringa,11,5,Ludewa,Mundindi,4.70048,True,VWC,Roman,False,1999.0,gravity,gravity,gravity,vwc,user-group,pay annually,annually,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe,1,34.938093,34.938093,-9.856322,-9.856322,1390.0,1390.0,4.70048,4.70048,1999.0,1999.0,8.699681,8.699681,12.0,12,12
1,8776,5.82689,2013-03-06,Grumeti,1399.0,GRUMETI,34.698766,-2.147466,Zahanati,Lake Victoria,Nyamara,Mara,20,2,Serengeti,Natta,5.638355,,Other,,True,2010.0,gravity,gravity,gravity,wug,user-group,never pay,never pay,soft,good,insufficient,insufficient,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe,1,34.698766,34.698766,-2.147466,-2.147466,1399.0,1399.0,5.638355,5.638355,2010.0,2010.0,6.52078,6.908755,3.0,3,3
2,34310,3.258097,2013-02-25,Lottery Club,686.0,World vision,37.460664,-3.821329,Kwa Mahundi,Pangani,Majengo,Manyara,21,4,Simanjiro,Others,5.525453,True,VWC,Nyumba ya mungu pipe scheme,True,2009.0,gravity,gravity,gravity,vwc,user-group,pay per bucket,per bucket,soft,good,enough,enough,dam,dam,surface,communal standpipe multiple,communal standpipe,1,37.460664,37.460664,-3.821329,-3.821329,686.0,686.0,5.525453,5.525453,2009.0,2009.0,3.258097,3.258097,4.0,4,4
3,67743,5.42029,2013-01-28,Unicef,263.0,UNICEF,38.486161,-11.155298,Others,Ruvuma / Southern Coast,Mahakamani,Mtwara,90,63,Nanyumbu,Nanyumbu,4.077537,True,VWC,,True,1986.0,submersible,submersible,submersible,vwc,user-group,never pay,never pay,soft,good,dry,dry,machine dbh,borehole,groundwater,communal standpipe multiple,communal standpipe,1,38.486161,38.486161,-11.155298,-11.155298,263.0,263.0,4.077537,4.077537,1986.0,1986.0,6.906485,3.931826,27.0,27,27
4,19728,5.30167,2011-07-13,Others,1167.0,Artisan,31.130847,-1.825359,Shuleni,Lake Victoria,Kyanyamisa,Kagera,18,1,Karagwe,Nyakasimbi,5.01728,True,,,True,2000.0,gravity,gravity,gravity,other,other,never pay,never pay,soft,good,seasonal,seasonal,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe,1,31.130847,31.130847,-1.825359,-1.825359,1240.292552,455.0,2.575311,0.693147,2016.001834,2006.0,2.826742,2.397895,11.0,-5,5


#### Choose features for feature selection - if necessary

In [None]:
#data = data[feature_selection]
data.drop(columns=intuitive_features_drop, axis=1, inplace=True)
data.head()

In [None]:
data.shape

In [46]:
# Set cat_features only to features that appear in selected features
#selected_cat_features = [value for value in cat_features if value in feature_selection]
# Drop cat features
#selected_cat_features = [value for value in cat_features if value not in intuitive_features_drop]
selected_cat_features = cat_features
selected_cat_features

['id',
 'date_recorded',
 'funder',
 'installer',
 'wpt_name',
 'basin',
 'subvillage',
 'region',
 'lga',
 'ward',
 'public_meeting',
 'scheme_management',
 'scheme_name',
 'permit',
 'extraction_type',
 'extraction_type_group',
 'extraction_type_class',
 'management',
 'management_group',
 'payment',
 'payment_type',
 'water_quality',
 'quality_group',
 'quantity',
 'quantity_group',
 'source',
 'source_type',
 'source_class',
 'waterpoint_type',
 'waterpoint_type_group']

In [47]:
for cat in selected_cat_features:
    display('{}, {}: {}'.format(cat, 'train', train[cat].nunique()))
    display('{}, {}: {}'.format(cat, 'test', test[cat].nunique()))
    display('{}, {}: {}'.format(cat, 'data', data[cat].nunique()))

'id, train: 59400'

'id, test: 14850'

'id, data: 74250'

'date_recorded, train: 356'

'date_recorded, test: 331'

'date_recorded, data: 369'

'funder, train: 923'

'funder, test: 980'

'funder, data: 1244'

'installer, train: 1048'

'installer, test: 1091'

'installer, data: 1419'

'wpt_name, train: 4473'

'wpt_name, test: 10840'

'wpt_name, data: 13933'

'basin, train: 9'

'basin, test: 9'

'basin, data: 9'

'subvillage, train: 9864'

'subvillage, test: 8443'

'subvillage, data: 13597'

'region, train: 21'

'region, test: 21'

'region, data: 21'

'lga, train: 125'

'lga, test: 125'

'lga, data: 125'

'ward, train: 1466'

'ward, test: 1959'

'ward, data: 1978'

'public_meeting, train: 2'

'public_meeting, test: 2'

'public_meeting, data: 2'

'scheme_management, train: 12'

'scheme_management, test: 11'

'scheme_management, data: 12'

'scheme_name, train: 1889'

'scheme_name, test: 1789'

'scheme_name, data: 2214'

'permit, train: 2'

'permit, test: 2'

'permit, data: 2'

'extraction_type, train: 18'

'extraction_type, test: 17'

'extraction_type, data: 18'

'extraction_type_group, train: 13'

'extraction_type_group, test: 13'

'extraction_type_group, data: 13'

'extraction_type_class, train: 7'

'extraction_type_class, test: 7'

'extraction_type_class, data: 7'

'management, train: 12'

'management, test: 12'

'management, data: 12'

'management_group, train: 5'

'management_group, test: 5'

'management_group, data: 5'

'payment, train: 7'

'payment, test: 7'

'payment, data: 7'

'payment_type, train: 7'

'payment_type, test: 7'

'payment_type, data: 7'

'water_quality, train: 8'

'water_quality, test: 8'

'water_quality, data: 8'

'quality_group, train: 6'

'quality_group, test: 6'

'quality_group, data: 6'

'quantity, train: 5'

'quantity, test: 5'

'quantity, data: 5'

'quantity_group, train: 5'

'quantity_group, test: 5'

'quantity_group, data: 5'

'source, train: 10'

'source, test: 10'

'source, data: 10'

'source_type, train: 7'

'source_type, test: 7'

'source_type, data: 7'

'source_class, train: 3'

'source_class, test: 3'

'source_class, data: 3'

'waterpoint_type, train: 7'

'waterpoint_type, test: 7'

'waterpoint_type, data: 7'

'waterpoint_type_group, train: 6'

'waterpoint_type_group, test: 6'

'waterpoint_type_group, data: 6'

In [48]:
# Get list of features that contain values in the test which don't appear in the training data

unique_test_features = list()
for cat in selected_cat_features:
    if train[cat].nunique() < data[cat].nunique():
        unique_test_features.append(cat)
if 'id' in unique_test_features:
    unique_test_features.remove('id')
    selected_cat_features.remove('id')
if 'date_recorded' in unique_test_features:
    unique_test_features.remove('date_recorded')
    selected_cat_features.remove('date_recorded')
unique_test_features

['funder', 'installer', 'wpt_name', 'subvillage', 'ward', 'scheme_name']

In [49]:
# Set unique values that appear in the test dataset only to 'Others' if 'Others' appears in the training dataset, else set it to most frequent value in the corresponding column

for feature in unique_test_features:
    train_values = train[feature].unique().tolist()
    test_values = test[feature].unique().tolist()
    merged_values = pd.DataFrame(train_values).merge(pd.DataFrame(test_values), how='right', indicator=True)
    unique_test_values = list(merged_values[0].loc[merged_values['_merge'] == 'right_only'])
    replace_value = 'Others' if 'Others' in train_values else train[feature].mode()
    for value in unique_test_values:
        data.loc[data[feature] == value, feature] = replace_value  
        
        

In [50]:
for cat in selected_cat_features:
    display('{}, {}: {}'.format(cat, 'NaNs', data[cat].isnull().sum()))
    display('{}, {}: {}'.format(cat, 'train', train[cat].nunique()))
    display('{}, {}: {}'.format(cat, 'test', test[cat].nunique()))
    display('{}, {}: {}'.format(cat, 'overall', data[cat].nunique()))

'funder, NaNs: 4504'

'funder, train: 923'

'funder, test: 980'

'funder, overall: 923'

'installer, NaNs: 4532'

'installer, train: 1048'

'installer, test: 1091'

'installer, overall: 1048'

'wpt_name, NaNs: 0'

'wpt_name, train: 4473'

'wpt_name, test: 10840'

'wpt_name, overall: 4473'

'basin, NaNs: 0'

'basin, train: 9'

'basin, test: 9'

'basin, overall: 9'

'subvillage, NaNs: 470'

'subvillage, train: 9864'

'subvillage, test: 8443'

'subvillage, overall: 9864'

'region, NaNs: 0'

'region, train: 21'

'region, test: 21'

'region, overall: 21'

'lga, NaNs: 0'

'lga, train: 125'

'lga, test: 125'

'lga, overall: 125'

'ward, NaNs: 0'

'ward, train: 1466'

'ward, test: 1959'

'ward, overall: 1466'

'public_meeting, NaNs: 4155'

'public_meeting, train: 2'

'public_meeting, test: 2'

'public_meeting, overall: 2'

'scheme_management, NaNs: 4846'

'scheme_management, train: 12'

'scheme_management, test: 11'

'scheme_management, overall: 12'

'scheme_name, NaNs: 35258'

'scheme_name, train: 1889'

'scheme_name, test: 1789'

'scheme_name, overall: 1889'

'permit, NaNs: 3793'

'permit, train: 2'

'permit, test: 2'

'permit, overall: 2'

'extraction_type, NaNs: 0'

'extraction_type, train: 18'

'extraction_type, test: 17'

'extraction_type, overall: 18'

'extraction_type_group, NaNs: 0'

'extraction_type_group, train: 13'

'extraction_type_group, test: 13'

'extraction_type_group, overall: 13'

'extraction_type_class, NaNs: 0'

'extraction_type_class, train: 7'

'extraction_type_class, test: 7'

'extraction_type_class, overall: 7'

'management, NaNs: 0'

'management, train: 12'

'management, test: 12'

'management, overall: 12'

'management_group, NaNs: 0'

'management_group, train: 5'

'management_group, test: 5'

'management_group, overall: 5'

'payment, NaNs: 0'

'payment, train: 7'

'payment, test: 7'

'payment, overall: 7'

'payment_type, NaNs: 0'

'payment_type, train: 7'

'payment_type, test: 7'

'payment_type, overall: 7'

'water_quality, NaNs: 0'

'water_quality, train: 8'

'water_quality, test: 8'

'water_quality, overall: 8'

'quality_group, NaNs: 0'

'quality_group, train: 6'

'quality_group, test: 6'

'quality_group, overall: 6'

'quantity, NaNs: 0'

'quantity, train: 5'

'quantity, test: 5'

'quantity, overall: 5'

'quantity_group, NaNs: 0'

'quantity_group, train: 5'

'quantity_group, test: 5'

'quantity_group, overall: 5'

'source, NaNs: 0'

'source, train: 10'

'source, test: 10'

'source, overall: 10'

'source_type, NaNs: 0'

'source_type, train: 7'

'source_type, test: 7'

'source_type, overall: 7'

'source_class, NaNs: 0'

'source_class, train: 3'

'source_class, test: 3'

'source_class, overall: 3'

'waterpoint_type, NaNs: 0'

'waterpoint_type, train: 7'

'waterpoint_type, test: 7'

'waterpoint_type, overall: 7'

'waterpoint_type_group, NaNs: 0'

'waterpoint_type_group, train: 6'

'waterpoint_type_group, test: 6'

'waterpoint_type_group, overall: 6'

#### Group feature categories other than top categories into 'Others'

In [None]:
# Reduce dimension of categorical variables with high dimensionality 

'''dim_red_features = 'funder installer scheme_name lga ward'.split()
for feature in dim_red_features:
    train = train.assign(count = train.groupby(feature)[feature].transform('count')).sort_values(by = ['count',feature], ascending = [False,True])
    top_values = train.drop_duplicates('count')
    top_values = list(top_values.nlargest(10, 'count')[feature])
    data[feature] = data[feature].apply(lambda x: x if (x in top_values) | (str(x) == 'nan') else 'Others')'''

In [None]:
'''for cat in cat_features:
    display('{}, {}: {}'.format(cat, 'NaNs', data[cat].isnull().sum()))
    display('{}, {}: {}'.format(cat, 'train', train[cat].nunique()))
    display('{}, {}: {}'.format(cat, 'test', test[cat].nunique()))
    display('{}, {}: {}'.format(cat, 'overall', data[cat].nunique()))'''

#### Replace missing values by 'Missing'

In [51]:
# Fill missing values in categorical features by 'Missing'

for feature in selected_cat_features:
    data[feature].replace(np.nan, 'Missing', inplace=True)

In [None]:
'''data['funder'].value_counts()
data['installer'].value_counts()
data['scheme_name'].value_counts()'''

In [52]:
data.head()

Unnamed: 0,id,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,basin,subvillage,region,region_code,district_code,lga,ward,population,public_meeting,scheme_management,scheme_name,permit,construction_year,extraction_type,extraction_type_group,extraction_type_class,management,management_group,payment,payment_type,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group,train,longitude_imp_normal,longitude_imp_random_choice,latitude_imp_normal,latitude_imp_random_choice,gps_height_imp_normal,gps_height_imp_random_choice,population_imp_normal,population_imp_random_choice,construction_year_imp_normal,construction_year_imp_random_choice,amount_tsh_imp_normal,amount_tsh_imp_random_choice,operation_years,operation_years_imp_normal,operation_years_imp_random_choice
0,69572,8.699681,2011-03-14,Roman,1390.0,Roman,34.938093,-9.856322,none,Lake Nyasa,Mnyusi B,Iringa,11,5,Ludewa,Mundindi,4.70048,True,VWC,Roman,False,1999.0,gravity,gravity,gravity,vwc,user-group,pay annually,annually,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe,1,34.938093,34.938093,-9.856322,-9.856322,1390.0,1390.0,4.70048,4.70048,1999.0,1999.0,8.699681,8.699681,12.0,12,12
1,8776,5.82689,2013-03-06,Grumeti,1399.0,GRUMETI,34.698766,-2.147466,Zahanati,Lake Victoria,Nyamara,Mara,20,2,Serengeti,Natta,5.638355,Missing,Other,Missing,True,2010.0,gravity,gravity,gravity,wug,user-group,never pay,never pay,soft,good,insufficient,insufficient,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe,1,34.698766,34.698766,-2.147466,-2.147466,1399.0,1399.0,5.638355,5.638355,2010.0,2010.0,6.52078,6.908755,3.0,3,3
2,34310,3.258097,2013-02-25,Lottery Club,686.0,World vision,37.460664,-3.821329,Kwa Mahundi,Pangani,Majengo,Manyara,21,4,Simanjiro,Others,5.525453,True,VWC,Nyumba ya mungu pipe scheme,True,2009.0,gravity,gravity,gravity,vwc,user-group,pay per bucket,per bucket,soft,good,enough,enough,dam,dam,surface,communal standpipe multiple,communal standpipe,1,37.460664,37.460664,-3.821329,-3.821329,686.0,686.0,5.525453,5.525453,2009.0,2009.0,3.258097,3.258097,4.0,4,4
3,67743,5.42029,2013-01-28,Unicef,263.0,UNICEF,38.486161,-11.155298,Others,Ruvuma / Southern Coast,Mahakamani,Mtwara,90,63,Nanyumbu,Nanyumbu,4.077537,True,VWC,Missing,True,1986.0,submersible,submersible,submersible,vwc,user-group,never pay,never pay,soft,good,dry,dry,machine dbh,borehole,groundwater,communal standpipe multiple,communal standpipe,1,38.486161,38.486161,-11.155298,-11.155298,263.0,263.0,4.077537,4.077537,1986.0,1986.0,6.906485,3.931826,27.0,27,27
4,19728,5.30167,2011-07-13,Others,1167.0,Artisan,31.130847,-1.825359,Shuleni,Lake Victoria,Kyanyamisa,Kagera,18,1,Karagwe,Nyakasimbi,5.01728,True,Missing,Missing,True,2000.0,gravity,gravity,gravity,other,other,never pay,never pay,soft,good,seasonal,seasonal,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe,1,31.130847,31.130847,-1.825359,-1.825359,1240.292552,455.0,2.575311,0.693147,2016.001834,2006.0,2.826742,2.397895,11.0,-5,5


#### Convert categorical features into numerical features by adding a column with their probability for each target class

In [None]:
'''train_labels = pd.read_csv('./Data/training_set_labels.csv')
train = train.merge(train_labels, on="id")
train.head()'''

In [None]:
'''for feature in cat_features:
    train['count'] = train.groupby(feature)[feature].transform('count')
train.head()'''

In [None]:
'''def get_percentage(groups, row, status_group, feature):
    try:
        sg_count = groups['count'].loc[(groups[feature] == row[feature]) & (groups['status_group'] == status_group)].item()
        total_count = train['count'].loc[train['id'] == row['id']].item()
        return sg_count / total_count
    except ValueError:
        return 0'''

In [None]:
'''status_groups = 'functional,non functional,functional needs repair'.split(',')
for feature in cat_features:
    groups = pd.DataFrame({'count': train.groupby([feature, 'status_group']).size()}).reset_index()
    for status_group in status_groups:
        data['_'.join(['pct', feature, status_group])] = data.apply(lambda row: get_percentage(groups=groups, row=row, status_group=status_group, feature=feature), axis=1)
    display(feature + ' done')'''
        

In [None]:
#data.head()


#### Factorize categorical features

In [53]:
# Factorize features for evaluations

for feature in selected_cat_features:
    data[feature] = pd.factorize(data[feature])[0]
data.head()

Unnamed: 0,id,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,basin,subvillage,region,region_code,district_code,lga,ward,population,public_meeting,scheme_management,scheme_name,permit,construction_year,extraction_type,extraction_type_group,extraction_type_class,management,management_group,payment,payment_type,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group,train,longitude_imp_normal,longitude_imp_random_choice,latitude_imp_normal,latitude_imp_random_choice,gps_height_imp_normal,gps_height_imp_random_choice,population_imp_normal,population_imp_random_choice,construction_year_imp_normal,construction_year_imp_random_choice,amount_tsh_imp_normal,amount_tsh_imp_random_choice,operation_years,operation_years_imp_normal,operation_years_imp_random_choice
0,69572,8.699681,2011-03-14,0,1390.0,0,34.938093,-9.856322,0,0,0,0,11,5,0,0,4.70048,0,0,0,0,1999.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,34.938093,34.938093,-9.856322,-9.856322,1390.0,1390.0,4.70048,4.70048,1999.0,1999.0,8.699681,8.699681,12.0,12,12
1,8776,5.82689,2013-03-06,1,1399.0,1,34.698766,-2.147466,1,1,1,1,20,2,1,1,5.638355,1,1,1,1,2010.0,0,0,0,1,0,1,1,0,0,1,1,1,1,1,0,0,1,34.698766,34.698766,-2.147466,-2.147466,1399.0,1399.0,5.638355,5.638355,2010.0,2010.0,6.52078,6.908755,3.0,3,3
2,34310,3.258097,2013-02-25,2,686.0,2,37.460664,-3.821329,2,2,2,2,21,4,2,2,5.525453,0,0,2,1,2009.0,0,0,0,0,0,2,2,0,0,0,0,2,2,1,1,0,1,37.460664,37.460664,-3.821329,-3.821329,686.0,686.0,5.525453,5.525453,2009.0,2009.0,3.258097,3.258097,4.0,4,4
3,67743,5.42029,2013-01-28,3,263.0,3,38.486161,-11.155298,3,3,3,3,90,63,3,3,4.077537,0,0,1,1,1986.0,1,1,1,0,0,1,1,0,0,2,2,3,3,0,1,0,1,38.486161,38.486161,-11.155298,-11.155298,263.0,263.0,4.077537,4.077537,1986.0,1986.0,6.906485,3.931826,27.0,27,27
4,19728,5.30167,2011-07-13,4,1167.0,4,31.130847,-1.825359,4,1,4,4,18,1,4,4,5.01728,0,2,1,1,2000.0,0,0,0,2,1,1,1,0,0,3,3,1,1,1,0,0,1,31.130847,31.130847,-1.825359,-1.825359,1240.292552,455.0,2.575311,0.693147,2016.001834,2006.0,2.826742,2.397895,11.0,-5,5


#### OneHotEncoding

In [None]:
# Remove 'subvillage' from cat_features its number of unique values leads to too many dummies
#cat_features.remove('subvillage')
# Get dummies for categorical features and add them to dataframe
#data = pd.concat([data, pd.get_dummies(data[cat_features], dummy_na=True)], axis=1)

In [None]:
#data.shape

In [None]:
#data.head()

#### Binary Encoding

In [None]:
#data.head()

In [None]:
#data.shape

In [None]:
#ce_bin = ce.BinaryEncoder(cols=selected_cat_features)
#data = ce_bin.fit_transform(data)

In [None]:
#data.shape

In [None]:
#data.head()

#### Feature Hashing

In [None]:
#ce_hash = ce.HashingEncoder(cols=cat_features, n_components=len(cat_features * 20))
#data = ce_hash.fit_transform(data)

In [None]:
#data.head()

#### Convert date_recorded to year only

In [54]:
data['date_recorded'] = pd.to_datetime(data['date_recorded'])
data['date_recorded'] = data['date_recorded'].dt.year
data.head()

Unnamed: 0,id,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,basin,subvillage,region,region_code,district_code,lga,ward,population,public_meeting,scheme_management,scheme_name,permit,construction_year,extraction_type,extraction_type_group,extraction_type_class,management,management_group,payment,payment_type,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group,train,longitude_imp_normal,longitude_imp_random_choice,latitude_imp_normal,latitude_imp_random_choice,gps_height_imp_normal,gps_height_imp_random_choice,population_imp_normal,population_imp_random_choice,construction_year_imp_normal,construction_year_imp_random_choice,amount_tsh_imp_normal,amount_tsh_imp_random_choice,operation_years,operation_years_imp_normal,operation_years_imp_random_choice
0,69572,8.699681,2011,0,1390.0,0,34.938093,-9.856322,0,0,0,0,11,5,0,0,4.70048,0,0,0,0,1999.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,34.938093,34.938093,-9.856322,-9.856322,1390.0,1390.0,4.70048,4.70048,1999.0,1999.0,8.699681,8.699681,12.0,12,12
1,8776,5.82689,2013,1,1399.0,1,34.698766,-2.147466,1,1,1,1,20,2,1,1,5.638355,1,1,1,1,2010.0,0,0,0,1,0,1,1,0,0,1,1,1,1,1,0,0,1,34.698766,34.698766,-2.147466,-2.147466,1399.0,1399.0,5.638355,5.638355,2010.0,2010.0,6.52078,6.908755,3.0,3,3
2,34310,3.258097,2013,2,686.0,2,37.460664,-3.821329,2,2,2,2,21,4,2,2,5.525453,0,0,2,1,2009.0,0,0,0,0,0,2,2,0,0,0,0,2,2,1,1,0,1,37.460664,37.460664,-3.821329,-3.821329,686.0,686.0,5.525453,5.525453,2009.0,2009.0,3.258097,3.258097,4.0,4,4
3,67743,5.42029,2013,3,263.0,3,38.486161,-11.155298,3,3,3,3,90,63,3,3,4.077537,0,0,1,1,1986.0,1,1,1,0,0,1,1,0,0,2,2,3,3,0,1,0,1,38.486161,38.486161,-11.155298,-11.155298,263.0,263.0,4.077537,4.077537,1986.0,1986.0,6.906485,3.931826,27.0,27,27
4,19728,5.30167,2011,4,1167.0,4,31.130847,-1.825359,4,1,4,4,18,1,4,4,5.01728,0,2,1,1,2000.0,0,0,0,2,1,1,1,0,0,3,3,1,1,1,0,0,1,31.130847,31.130847,-1.825359,-1.825359,1240.292552,455.0,2.575311,0.693147,2016.001834,2006.0,2.826742,2.397895,11.0,-5,5


### Split data into train and test data set

In [55]:
train_df = data[data["train"] == 1]
test_df = data[data["train"] == 0]

train_df.drop(["train"], axis=1, inplace=True)
test_df.drop(["train"], axis=1, inplace=True)

### Save data to csv files

In [59]:
pd.DataFrame(train_df).to_csv("./Data/train_cleaned_v" + data_version + ".csv", index=False)
pd.DataFrame(test_df).to_csv("./Data/test_cleaned_v" + data_version + ".csv", index=False)