In [74]:
import pandas as pd
import numpy as np
import re

from sklearn.svm import SVC  
from sklearn.preprocessing import MinMaxScaler

# Show every column when a DataFrame is displayed (no truncation of wide frames).
pd.options.display.max_columns = None

In [75]:
# Load the training labels and the pre-cleaned feature tables; the labels are
# attached to the training rows through the shared "id" column.
labels = pd.read_csv('./Data/training_set_labels.csv')
test = pd.read_csv('./Data/test_cleaned_v0.5.csv')
train = pd.read_csv('./Data/train_cleaned_v0.5.csv').merge(labels, on="id")

# Separate the label column from the training features.
target = train.pop("status_group")

# Flag the origin of every row so the combined frame can be split again later.
train['train'], test['train'] = 1, 0

# Stack both sets so the preprocessing below is applied to train and test alike.
# NOTE: the original row labels are kept (no ignore_index), so index values repeat.
data = pd.concat([train, test])

# Reduce the recording date to its year.
data['date_recorded'] = pd.to_datetime(data['date_recorded']).dt.year

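# Execute the following cell to impute missing values with the median (run exactly one of the three imputation cells below; each one drops columns the others need)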

In [73]:
# Numerical inputs for this variant: the recording year plus the un-suffixed columns.
num_features = ['date_recorded', 'latitude', 'longitude', 'amount_tsh', 'gps_height',
                'population', 'operation_years', 'construction_year']

# Discard both alternative imputation variants of every numerical column.
# 'date_recorded' has no variants, hence the [1:] slice.
data = data.drop(
    [column + suffix
     for column in num_features[1:]
     for suffix in ('_imp_normal', '_imp_random_choice')],
    axis=1,
)

# Execute the following cell to impute missing values with a random sample from a normal distribution

In [54]:
# Numerical inputs for this variant: the recording year plus the '*_imp_normal' columns.
num_features = ['date_recorded'] + [
    column + '_imp_normal'
    for column in ('latitude', 'longitude', 'amount_tsh', 'gps_height',
                   'population', 'operation_years', 'construction_year')
]

# Discard the un-suffixed column and the '*_imp_random_choice' variant of every
# numerical feature. 'date_recorded' has no variants, hence the [1:] slice.
data = data.drop(
    [kept.replace('_imp_normal', '') + suffix
     for kept in num_features[1:]
     for suffix in ('', '_imp_random_choice')],
    axis=1,
)

# Execute the following cell to impute missing values with a random choice from the empirical distribution

In [76]:
# Numerical inputs for this variant: the recording year plus the
# '*_imp_random_choice' columns.
num_features = ['date_recorded'] + [
    column + '_imp_random_choice'
    for column in ('latitude', 'longitude', 'amount_tsh', 'gps_height',
                   'population', 'operation_years', 'construction_year')
]

# Discard the un-suffixed column and the '*_imp_normal' variant of every
# numerical feature. 'date_recorded' has no variants, hence the [1:] slice.
data = data.drop(
    [kept.replace('_imp_random_choice', '') + suffix
     for kept in num_features[1:]
     for suffix in ('', '_imp_normal')],
    axis=1,
)

In [78]:
# Scale numerical features
# The scaler is fitted on the training rows only (train == 1), so the test set
# does not influence the learned per-column min/max (avoids train/test leakage).
# The same transformation is then applied to all rows; test values that lie
# outside the training range can therefore fall outside [0, 1].
scaler = MinMaxScaler()
scaler.fit(data.loc[data['train'] == 1, num_features])
data[num_features] = scaler.transform(data[num_features])

In [79]:
# Get categorical features for OneHotEncoding:
# every column that is not listed as numerical is treated as categorical.
cat_features = [column for column in data.columns if column not in num_features]

In [80]:
# Remove train and id column for OneHotEncoding
# 'train' is the train/test flag and 'id' the merge key; neither is encoded.
# Filtering (instead of list.remove) keeps this cell safe to re-run: a second
# execution no longer raises ValueError because the names are already gone.
cat_features = [column for column in cat_features if column not in ('train', 'id')]

In [82]:
# Convert factorized categorical features to strings
# pd.get_dummies only encodes object/string/category columns by default, so the
# values are turned into strings element by element (missing values become 'nan').
# Series.map(str) is applied per column because DataFrame.applymap is deprecated
# in recent pandas releases; the resulting values are the same.
data[cat_features] = data[cat_features].apply(lambda column: column.map(str))

In [83]:
# OneHotEncoding of categorical features
# get_dummies(columns=...) replaces the listed columns with their indicator
# columns and keeps every other column unchanged, so no concat/drop is needed.
# dummy_na is left at its default (False): the categorical columns are converted
# to strings before this step, so no real NaN remains. dummy_na=True therefore
# only appended an all-zero "<feature>_nan" column per feature (and duplicated
# the column name whenever the string category 'nan' was present).
data = pd.get_dummies(data, columns=cat_features)

In [84]:
# extract training/test sets
# Each frame is built in one expression (filter, then drop) instead of calling
# drop(..., inplace=True) on a filtered slice, which triggers pandas'
# SettingWithCopyWarning. The helper columns 'train' and 'id' are not model inputs.
train_df = data[data["train"] == 1].drop(["train", "id"], axis=1)

# Keep the ids of the test rows; they are needed to label the predictions.
# The original row labels are preserved on purpose (no index reset).
id_test = data.loc[data["train"] == 0, 'id']
test_df = data[data["train"] == 0].drop(["train", "id"], axis=1)

# Train model and predict values

In [85]:
# Support vector classifier with scikit-learn's default hyper-parameters.
svm = SVC()
# Fit on the encoded training features; `target` holds the label of each row.
svm.fit(X=train_df, y=target)

In [86]:
# Predict the class of every test row (one label per row of test_df, in order).
predictions = svm.predict(test_df)

# Pair each prediction with its id BY POSITION. Assigning the id Series to a
# freshly built frame would align on index labels instead, which only gives the
# right result while the test rows happen to be labelled 0..n-1; using the raw
# values removes that hidden dependency.
predictions = pd.DataFrame(
    {'id': id_test.values, 'status_group': predictions},
    columns=['id', 'status_group'],
)
# convert into submission format

# Join the predictions onto the submission template by id. The template's own
# 'status_group' column collides with the predicted one and is suffixed '_x' by
# the merge; that placeholder column is discarded right away.
formatsub = pd.read_csv('./Data/submission_format.csv')
submission_format = (
    pd.merge(formatsub, predictions, on=['id'], how='inner')
    .drop(['status_group_x'], axis=1)
)
# Restore the expected header names (the predicted column carries the '_y' suffix).
submission_format.columns = ['id', 'status_group']

# NOTE: the file name is hard-coded to the "empirical" imputation variant; adjust it
# when a different imputation cell was executed, otherwise results are mislabelled.
submission_format.to_csv('./Results/submission_format_SVM_empirical.csv', index=False)
submission_format.head()