In [1]:
import os
import pandas as pd
import numpy as np
import csv
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

In [2]:
# Locations of the train and test CSV files, resolved against the current
# working directory. os.path.join inserts the platform's separator and avoids
# a doubled slash when the working directory already ends with one.
train_data_path = os.path.join(os.getcwd(), 'train.csv')
test_data_path = os.path.join(os.getcwd(), 'test.csv')

# Load both CSV files into DataFrames
train_data = pd.read_csv(train_data_path)
test_data = pd.read_csv(test_data_path)

In [3]:
# Take an independent (deep) copy of the test frame so that the edits made to
# the copy in later cells leave test_data itself unchanged.
test_data_copy = test_data.copy()

In [4]:
# Categorizing interviewees according to age
def age_category(age):
    """Return the age-group label for a respondent's age.

    Bands (upper bound inclusive):
        up to 17 -> 'Teen', up to 40 -> 'Youth', up to 60 -> 'Adult',
        up to 80 -> 'Old', above 80 -> 'Very Old'.

    Returns None when none of the comparisons holds (for example NaN).
    """
    bands = ((17, 'Teen'), (40, 'Youth'), (60, 'Adult'), (80, 'Old'))
    for upper_bound, label in bands:
        if age <= upper_bound:
            return label
    # Explicit comparison (rather than a bare fallback) so that values which
    # compare False to everything still fall through to None.
    if age > 80:
        return 'Very Old'
    return None

# Categorizing interviewees according to the number of the household
def household_category(size):
    """Return the size-group label for a household's member count.

    Bands (upper bound inclusive):
        up to 5 -> 'Xsmall', up to 9 -> 'Small', up to 15 -> 'Medium',
        above 15 -> 'Large'.

    Returns None when none of the comparisons holds (for example NaN).
    """
    bands = ((5, 'Xsmall'), (9, 'Small'), (15, 'Medium'))
    for upper_bound, label in bands:
        if size <= upper_bound:
            return label
    # Explicit comparison (rather than a bare fallback) so that values which
    # compare False to everything still fall through to None.
    if size > 15:
        return 'Large'
    return None
    
# Replace the raw numeric values with their category labels, in place, on
# both the training frame and the test copy.
bucketers = {
    'household_size': household_category,
    'age_of_respondent': age_category,
}
for frame in (train_data, test_data_copy):
    for column, bucketer in bucketers.items():
        frame[column] = frame[column].apply(bucketer)

In [5]:
# Lookup tables that turn each categorical label into an integer code.
# Where the labels are listed as a sequence, the code is the label's
# 0-based position in that sequence.
country = {label: code for code, label in enumerate(('Kenya', 'Rwanda', 'Tanzania', 'Uganda'))}
year = {label: code for code, label in enumerate((2016, 2017, 2018))}
# The two Yes/No tables use opposite polarity: bank_account codes 'Yes' as 1,
# while cellphone_access codes 'Yes' as 0.
bank_account = dict(Yes=1, No=0)
location_type = dict(Rural=0, Urban=1)
cellphone_access = dict(Yes=0, No=1)
household_size = {label: code for code, label in enumerate(('Xsmall', 'Small', 'Medium', 'Large'))}
age_of_respondent = {
    label: code
    for code, label in enumerate(('Teen', 'Youth', 'Adult', 'Old', 'Very Old'))
}
gender_of_respondent = dict(Male=0, Female=1)
relationship_with_head = {
    label: code
    for code, label in enumerate((
        'Spouse',
        'Head of Household',
        'Other relative',
        'Child',
        'Parent',
        'Other non-relatives',
    ))
}
marital_status = {
    label: code
    for code, label in enumerate((
        'Married/Living together',
        'Widowed',
        'Single/Never Married',
        'Divorced/Seperated',
        'Dont know',
    ))
}
education_level = {
    label: code
    for code, label in enumerate((
        'Secondary education',
        'No formal education',
        'Vocational/Specialised training',
        'Primary education',
        'Tertiary education',
        'Other/Dont know/RTA',
    ))
}
job_type = {
    label: code
    for code, label in enumerate((
        'Self employed',
        'Government Dependent',
        'Formally employed Private',
        'Informally employed',
        'Formally employed Government',
        'Farming and Fishing',
        'Remittance Dependent',
        'Other Income',
        'Dont Know/Refuse to answer',
        'No Income',
    ))
}

In [6]:
# Data encoding: swap every categorical label for its integer code.
# The columns below are encoded with the same table on both frames.
shared_code_tables = {
    'country': country,
    'year': year,
    'location_type': location_type,
    'cellphone_access': cellphone_access,
    'household_size': household_size,
    'age_of_respondent': age_of_respondent,
    'gender_of_respondent': gender_of_respondent,
    'relationship_with_head': relationship_with_head,
    'marital_status': marital_status,
    'education_level': education_level,
    'job_type': job_type,
}
for column_name, code_table in shared_code_tables.items():
    train_data[column_name] = train_data[column_name].replace(code_table)
    test_data_copy[column_name] = test_data_copy[column_name].replace(code_table)

# bank_account is encoded on the training frame only.
train_data['bank_account'] = train_data['bank_account'].replace(bank_account)

In [7]:
# Remove the 'uniqueid' column from the training frame and from the test copy
# (test_data itself is not touched here).
del train_data['uniqueid'], test_data_copy['uniqueid']

In [12]:
# Show the last five rows of the encoded test copy as a quick sanity check.
test_data_copy.tail(5)

Unnamed: 0,country,year,location_type,cellphone_access,household_size,age_of_respondent,gender_of_respondent,relationship_with_head,marital_status,education_level,job_type
10081,3,2,0,1,0,3,1,0,0,3,0
10082,3,2,1,0,1,2,0,1,0,3,0
10083,3,2,1,0,0,1,0,1,2,0,7
10084,3,2,0,0,1,1,1,0,0,3,0
10085,3,2,1,0,1,0,0,2,2,0,7


In [9]:
# # Convert dataframe to numpy array
# train_data = np.array(train_data)
# test_data_copy = np.array(test_data_copy)

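# # Split the training data into features (X) and labels (Y)
# # NOTE(review): column 1 is dropped from the features but the labels are read
# # from column 2, so the label column appears to remain in x_features — confirm the index.
# x_features = np.delete(train_data, 1, axis=1)
# y_features = train_data[:, 2]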

# # Split the data to train and test sets
# x_train, x_test, y_train, y_test = train_test_split(x_features, y_features, train_size=0.7)

# # Model
# clf = DecisionTreeClassifier()
# clf.fit(x_train, y_train)

# train_score = clf.score(x_train, y_train)
# test_score = clf.score(x_test, y_test)
# print(train_score, test_score)

# # Get the predictions
# y_pred = clf.predict(x_test)

# # Calculate the accuracy score of the model
# score = accuracy_score(y_test, y_pred)
# score

In [10]:
# test_data_pred = clf.predict(test_data_copy)

# with open('submission_file.csv', mode='w') as submission_file:
#     individual_writer = csv.writer(submission_file, delimiter=',')
    
#     individual_writer.writerow(['uniqueid', 'bank_account'])
    
#     for i in range(len(test_data)):
#         unique_id = test_data.loc[i]['uniqueid'] + ' x ' + test_data.loc[i]['country']
#         individual_writer.writerow([unique_id, test_data_pred[i]])