# Baseline model - logistic regression

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import datetime
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

In [2]:
train_data = pd.read_csv("data/train.csv")
test_data = pd.read_csv("data/test.csv")
train_label = pd.read_csv("data/train_label.csv")

In [3]:
train_data.columns

Index(['id', 'amount_tsh', 'date_recorded', 'funder', 'gps_height',
       'installer', 'longitude', 'latitude', 'wpt_name', 'num_private',
       'basin', 'subvillage', 'region', 'region_code', 'district_code', 'lga',
       'ward', 'population', 'public_meeting', 'recorded_by',
       'scheme_management', 'scheme_name', 'permit', 'construction_year',
       'extraction_type', 'extraction_type_group', 'extraction_type_class',
       'management', 'management_group', 'payment', 'payment_type',
       'water_quality', 'quality_group', 'quantity', 'quantity_group',
       'source', 'source_type', 'source_class', 'waterpoint_type',
       'waterpoint_type_group'],
      dtype='object')

In [4]:
train_label.columns

Index(['id', 'status_group'], dtype='object')

In [5]:
train_label.head()

Unnamed: 0,id,status_group
0,69572,functional
1,8776,functional
2,34310,functional
3,67743,non functional
4,19728,functional


In [6]:
train_label.status_group.value_counts()

functional                 32259
non functional             22824
functional needs repair     4317
Name: status_group, dtype: int64

In [7]:
train_label['status_group_cat'] = pd.Categorical(train_label['status_group'])
train_label['status_group_cat'] = train_label['status_group_cat'].cat.codes

In [8]:
train_label.head()

Unnamed: 0,id,status_group,status_group_cat
0,69572,functional,0
1,8776,functional,0
2,34310,functional,0
3,67743,non functional,2
4,19728,functional,0


In [9]:
train_data.set_index(['id'], inplace = True)
test_data.set_index(['id'], inplace = True)

In [10]:
columns_to_drop = ['date_recorded', 'funder', 'installer', 'longitude', 'latitude', 'wpt_name', 
                   'subvillage', 'region_code', 'lga', 'ward', 'public_meeting', 'recorded_by',
                   'scheme_management', 'scheme_name', 'permit', 'construction_year',
                   'extraction_type', 'extraction_type_class', 'management', 'management_group', 'payment_type',
                   'water_quality', 'quality_group', 'quantity', 'quantity_group', 'source_type', 'source_class',
                   'waterpoint_type_group']

In [11]:
train_data = train_data.drop(columns_to_drop, axis = 1)
test_data = test_data.drop(columns_to_drop, axis = 1)

In [12]:
train_data.shape

(59400, 11)

In [13]:
test_data.shape

(14850, 11)

In [14]:
# TO DO

'''def get_dummies(df, dummy_columns):
    
    for i in range(0, len(dummy_columns)):
        colname = "df_" + str(i)
        colname = pd.get_dummies(df[dummy_columns[i]])
        df = pd.concat([df, colname], axis = 1)
    #df = df.drop(dummy_columns, axis = 1)
    return(df)
    '''

'def get_dummies(df, dummy_columns):\n    \n    for i in range(0, len(dummy_columns)):\n        colname = "df_" + str(i)\n        colname = pd.get_dummies(df[dummy_columns[i]])\n        df = pd.concat([df, colname], axis = 1)\n    #df = df.drop(dummy_columns, axis = 1)\n    return(df)\n    '

In [15]:
# basin, region, district_code, extraction_type_group, payment, source, waterpoint_type

dummy_columns = ['basin', 'region', 'district_code', 'extraction_type_group', 'payment', 'source', 'waterpoint_type']

for i in range(0, len(dummy_columns)):
    colname = "df_" + str(i)
    train_data[dummy_columns[i]] = pd.Categorical(train_data[dummy_columns[i]])
    train_data[dummy_columns[i]] = train_data[dummy_columns[i]].cat.codes
    colname = pd.get_dummies(train_data[dummy_columns[i]])
    train_data = pd.concat([train_data, colname], axis = 1)

In [16]:
train_data.shape

(59400, 98)

In [17]:
X_train, X_test, y_train, y_test = train_test_split(train_data, train_label['status_group_cat'] ,test_size=0.3)

In [18]:
X_train.shape

(41580, 98)

In [19]:
X_test.shape

(17820, 98)

In [20]:
len(y_train)

41580

In [21]:
# Create one-vs-rest logistic regression object
clf = LogisticRegression(random_state=0, multi_class='multinomial', solver='newton-cg')

In [22]:
# Train model
model = clf.fit(X_train, y_train)



In [23]:
validation_preds = model.predict(X_test)

In [24]:
# View predicted probabilities
validation_pred_prob = model.predict_proba(X_test)

In [25]:
confusion_matrix(y_test, validation_preds)

array([[8193,   25, 1369],
       [ 973,   72,  294],
       [3142,   18, 3734]])

In [26]:
accuracy_score(y_test, validation_preds)

0.6733445566778901

## Predict on test data

In [27]:
for i in range(0, len(dummy_columns)):
    colname = "df_" + str(i)
    test_data[dummy_columns[i]] = pd.Categorical(test_data[dummy_columns[i]])
    test_data[dummy_columns[i]] = test_data[dummy_columns[i]].cat.codes
    colname = pd.get_dummies(test_data[dummy_columns[i]])
    test_data = pd.concat([test_data, colname], axis = 1)

In [28]:
test_data.shape

(14850, 98)

In [29]:
predictions = model.predict(test_data)

In [30]:
len(predictions)

14850

In [31]:
len(test_data.iloc[:, 0])

14850

In [32]:
test_data['status_group_num'] = predictions

In [33]:
test_data = test_data.reset_index()

In [34]:
test_data.loc[test_data.status_group_num == 0, 'status_group'] = 'functional' 
test_data.loc[test_data.status_group_num == 1, 'status_group'] = 'functional needs repair' 
test_data.loc[test_data.status_group_num == 2, 'status_group'] = 'non functional' 

In [35]:
final_df = test_data[['id', 'status_group']]

In [36]:
final_df.head()

Unnamed: 0,id,status_group
0,50785,non functional
1,51630,functional
2,17168,non functional
3,45559,non functional
4,49871,functional


In [37]:
# Record current date-time.
currentDT = datetime.datetime.now()
currentDT = currentDT.strftime('%d%m%Y_%H%M')

In [39]:
filename = "data/submissions_" + currentDT + ".csv"
final_df.to_csv(filename, index = False, encoding = 'utf-8')