# Baseline model - logistic regression

In [32]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import datetime

In [2]:
# Load the training features, the competition test features and the training labels.
train_data, test_data, train_label = map(
    pd.read_csv,
    ("data/train.csv", "data/test.csv", "data/train_label.csv"),
)

In [3]:
# List the raw columns available in the training features.
train_data.columns

Index(['id', 'amount_tsh', 'date_recorded', 'funder', 'gps_height',
       'installer', 'longitude', 'latitude', 'wpt_name', 'num_private',
       'basin', 'subvillage', 'region', 'region_code', 'district_code', 'lga',
       'ward', 'population', 'public_meeting', 'recorded_by',
       'scheme_management', 'scheme_name', 'permit', 'construction_year',
       'extraction_type', 'extraction_type_group', 'extraction_type_class',
       'management', 'management_group', 'payment', 'payment_type',
       'water_quality', 'quality_group', 'quantity', 'quantity_group',
       'source', 'source_type', 'source_class', 'waterpoint_type',
       'waterpoint_type_group'],
      dtype='object')

In [4]:
# List the columns of the label table.
train_label.columns

Index(['id', 'status_group'], dtype='object')

In [5]:
# Preview the first label rows (still string-valued at this point).
train_label.head()

Unnamed: 0,id,status_group
0,69572,functional
1,8776,functional
2,34310,functional
3,67743,non functional
4,19728,functional


In [6]:
# Replace the status_group strings with their integer category codes.
# NOTE(review): this overwrites the label text in place, so the code -> label
# mapping is no longer available from train_label after this cell.
train_label['status_group'] = train_label['status_group'].astype('category').cat.codes

In [7]:
# Move `id` from the columns into the row index of both feature tables.
train_data = train_data.set_index(keys=['id'])
test_data = test_data.set_index(keys=['id'])

In [8]:
# Columns removed from both feature tables before modelling.
columns_to_drop = [
    'date_recorded', 'funder', 'installer',
    'longitude', 'latitude', 'wpt_name',
    'subvillage', 'region_code', 'lga', 'ward',
    'public_meeting', 'recorded_by',
    'scheme_management', 'scheme_name', 'permit', 'construction_year',
    'extraction_type', 'extraction_type_class',
    'management', 'management_group', 'payment_type',
    'water_quality', 'quality_group',
    'quantity', 'quantity_group',
    'source_type', 'source_class',
    'waterpoint_type_group',
]

In [9]:
# Remove the same set of columns from the training and the test features.
train_data = train_data.drop(columns=columns_to_drop)
test_data = test_data.drop(columns=columns_to_drop)

In [10]:
# Rows and remaining feature columns after the drop.
train_data.shape

(59400, 11)

In [11]:
# The test features should keep the same number of columns as the training features.
test_data.shape

(14850, 11)

In [None]:
# TO DO
def get_dummies(df, dummy_columns):
    """Append one-hot indicator columns for each column named in ``dummy_columns``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns to encode; it is not modified.
    dummy_columns : list of str
        Names of the columns of ``df`` to expand into indicators.

    Returns
    -------
    pandas.DataFrame
        A new frame containing every original column (the encoded columns are
        kept) followed by indicator columns named ``<column>_<value>``.

    Raises
    ------
    KeyError
        If a name in ``dummy_columns`` is not a column of ``df``.
    """
    # Prefix each indicator with its source column: a value that occurs in more
    # than one encoded column would otherwise produce duplicate column labels.
    indicator_frames = [
        pd.get_dummies(df[column], prefix=column) for column in dummy_columns
    ]
    # Concatenate once instead of regrowing the frame on every iteration.
    return pd.concat([df, *indicator_frames], axis=1)
    

In [12]:
# Columns that are category-coded and then expanded into indicator columns.
dummy_columns = ['basin', 'region', 'district_code', 'extraction_type_group', 'payment', 'source', 'waterpoint_type']

# NOTE(review): test_data is encoded separately further down from its own
# category values, so a given code (and its indicator column) only means the
# same thing in both tables if both contain identical category sets - confirm.
# The indicator columns are labelled by the bare integer code, so labels repeat
# across features, and the integer-coded column itself stays in the frame.
for column in dummy_columns:
    # Overwrite the column with its integer category codes.
    train_data[column] = pd.Categorical(train_data[column]).codes
    # Append one indicator column per code.
    indicators = pd.get_dummies(train_data[column])
    train_data = pd.concat([train_data, indicators], axis=1)

In [13]:
# Column count after appending the indicator columns.
train_data.shape

(59400, 98)

In [14]:
# Labels are matched to features purely by row position (train_data is indexed
# by id, train_label keeps its default index), so check that both tables list
# the same ids in the same order before splitting.
assert np.array_equal(train_data.index, train_label['id']), \
    "train_data and train_label rows are not aligned by id"

# Hold out 30% of the labelled rows for validation. A fixed random_state keeps
# the split - and everything computed from it - reproducible across restarts.
X_train, X_test, y_train, y_test = train_test_split(
    train_data, train_label['status_group'], test_size=0.3, random_state=0
)

In [15]:
# Size of the training portion of the split.
X_train.shape

(41580, 98)

In [16]:
# Size of the held-out portion of the split (rows from train_data, not from test_data).
X_test.shape

(17820, 98)

In [17]:
# Number of training labels; should equal the number of rows in X_train.
y_train.shape[0]

41580

In [18]:
# Create a multinomial logistic regression classifier
clf = LogisticRegression(
    solver='newton-cg',
    multi_class='multinomial',
    random_state=0,
)

In [19]:
# Fit the classifier on the training portion of the split
model = clf.fit(X=X_train, y=y_train)



In [20]:
# Predicted class codes for the held-out rows
model.predict(X_test)

array([0, 2, 0, ..., 0, 0, 0], dtype=int8)

In [21]:
# View the predicted probabilities for the held-out rows (one column per class code)
model.predict_proba(X_test)

array([[0.53947369, 0.01915532, 0.441371  ],
       [0.09884129, 0.01714486, 0.88401385],
       [0.65958424, 0.0818675 , 0.25854826],
       ...,
       [0.68465234, 0.009334  , 0.30601365],
       [0.68113667, 0.02684391, 0.29201942],
       [0.84669949, 0.03178011, 0.1215204 ]])

## Predict on test data

In [22]:
# Apply the same category-code + indicator expansion to the test features.
# NOTE(review): the codes are derived from test_data's own category values, not
# from the categories seen during training, so code N here only matches code N
# in train_data if both tables contain identical category sets - confirm.
for column in dummy_columns:
    # Overwrite the column with its integer category codes.
    test_data[column] = pd.Categorical(test_data[column]).codes
    # Append one indicator column per code.
    indicators = pd.get_dummies(test_data[column])
    test_data = pd.concat([test_data, indicators], axis=1)

In [23]:
# Column count must match the encoded training features for the model to accept it.
test_data.shape

(14850, 98)

In [24]:
# Predict a class code for every row of the test features.
# NOTE(review): assumes test_data's columns line up one-to-one with the columns the model was fitted on - confirm.
predictions = model.predict(test_data)

In [25]:
# Number of predictions produced.
predictions.shape[0]

14850

In [26]:
# Number of test rows; should equal the number of predictions.
len(test_data)

14850

In [27]:
# Attach the predictions to the test rows.
# NOTE(review): these are the integer category codes the model was trained on,
# not the original status_group strings - confirm which form the consumer of
# this table expects.
test_data['status_group'] = predictions

In [29]:
# Turn the `id` index back into a regular column.
test_data = test_data.reset_index(drop=False)

In [31]:
# Keep only the identifier and the predicted class.
final_df = test_data.loc[:, ['id', 'status_group']]

In [33]:
# Timestamp (day, month, year, underscore, hour, minute) used to name the output file.
currentDT = datetime.datetime.now().strftime('%d%m%Y_%H%M')

In [None]:
# Write the predictions under a timestamped name so earlier files are not overwritten.
filename = "data/submissions_" + currentDT + ".csv"
# Pass the path explicitly: to_csv() without one only returns the CSV text and
# writes no file. `id` is already a regular column of final_df, so the
# positional row index is left out of the file.
final_df.to_csv(filename, index=False)