### Comparing various logistic regression models on pumps dataset

In [2]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np 
import pandas as pd
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 500)

import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

from sklearn import linear_model 
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC

from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score
from sklearn.preprocessing import MinMaxScaler

from sklearn.model_selection import train_test_split

In [3]:
# training data

train_features = pd.read_csv('X_train_pumps.csv', index_col=0)
train_labels = pd.read_csv('y_train_pumps.csv', index_col=0)

train_features.shape, train_labels.shape

((59400, 39), (59400, 1))

In [4]:
# test data

test_features = pd.read_csv('X_test_pumps.csv', index_col=0)

test_features.shape

(14850, 39)

### data wrangling/feature engineering

In [6]:
def wrangle_data(df):
    # drop columns that do not appear to provide no obvious information or are repetitive 
    df = df.drop(['funder', 'installer','date_recorded', 'wpt_name', 'num_private', 'subvillage','region_code',
                  'lga', 'ward', 'public_meeting', 'recorded_by','scheme_management', 'scheme_name', 
                  'permit','extraction_type', 'extraction_type_group', 'management', 'management_group',
                  'payment', 'water_quality', 'quantity', 'source', 'source_class', 
                  'waterpoint_type_group','construction_year'], axis=1)
    
    # dummies for categorical columns
    dummy_list = ['basin', 'region','district_code','extraction_type_class','payment_type','quality_group',
                  'quantity_group','source_type','waterpoint_type']
    for each in dummy_list:
        try:
            df = pd.get_dummies(data=df, prefix=dummy_list, columns=dummy_list, drop_first=True)
        except:
            Exception
            continue
            
    # longtitide: replacing 0 with mean 
    df['longitude'].replace(0.000000, 34.213823, inplace=True)
    
    # MinMax Scaling of numerical columns
    scale = MinMaxScaler()
    
    amount_tsh_scaled = scale.fit_transform(df[['amount_tsh']])
    df['amount_tsh_scaled'] = amount_tsh_scaled
    df = df.drop(['amount_tsh'], axis=1)
    
    gps_height_scaled = scale.fit_transform(df[['gps_height']])
    df['gps_height_scaled'] = gps_height_scaled
    df = df.drop(['gps_height'], axis=1)
    
    longitude_scaled = scale.fit_transform(df[['longitude']])
    df['longitude_scaled'] = longitude_scaled
    df = df.drop(['longitude'], axis=1)
    
    latitude_scaled = scale.fit_transform(df[['latitude']])
    df['latitude_scaled'] = latitude_scaled
    df = df.drop(['latitude'], axis=1)
    
    population_scaled = scale.fit_transform(df[['population']])
    df['population_scaled'] = population_scaled
    df = df.drop(['population'], axis=1)
    
    return df     

In [7]:
train_features = wrangle_data(train_features)

In [8]:
test_features = wrangle_data(test_features)

In [9]:
# both data sets have same number of features 
train_features.shape, test_features.shape

((59400, 85), (14850, 85))

In [10]:
X = train_features
y = train_labels
X.shape, y.shape

((59400, 85), (59400, 1))

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((47520, 85), (11880, 85), (47520, 1), (11880, 1))

#### Logistic Regression 

In [13]:
# Logistic Regression: test accuracy - 0.73

logreg = LogisticRegression()
logreg.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [14]:
y_pred_logreg = logreg.predict(X_test)

In [31]:
logreg_train_score = logreg.score(X_train, y_train)
logreg_test_acc = accuracy_score(y_test, y_pred_logreg)
logreg_test_precision = precision_score(y_test, y_pred_logreg, average='weighted')
logreg_test_recall = recall_score(y_test, y_pred_logreg, average='weighted')

In [15]:
print(classification_report(y_test, y_pred_logreg))

                         precision    recall  f1-score   support

             functional       0.71      0.91      0.79      6457
functional needs repair       0.55      0.07      0.12       851
         non functional       0.79      0.60      0.68      4572

               accuracy                           0.73     11880
              macro avg       0.68      0.53      0.53     11880
           weighted avg       0.73      0.73      0.70     11880



#### Random Forest Classifier

In [16]:
# Random Forest Classifier: test accuracy - 0.79

rf = RandomForestClassifier(n_estimators=100)
rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [17]:
y_pred_rf = rf.predict(X_test)

In [29]:
rf_train_score = rf.score(X_train, y_train)
rf_test_acc = accuracy_score(y_test, y_pred_rf)
rf_test_precision = precision_score(y_test, y_pred_rf, average='weighted')
rf_test_recall = recall_score(y_test, y_pred_rf, average='weighted')

In [18]:
print(classification_report(y_test, y_pred_rf))

                         precision    recall  f1-score   support

             functional       0.80      0.86      0.83      6457
functional needs repair       0.49      0.34      0.40       851
         non functional       0.81      0.78      0.79      4572

               accuracy                           0.79     11880
              macro avg       0.70      0.66      0.68     11880
           weighted avg       0.78      0.79      0.78     11880



#### Stochastic Gradient Descent Classifier

In [19]:
# Stochastic Gradient Descent Classifier: test accuracy - 0.72

sgd = SGDClassifier(max_iter=5, tol=None)
sgd.fit(X_train, y_train)

SGDClassifier(alpha=0.0001, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
              l1_ratio=0.15, learning_rate='optimal', loss='hinge', max_iter=5,
              n_iter_no_change=5, n_jobs=None, penalty='l2', power_t=0.5,
              random_state=None, shuffle=True, tol=None,
              validation_fraction=0.1, verbose=0, warm_start=False)

In [20]:
y_pred_sgd = sgd.predict(X_test)

In [30]:
sgd_train_score = sgd.score(X_train, y_train)
sgd_test_acc = accuracy_score(y_test, y_pred_sgd)
sgd_test_precision = precision_score(y_test, y_pred_sgd, average='weighted')
sgd_test_recall = recall_score(y_test, y_pred_sgd, average='weighted')

In [21]:
print(classification_report(y_test, y_pred_sgd))

                         precision    recall  f1-score   support

             functional       0.69      0.93      0.79      6457
functional needs repair       0.36      0.02      0.03       851
         non functional       0.81      0.57      0.67      4572

               accuracy                           0.72     11880
              macro avg       0.62      0.50      0.50     11880
           weighted avg       0.72      0.72      0.69     11880



#### Support Vector Classifier 

In [22]:
# Linear Support Vector Classifier: test accuracy - 0.73 (why this version of SVC?)

linear_svc = LinearSVC()
linear_svc.fit(X_train, y_train)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0)

In [23]:
y_pred_linear_svc = linear_svc.predict(X_test)

In [32]:
linear_svc_train_score = linear_svc.score(X_train, y_train)
linear_svc_test_acc = accuracy_score(y_test, y_pred_linear_svc)
linear_svc_test_precision = precision_score(y_test, y_pred_linear_svc, average='weighted')
linear_svc_test_recall = recall_score(y_test, y_pred_linear_svc, average='weighted')

In [24]:
print(classification_report(y_test, y_pred_linear_svc))

                         precision    recall  f1-score   support

             functional       0.70      0.92      0.79      6457
functional needs repair       0.00      0.00      0.00       851
         non functional       0.80      0.59      0.68      4572

               accuracy                           0.73     11880
              macro avg       0.50      0.50      0.49     11880
           weighted avg       0.69      0.73      0.69     11880



#### Decision Tree Classifier 

In [25]:
# Decision Tree Classifier: test accuracy - 0.75

dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [26]:
y_pred_dt = dt.predict(X_test)

In [33]:
dt_train_score = dt.score(X_train, y_train)
dt_test_acc = accuracy_score(y_test, y_pred_dt)
dt_test_precision = precision_score(y_test, y_pred_dt, average='weighted')
dt_test_recall = recall_score(y_test, y_pred_dt, average='weighted')

In [27]:
print(classification_report(y_test, y_pred_dt))

                         precision    recall  f1-score   support

             functional       0.79      0.79      0.79      6457
functional needs repair       0.36      0.38      0.37       851
         non functional       0.76      0.76      0.76      4572

               accuracy                           0.75     11880
              macro avg       0.64      0.64      0.64     11880
           weighted avg       0.75      0.75      0.75     11880



In [34]:
# summary of results

results = pd.DataFrame({
    'Model':['Logistic Regression', 'Random Forest Classifier', 'SGD Classifier', 
              'Linear SVC', 'Decision Tree'],
    'Training Score':[logreg_train_score, rf_train_score, sgd_train_score,
                      linear_svc_train_score, dt_train_score],
    'Accuracy':[logreg_test_acc, rf_test_acc, sgd_test_acc, linear_svc_test_acc, dt_test_acc],
    'Precision':[logreg_test_precision, rf_test_precision, sgd_test_precision, linear_svc_test_precision,
                 dt_test_precision],
    'Recall':[logreg_test_recall, rf_test_recall, sgd_test_recall, linear_svc_test_recall, dt_test_recall]})
results_df = results.sort_values(by='Accuracy', ascending=False)
results_df = results_df.set_index('Model')
results_df

Unnamed: 0_level_0,Training Score,Accuracy,Precision,Recall
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Random Forest Classifier,0.99354,0.789141,0.78255,0.789141
Decision Tree,0.993582,0.748316,0.750727,0.748316
Logistic Regression,0.731376,0.729798,0.7276,0.729798
Linear SVC,0.727315,0.727609,0.687326,0.727609
SGD Classifier,0.724832,0.724832,0.716047,0.724832


In [35]:
# Attn: accuracy, precision and recall values here are weighted averages of all the 3 classes 
# for individual classes, refer to classification reports above 

In [36]:
# Overall, Random Forest Classifiers appears to be the best
# Will use this for subsequent analysis