# Creating a logistic regression to predict Weekend trips

## Import the relevant libraries

In [1]:
import pandas as pd
import numpy as np

## Load the data

In [6]:
bike = pd.read_csv('New Data.csv')

In [7]:
bike = bike.dropna()

## Creating the targets

In [8]:
bike['wkend_ct'].median()

106.0

In [9]:
# Label a row 1 when its wkend_ct is strictly above the column median, otherwise 0.
targets = np.where(bike['wkend_ct'].gt(bike['wkend_ct'].median()), 1, 0)

In [10]:
targets

array([1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0,
       1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0,
       0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1,
       1, 0, 1, 0, 1])

In [11]:
bike['Excessive wkend_ct'] = targets

In [12]:
bike.head()

Unnamed: 0,Location,wkend_ct,LdUse,Tdir,RdWd,BikeInfra,SegNum,StLength,Excessive wkend_ct
0,49 Ave,120,IM,W,30,0,30,10891,1
1,46 Rd,73,MX,W,30,0,20,7364,0
2,44 Rd,12,U,A,30,0,9,2792,0
3,Vernon Blvd,566,R,T,42,1,79,18112,1
4,Broadway (1),1303,R,T,40,0,145,24855,1


# Dummy Variables

In [13]:
# Encode Tdir as a 0/1 flag: 'T' -> 1, 'W' and 'A' -> 0.
bike['Tdir'] = bike['Tdir'].map({'T': 1, 'W': 0, 'A': 0})
# Replace each LdUse code with an integer label.
# Any value that is not a key of the dict is turned into NaN by Series.map.
bike['LdUse'] = bike['LdUse'].map({
    'CR': 0,
    'IM': 1,
    'MX': 2,
    'OS': 3,
    'R': 4,
    'U': 5,
})

In [14]:
bike.head()

Unnamed: 0,Location,wkend_ct,LdUse,Tdir,RdWd,BikeInfra,SegNum,StLength,Excessive wkend_ct
0,49 Ave,120,1,0,30,0,30,10891,1
1,46 Rd,73,2,0,30,0,20,7364,0
2,44 Rd,12,5,0,30,0,9,2792,0
3,Vernon Blvd,566,4,1,42,1,79,18112,1
4,Broadway (1),1303,4,1,40,0,145,24855,1


In [15]:
# Values of Tdir / LdUse that were not keys of the mapping dicts became NaN;
# remove those rows before modelling.
bike = bike.dropna()
# `targets` was computed before this drop. Re-derive it from the column stored
# on the frame so that it always has one entry per surviving row, in row order.
targets = bike['Excessive wkend_ct'].values

## Checking the balance of the targets

In [16]:
targets.sum() / targets.shape[0]

0.4946236559139785

In [53]:
# Build a new frame without the target column, the raw count it was derived
# from, and the free-text Location label.
data_with_targets = bike.drop(columns=['Excessive wkend_ct', 'wkend_ct', 'Location'])

In [54]:
data_with_targets is bike

False

In [55]:
data_with_targets.head()

Unnamed: 0,LdUse,Tdir,RdWd,BikeInfra,SegNum,StLength
0,1,0,30,0,30,10891
1,2,0,30,0,20,7364
2,5,0,30,0,9,2792
3,4,1,42,1,79,18112
4,4,1,40,0,145,24855


## Selecting the inputs for the regression

In [56]:
data_with_targets.shape

(93, 6)

In [57]:
data_with_targets.iloc[:,:-1]

Unnamed: 0,LdUse,Tdir,RdWd,BikeInfra,SegNum
0,1,0,30,0,30
1,2,0,30,0,20
2,5,0,30,0,9
3,4,1,42,1,79
4,4,1,40,0,145
5,2,1,50,0,120
6,4,1,44,0,140
7,4,1,34,3,244
8,4,1,42,0,120
9,4,1,50,0,145


In [58]:
# Keep every column except the last one as the model inputs.
# NOTE(review): the target columns were already removed when data_with_targets
# was built, so the last column dropped here is StLength (a feature), not a
# target. Confirm that leaving StLength out of the model is intentional.
unscaled_inputs = data_with_targets.iloc[:,:-1]

## Standardizing the data

In [59]:
from sklearn.preprocessing import StandardScaler

# NOTE: this plain StandardScaler is never fitted; wkday_scaler is rebound to a
# CustomScaler in a later cell before fit/transform are called.
wkday_scaler = StandardScaler()

In [24]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler

class CustomScaler(BaseEstimator, TransformerMixin):
    """Standardize a chosen subset of DataFrame columns, leaving the rest as-is.

    Wraps a ``StandardScaler`` that is fitted on, and applied to, only the
    columns named in ``columns``. All other columns pass through ``transform``
    unchanged, and the original column order is restored in the result.

    Parameters
    ----------
    columns : list of str
        Names of the columns to standardize.
    copy, with_mean, with_std : bool, default True
        Forwarded to the wrapped ``StandardScaler``.

    Attributes
    ----------
    scaler : StandardScaler
        The wrapped scaler.
    mean_, var_ : pandas.Series or None
        Per-column mean and population variance (``ddof=0``) of the selected
        columns, set by ``fit``; ``None`` before fitting.
    """

    def __init__(self, columns, copy=True, with_mean=True, with_std=True):
        # Store every constructor argument under its own name so that
        # BaseEstimator.get_params(), repr() and clone() see the real settings
        # instead of missing attributes.
        self.columns = columns
        self.copy = copy
        self.with_mean = with_mean
        self.with_std = with_std
        # Pass by keyword: StandardScaler's options are keyword-only in
        # current scikit-learn releases.
        self.scaler = StandardScaler(copy=copy, with_mean=with_mean, with_std=with_std)
        self.mean_ = None
        self.var_ = None

    def fit(self, X, y=None):
        """Fit the wrapped scaler on ``X[self.columns]`` and return ``self``."""
        selected = X[self.columns]
        self.scaler.fit(selected, y)
        # Explicit per-column reductions; np.mean/np.var on a DataFrame reduce
        # differently depending on the pandas version.
        self.mean_ = selected.mean()
        self.var_ = selected.var(ddof=0)
        return self

    def transform(self, X, y=None, copy=None):
        """Return a copy of ``X`` with ``self.columns`` standardized.

        ``y`` and ``copy`` are accepted for signature compatibility and unused.
        """
        init_col_order = X.columns
        # Reuse X's index: pd.concat(axis=1) aligns on index labels, so a fresh
        # 0..n-1 index would misalign rows (and introduce NaNs) whenever X has
        # been filtered or shuffled.
        X_scaled = pd.DataFrame(
            self.scaler.transform(X[self.columns]),
            columns=self.columns,
            index=X.index,
        )
        X_not_scaled = X.loc[:, ~X.columns.isin(self.columns)]
        return pd.concat([X_not_scaled, X_scaled], axis=1)[init_col_order]

In [60]:
unscaled_inputs.columns.values

array(['LdUse', 'Tdir', 'RdWd', 'BikeInfra', 'SegNum'], dtype=object)

In [61]:
columns_to_omit = ['LdUse', 'Tdir', 'BikeInfra']

In [62]:
# Every input column that is not explicitly omitted gets standardized.
columns_to_scale = [col for col in unscaled_inputs.columns if col not in columns_to_omit]

In [63]:
wkday_scaler = CustomScaler(columns_to_scale)

In [64]:
wkday_scaler.fit(unscaled_inputs)

CustomScaler(columns=['RdWd', 'SegNum'], copy=None, with_mean=None,
       with_std=None)

In [65]:
scaled_inputs = wkday_scaler.transform(unscaled_inputs)

In [66]:
scaled_inputs

Unnamed: 0,LdUse,Tdir,RdWd,BikeInfra,SegNum
0,1,0,-0.843007,0,-0.764842
1,2,0,-0.843007,0,-0.909298
2,5,0,-0.843007,0,-1.068200
3,4,1,0.364690,1,-0.057006
4,4,1,0.163407,0,0.896405
5,2,1,1.169822,0,0.535265
6,4,1,0.565973,0,0.824177
7,4,1,-0.440442,3,2.326522
8,4,1,0.364690,0,0.535265
9,4,1,1.169822,0,0.896405


In [67]:
scaled_inputs.shape

(93, 5)

## Splitting the data into train & test and shuffling

In [68]:
from sklearn.model_selection import train_test_split

### Split

In [69]:
train_test_split(scaled_inputs, targets)

[    LdUse  Tdir      RdWd  BikeInfra    SegNum
 55      1     1  3.182651          0  0.101896
 38      5     0  0.565973          2  0.694167
 82      1     0 -0.843007          0  0.029668
 34      4     0 -0.843007          2 -0.533712
 68      0     1  1.169822          2 -0.143680
 0       1     0 -0.843007          0 -0.764842
 41      4     0 -1.446856          3 -0.374810
 10      4     0 -0.843007          0 -0.057006
 83      2     1 -0.843007          0 -0.519266
 33      0     1  0.364690          1 -1.053754
 49      4     1  0.163407          0 -0.707059
 54      0     0 -0.843007          0 -0.302581
 17      3     0 -1.044290          0 -1.097091
 73      3     0 -0.239159          0  3.424390
 26      1     1  0.364690          0  0.491928
 77      3     0  1.572387          0  0.665275
 56      4     0 -0.440442          0 -0.764842
 88      4     1  0.163407          3 -0.114788
 81      3     1  1.169822          2  0.174124
 91      4     0 -0.843007          0 -0

In [70]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# shuffled split reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(
    scaled_inputs,
    targets,
    test_size=0.2,
    random_state=20,
)

In [71]:
print (x_train.shape, y_train.shape)

(74, 5) (74,)


In [72]:
print (x_test.shape, y_test.shape)

(19, 5) (19,)


## Logistic regression with sklearn

In [73]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

### Training the model

In [74]:
reg = LogisticRegression()

In [75]:
reg.fit(x_train,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [76]:
reg.score(x_train,y_train)

0.7702702702702703

### Finding the intercept and coefficients

In [77]:
reg.intercept_

array([-0.77841962])

In [78]:
reg.coef_

array([[0.13369064, 0.17418742, 0.59865063, 0.62191346, 0.45473504]])

In [79]:
unscaled_inputs.columns.values

array(['LdUse', 'Tdir', 'RdWd', 'BikeInfra', 'SegNum'], dtype=object)

In [80]:
feature_name = unscaled_inputs.columns.values

In [84]:
# One row per input feature, paired with its fitted coefficient.
summary_table = pd.DataFrame({'Feature name': feature_name})
# reg.coef_ has shape (1, n_features); flatten it to one value per feature.
summary_table['Coefficient'] = reg.coef_.ravel()

summary_table

Unnamed: 0,Feature name,Coefficient
0,LdUse,0.133691
1,Tdir,0.174187
2,RdWd,0.598651
3,BikeInfra,0.621913
4,SegNum,0.454735


In [85]:
# Shift the existing rows to index 1..n so that index 0 is free for the intercept.
summary_table.index = summary_table.index + 1
# Add the intercept under label 0; .loc with a new label appends the row at the end.
summary_table.loc[0] = ['Intercept', reg.intercept_[0]]
# Sorting by index moves the intercept row to the top of the table.
summary_table = summary_table.sort_index()
summary_table

Unnamed: 0,Feature name,Coefficient
0,Intercept,-0.77842
1,LdUse,0.133691
2,Tdir,0.174187
3,RdWd,0.598651
4,BikeInfra,0.621913
5,SegNum,0.454735


## Interpreting the coefficients

In [86]:
summary_table['Odds_ratio'] = np.exp(summary_table.Coefficient)

In [87]:
summary_table

Unnamed: 0,Feature name,Coefficient,Odds_ratio
0,Intercept,-0.77842,0.459131
1,LdUse,0.133691,1.143039
2,Tdir,0.174187,1.190279
3,RdWd,0.598651,1.819662
4,BikeInfra,0.621913,1.862488
5,SegNum,0.454735,1.575756


In [88]:
summary_table.sort_values('Odds_ratio', ascending=False)

Unnamed: 0,Feature name,Coefficient,Odds_ratio
4,BikeInfra,0.621913,1.862488
3,RdWd,0.598651,1.819662
5,SegNum,0.454735,1.575756
2,Tdir,0.174187,1.190279
1,LdUse,0.133691,1.143039
0,Intercept,-0.77842,0.459131


## Testing the model

In [None]:
reg.score(x_test,y_test)

In [None]:
predicted_proba = reg.predict_proba(x_test)
predicted_proba

In [None]:
predicted_proba.shape

In [None]:
predicted_proba[:,1]

## Save the model

In [None]:
import pickle

In [None]:
with open('model', 'wb') as file:
    pickle.dump(reg, file)

In [None]:
with open('scaler','wb') as file:
    pickle.dump(wkday_scaler, file)