# Creating a logistic regression to predict Weekend trips

## Import the relevant libraries

In [1]:
import pandas as pd
import numpy as np

## Load the data

In [2]:
bike = pd.read_csv('New Data.csv')

In [3]:
bike = bike.dropna()

## Creating the targets

In [4]:
bike['wkend_ct'].median()

106.0

In [5]:
targets = np.where(bike['wkend_ct'] > 
                   bike['wkend_ct'].median(), 1, 0)

In [6]:
targets

array([1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0,
       1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0,
       0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1,
       1, 0, 1, 0, 1])

In [7]:
bike['Excessive wkend_ct'] = targets

In [8]:
bike.head()

Unnamed: 0,Location,wkend_ct,LdUse,Tdir,RdWd,BikeInfra,SegNum,StLength,Excessive wkend_ct
0,49 Ave,120,IM,W,30,0,30,10891,1
1,46 Rd,73,MX,W,30,0,20,7364,0
2,44 Rd,12,U,A,30,0,9,2792,0
3,Vernon Blvd,566,R,T,42,1,79,18112,1
4,Broadway (1),1303,R,T,40,0,145,24855,1


# Dummy Variables

In [9]:
bike['Tdir'] = bike ['Tdir'].map ({'T':1, 'W':0, 'A':0})
bike ['LdUse'] = bike ['LdUse'].map ({'CR':0, 'IM':1, 'MX':2, 'OS':3, 'R': 4, 'U':5 })

In [10]:
bike.head()

Unnamed: 0,Location,wkend_ct,LdUse,Tdir,RdWd,BikeInfra,SegNum,StLength,Excessive wkend_ct
0,49 Ave,120,1,0,30,0,30,10891,1
1,46 Rd,73,2,0,30,0,20,7364,0
2,44 Rd,12,5,0,30,0,9,2792,0
3,Vernon Blvd,566,4,1,42,1,79,18112,1
4,Broadway (1),1303,4,1,40,0,145,24855,1


In [11]:
bike = bike.dropna()

## Check stop on the targets

In [12]:
targets.sum() / targets.shape[0]

0.4946236559139785

In [13]:
data_with_targets = bike.drop(['wkend_ct','Location'], axis=1)

In [14]:
data_with_targets is bike

False

In [15]:
data_with_targets.head()

Unnamed: 0,LdUse,Tdir,RdWd,BikeInfra,SegNum,StLength,Excessive wkend_ct
0,1,0,30,0,30,10891,1
1,2,0,30,0,20,7364,0
2,5,0,30,0,9,2792,0
3,4,1,42,1,79,18112,1
4,4,1,40,0,145,24855,1


## Selecting the inputs for the regression

In [16]:
data_with_targets.shape

(93, 7)

In [17]:
data_with_targets.iloc[:,:-1]

Unnamed: 0,LdUse,Tdir,RdWd,BikeInfra,SegNum,StLength
0,1,0,30,0,30,10891
1,2,0,30,0,20,7364
2,5,0,30,0,9,2792
3,4,1,42,1,79,18112
4,4,1,40,0,145,24855
5,2,1,50,0,120,21285
6,4,1,44,0,140,27152
7,4,1,34,3,244,49860
8,4,1,42,0,120,21285
9,4,1,50,0,145,24855


In [18]:
unscaled_inputs = data_with_targets.iloc[:,:-1]

## Standardizing the data

In [19]:
from sklearn.preprocessing import StandardScaler

wkday_scaler = StandardScaler()

In [20]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler

class CustomScaler(BaseEstimator,TransformerMixin): 
    
    def __init__(self,columns,copy=True,with_mean=True,with_std=True):
        self.scaler = StandardScaler(copy,with_mean,with_std)
        self.columns = columns
        self.mean_ = None
        self.var_ = None

    def fit(self, X, y=None):
        self.scaler.fit(X[self.columns], y)
        self.mean_ = np.mean(X[self.columns])
        self.var_ = np.var(X[self.columns])
        return self

    def transform(self, X, y=None, copy=None):
        init_col_order = X.columns
        X_scaled = pd.DataFrame(self.scaler.transform(X[self.columns]), columns=self.columns)
        X_not_scaled = X.loc[:,~X.columns.isin(self.columns)]
        return pd.concat([X_not_scaled, X_scaled], axis=1)[init_col_order]

In [21]:
unscaled_inputs.columns.values

array(['LdUse', 'Tdir', 'RdWd', 'BikeInfra', 'SegNum', 'StLength'],
      dtype=object)

In [22]:
columns_to_omit = ['LdUse', 'Tdir', 'BikeInfra']

In [23]:
columns_to_scale = [x for x in unscaled_inputs.columns.values if x not in columns_to_omit]

In [24]:
wkday_scaler = CustomScaler(columns_to_scale)

In [25]:
wkday_scaler.fit(unscaled_inputs)

CustomScaler(columns=['RdWd', 'SegNum', 'StLength'], copy=None,
       with_mean=None, with_std=None)

In [26]:
scaled_inputs = wkday_scaler.transform(unscaled_inputs)

In [27]:
scaled_inputs

Unnamed: 0,LdUse,Tdir,RdWd,BikeInfra,SegNum,StLength
0,1,0,-0.843007,0,-0.764842,-0.642388
1,2,0,-0.843007,0,-0.909298,-0.909192
2,5,0,-0.843007,0,-1.068200,-1.255046
3,4,1,0.364690,1,-0.057006,-0.096147
4,4,1,0.163407,0,0.896405,0.413935
5,2,1,1.169822,0,0.535265,0.143878
6,4,1,0.565973,0,0.824177,0.587694
7,4,1,-0.440442,3,2.326522,2.305466
8,4,1,0.364690,0,0.535265,0.143878
9,4,1,1.169822,0,0.896405,0.413935


In [28]:
scaled_inputs.shape

(93, 6)

## Spliting the data into train & test and shuffle

In [29]:
from sklearn.model_selection import train_test_split

### Split

In [30]:
train_test_split(scaled_inputs, targets)

[    LdUse  Tdir      RdWd  BikeInfra    SegNum  StLength
 17      3     0 -1.044290          0 -1.097091 -1.137567
 87      4     1  1.169822          0  0.535265  0.143878
 86      4     1  0.666614          0 -0.793733 -0.337231
 4       4     1  0.163407          0  0.896405  0.413935
 0       1     0 -0.843007          0 -0.764842 -0.642388
 56      4     0 -0.440442          0 -0.764842 -0.132608
 66      4     1 -0.843007          0  0.203015  0.452893
 24      4     1 -0.843007          0 -0.938189 -1.172364
 72      4     0 -2.050705          3 -0.519266 -0.589284
 67      4     1  1.169822          0 -0.808178 -1.069864
 78      4     0  1.572387          0  2.572098  1.736759
 74      4     0 -0.239159          0  3.424390  3.105575
 65      4     1 -0.843007          0 -0.663722 -0.570978
 82      1     0 -0.843007          0  0.029668  0.952762
 73      3     0 -0.239159          0  3.424390  3.105575
 30      1     1 -0.440442          0 -0.389255 -0.292751
 1       2    

In [31]:
x_train, x_test, y_train, y_test = train_test_split(scaled_inputs, targets, #train_size = 0.8, 
                                                                            test_size = 0.2, random_state = 20)

In [32]:
print (x_train.shape, y_train.shape)

(74, 6) (74,)


In [33]:
print (x_test.shape, y_test.shape)

(19, 6) (19,)


## Logistic regression with sklearn

In [34]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

### Training the model

In [35]:
reg = LogisticRegression()

In [36]:
reg.fit(x_train,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [37]:
reg.score(x_train,y_train)

0.7837837837837838

### Finding the intercept and coefficients

In [38]:
reg.intercept_

array([-0.79005379])

In [39]:
reg.coef_

array([[0.1342706 , 0.17697215, 0.60958173, 0.62840095, 0.15518114,
        0.36107829]])

In [40]:
unscaled_inputs.columns.values

array(['LdUse', 'Tdir', 'RdWd', 'BikeInfra', 'SegNum', 'StLength'],
      dtype=object)

In [41]:
feature_name = unscaled_inputs.columns.values

In [42]:
summary_table = pd.DataFrame (columns=['Feature name'], data = feature_name)

summary_table['Coefficient'] = np.transpose(reg.coef_)

summary_table

Unnamed: 0,Feature name,Coefficient
0,LdUse,0.134271
1,Tdir,0.176972
2,RdWd,0.609582
3,BikeInfra,0.628401
4,SegNum,0.155181
5,StLength,0.361078


In [43]:
summary_table.index = summary_table.index + 1
summary_table.loc[0] = ['Intercept', reg.intercept_[0]]
summary_table = summary_table.sort_index()
summary_table

Unnamed: 0,Feature name,Coefficient
0,Intercept,-0.790054
1,LdUse,0.134271
2,Tdir,0.176972
3,RdWd,0.609582
4,BikeInfra,0.628401
5,SegNum,0.155181
6,StLength,0.361078


## Interpreting the coefficients

In [44]:
summary_table['Odds_ratio'] = np.exp(summary_table.Coefficient)

In [45]:
summary_table

Unnamed: 0,Feature name,Coefficient,Odds_ratio
0,Intercept,-0.790054,0.45382
1,LdUse,0.134271,1.143702
2,Tdir,0.176972,1.193598
3,RdWd,0.609582,1.839662
4,BikeInfra,0.628401,1.874611
5,SegNum,0.155181,1.167869
6,StLength,0.361078,1.434876


In [46]:
summary_table.sort_values('Odds_ratio', ascending=False)

Unnamed: 0,Feature name,Coefficient,Odds_ratio
4,BikeInfra,0.628401,1.874611
3,RdWd,0.609582,1.839662
6,StLength,0.361078,1.434876
2,Tdir,0.176972,1.193598
5,SegNum,0.155181,1.167869
1,LdUse,0.134271,1.143702
0,Intercept,-0.790054,0.45382


## Testing the model

In [47]:
reg.score(x_test,y_test)

0.631578947368421

In [48]:
predicted_proba = reg.predict_proba(x_test)
predicted_proba

array([[0.06760488, 0.93239512],
       [0.24606179, 0.75393821],
       [0.77756234, 0.22243766],
       [0.77071772, 0.22928228],
       [0.55270035, 0.44729965],
       [0.7357985 , 0.2642015 ],
       [0.72963142, 0.27036858],
       [0.79601341, 0.20398659],
       [0.82058359, 0.17941641],
       [0.670188  , 0.329812  ],
       [0.3768892 , 0.6231108 ],
       [0.3259397 , 0.6740603 ],
       [0.50425796, 0.49574204],
       [0.13675703, 0.86324297],
       [0.37670073, 0.62329927],
       [0.31602135, 0.68397865],
       [0.74339605, 0.25660395],
       [0.77213652, 0.22786348],
       [0.76357902, 0.23642098]])

In [49]:
predicted_proba.shape

(19, 2)

In [50]:
predicted_proba[:,1]

array([0.93239512, 0.75393821, 0.22243766, 0.22928228, 0.44729965,
       0.2642015 , 0.27036858, 0.20398659, 0.17941641, 0.329812  ,
       0.6231108 , 0.6740603 , 0.49574204, 0.86324297, 0.62329927,
       0.68397865, 0.25660395, 0.22786348, 0.23642098])

In [51]:
summary_table1 = pd.DataFrame (columns=['Feature name'], data = feature_name)

summary_table['Coefficient'] = np.transpose()

summary_table

TypeError: transpose() missing 1 required positional argument: 'a'

## Save the model

In [None]:
import pickle

In [None]:
with open('model', 'wb') as file:
    pickle.dump(reg, file)

In [None]:
with open('scaler','wb') as file:
    pickle.dump(wkday_scaler, file)