# Goal: Take Reagan's linear regression code and apply it to the RSS data 

## Process:
Reagan's linear regression was performed on the RGG data. To apply his code to the RSS data, I had to clean the RSS data in the following ways:
1. Rename some of the columns to match the naming conventions used for the RGG data.
2. Convert the Time column to floats (pandas reads the original column as the `object` dtype because a few entries are not valid numbers).

## Output: 
This notebook outputs an Excel file (`RSS_regression_table.xlsx`) that lists the observed bin combinations and the estimated time to service each combination.


In [None]:
# Standard library
import random
import warnings

# Third-party
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression

# Notebook-wide settings: silence library warnings and raise pandas' display limits.
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 200)


In [None]:
! pip install openpyxl

You should consider upgrading via the '/opt/venv/bin/python -m pip install --upgrade pip' command.[0m


# Read Data

In [None]:
# Load the RSS observations; these are the rows the regression is fitted on.
df = pd.read_excel(
    'RSS Master Data File.xlsx',
    parse_dates=True,
    engine='openpyxl',
)
df.columns

Index(['Date', 'Day', 'Route', 'Truck #', 'Vehicle Type', 'Commodity',
       'Tipper', 'Sequence #', 'Address #', 'Apt.#', 'Street', 'Even/Odd',
       'Meandor', 'I or C?', 'Time', 'Block Time', '#Units', 'Number of Stops',
       '16 gal', '20 gal', '32 gal', '64 gal', '96 gal', 'CCAN', '1 yd',
       '1.5 yd', '2 yd', '3 yd', '4 yd ', '5 yd ', '6 yd ', 'Cardboard Box',
       'Trash Bags', 'Hill or Flat?', 'Street Sweeping', 'Locked',
       'Common Notes', 'Additional Notes', 'GlobalID', 'x', 'y'],
      dtype='object')

In [None]:
# Load the RGG workbook so its column names can be compared with the RSS ones.
dfr = pd.read_excel(
    'RGG Master Data Collection All.xlsx',
    parse_dates=True,
    engine='openpyxl',
)
dfr.columns

Index([           'Date',             'Day',           'Route',
              'Truck # ',  'Vehicle Type  ',       'Commodity',
                'Tipper',      'Sequence #',       'Address #',
                 'Apt.#',          'Street',        'Even/Odd',
              'Meandor ',     'Inside/Curb',       'Time(Sec)',
             'Blocktime',          '#Units', 'Number of Stops',
                      16,                20,                32,
                      64,                96,            'CCAN',
                 '1 yrd',         '1.5 yrd',           '2 yrd',
                 '3 yrd',           '4 yrd',           '5 yrd',
                 '6 yrd',   'Cardboard Box',      'Trash Bags',
          'Total Volume',   'Neighborhood ',       'Hill/Flat',
       'Street Sweepng ',           'Notes',        'GlobalID',
                     'x',               'y',  'Data Collector'],
      dtype='object')

# Change Column Names to match Reagan's Column Names

In [None]:
# Rename the RSS columns to the labels the regression function looks up.
# Note that some source labels ('4 yd ', '5 yd ', '6 yd ') carry a trailing space.
df = df.rename(columns={
    'I or C?': 'Inside/Curb',
    'Time': 'Time(Sec)',
    # Cart sizes (gallons)
    '16 gal': '16',
    '20 gal': '20',
    '32 gal': '32',
    '64 gal': '64',
    '96 gal': '96',
    # Bin sizes (yards)
    '1 yd': '1yd',
    '1.5 yd': '1.5yd',
    '2 yd': '2yd',
    '3 yd': '3yd',
    '4 yd ': '4yd',
    '5 yd ': '5yd',
    '6 yd ': '6yd',
})
df.columns

Index(['Date', 'Day', 'Route', 'Truck #', 'Vehicle Type', 'Commodity',
       'Tipper', 'Sequence #', 'Address #', 'Apt.#', 'Street', 'Even/Odd',
       'Meandor', 'Inside/Curb', 'Time(Sec)', 'Block Time', '#Units',
       'Number of Stops', '16', '20', '32', '64', '96', 'CCAN', '1yd', '1.5yd',
       '2yd', '3yd', '4yd', '5yd', '6yd', 'Cardboard Box', 'Trash Bags',
       'Hill or Flat?', 'Street Sweeping', 'Locked', 'Common Notes',
       'Additional Notes', 'GlobalID', 'x', 'y'],
      dtype='object')

# Clean the RSS data

In [None]:
# Observation from df.info(): Time(Sec) has the object dtype in the RSS frame.
# Observation from dfr.info(): Time(Sec) is float in the RGG frame.

# List the distinct values to find the entries that stop RSS Time(Sec) from being numeric.
df['Time(Sec)'].unique()


array([118.,  59.,  86.,  41.,  31.,  35.,  50.,  47.,  26.,  82., 112.,
        77.,  97., 149., 130., 116.,  54.,  73., 121., 128., 117.,  91.,
       323.,  53., 102.,  66., 123.,  56., 105.,  45.,  34.,  62.,  39.,
        72.,  22.,  92.,  49.,  44., 110.,  52.,  64.,  24.,  21.,  25.,
        51.,  20., 120., 189.,  46.,  27., 188.,  71.,  83.,  74., 127.,
       213., 137.,  36.,  55.,  33., 133., 138.,  98.,  40., 181.,  80.,
        76.,  32.,  38.,  48.,  68.,  23.,  57.,  88.,  42.,  75.,  29.,
        84.,  96., 168.,  28.,  61.,  37., 136.,  63.,  69.,  30.,  58.,
        87., 107., 166.,  nan,  65.,  89., 140., 141.,  43., 195.,  95.,
        90., 164., 296.,  60., 101., 122., 277.,  93., 142., 113., 150.,
       124., 152., 160., 169., 129., 230., 114., 162., 151., 100.,  19.,
       196.,  18., 250., 131.,  67.,  81., 125.,  12., 203.,  17., 104.,
       305.,  85., 109., 135., 363.,  78., 211., 200., 199.,  94., 173.,
       139., 111.,  70.,  79.,  15., 143., 103., 17

In [None]:
# Number of rows with no recorded service time.
int(df['Time(Sec)'].isnull().sum())

18

In [None]:
# Open question: should the malformed entry '2\n49' be read as 249 seconds
# or as 2 min 49 sec (169 sec)?
# NOTE(review): the saved output of this cell shows no matching rows - confirm the
# exact stored value before deciding how to treat it.
df.loc[df['Time(Sec)'].eq('2\n49')]

Unnamed: 0,Date,Day,Route,Truck #,Vehicle Type,Commodity,Tipper,Sequence #,Address #,Apt.#,Street,Even/Odd,Meandor,Inside/Curb,Time(Sec),Block Time,#Units,Number of Stops,16,20,32,64,96,CCAN,1yd,1.5yd,2yd,3yd,4yd,5yd,6yd,Cardboard Box,Trash Bags,Hill or Flat?,Street Sweeping,Locked,Common Notes,Additional Notes,GlobalID,x,y


In [None]:
# Convert Time(Sec) to numbers. errors='coerce' replaces every entry that cannot be
# parsed (such as the malformed value inspected earlier) with NaN instead of raising.
df['Time(Sec)'] = pd.to_numeric(df['Time(Sec)'], errors='coerce')

In [None]:
# Re-list the distinct Time(Sec) values after the numeric conversion.
df['Time(Sec)'].unique()

array([118.,  59.,  86.,  41.,  31.,  35.,  50.,  47.,  26.,  82., 112.,
        77.,  97., 149., 130., 116.,  54.,  73., 121., 128., 117.,  91.,
       323.,  53., 102.,  66., 123.,  56., 105.,  45.,  34.,  62.,  39.,
        72.,  22.,  92.,  49.,  44., 110.,  52.,  64.,  24.,  21.,  25.,
        51.,  20., 120., 189.,  46.,  27., 188.,  71.,  83.,  74., 127.,
       213., 137.,  36.,  55.,  33., 133., 138.,  98.,  40., 181.,  80.,
        76.,  32.,  38.,  48.,  68.,  23.,  57.,  88.,  42.,  75.,  29.,
        84.,  96., 168.,  28.,  61.,  37., 136.,  63.,  69.,  30.,  58.,
        87., 107., 166.,  nan,  65.,  89., 140., 141.,  43., 195.,  95.,
        90., 164., 296.,  60., 101., 122., 277.,  93., 142., 113., 150.,
       124., 152., 160., 169., 129., 230., 114., 162., 151., 100.,  19.,
       196.,  18., 250., 131.,  67.,  81., 125.,  12., 203.,  17., 104.,
       305.,  85., 109., 135., 363.,  78., 211., 200., 199.,  94., 173.,
       139., 111.,  70.,  79.,  15., 143., 103., 17

In [None]:
# Confirm the column dtypes and non-null counts after cleaning.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1256 entries, 0 to 1255
Data columns (total 41 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Date              1256 non-null   object 
 1   Day               1256 non-null   int64  
 2   Route             1256 non-null   int64  
 3   Truck #           1256 non-null   int64  
 4   Vehicle Type      1256 non-null   object 
 5   Commodity         1256 non-null   object 
 6   Tipper            1256 non-null   int64  
 7   Sequence #        1256 non-null   int64  
 8   Address #         1225 non-null   object 
 9   Apt.#             3 non-null      object 
 10  Street            1256 non-null   object 
 11  Even/Odd          12 non-null     object 
 12  Meandor           3 non-null      object 
 13  Inside/Curb       1256 non-null   object 
 14  Time(Sec)         1238 non-null   float64
 15  Block Time        15 non-null     float64
 16  #Units            1240 non-null   float64


In [None]:
# List the service-location codes present in the RSS data.
df['Inside/Curb'].unique()

array(['C', 'I', 'IC', 'CL'], dtype=object)

In [None]:
# TODO: What is CL? The code is not explained anywhere in this notebook.
# linear() below only keeps the 'C', 'I' and 'IC' groups, so 'CL' rows are left
# out of the regression table.

#df[df['Inside/Curb']== 'CL'] # Happens 13 times 

# Run Reagan's Linear Regression Function

In [None]:
def _holdout_predictions(frame, features, target, train_frac, random_state):
    """Fit a linear model on a random subset of ``frame`` and predict the remaining rows.

    Returns ``(y_test, test_predictions)`` for the rows not drawn into the training split.
    """
    train = frame.sample(frac=train_frac, random_state=random_state)
    test = frame.drop(train.index)
    model = LinearRegression().fit(train[features], train[target])
    return test[target], model.predict(test[features])


def linear(df, large_count=0, train_frac=0.80, random_state=2):
    """Fit per-service-type linear models of mean service time on bin counts.

    Rows of ``df`` are grouped by 'Inside/Curb' plus the twelve bin-size columns;
    each group keeps the mean of 'Time(Sec)' and its row count. Separate
    ``LinearRegression`` models are then fitted for the curb ('C') and inside ('I')
    groups. 'IC' groups are passed through to the output table without a prediction,
    and any other 'Inside/Curb' value is left out.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'Inside/Curb', 'Time(Sec)' and the bin columns
        '16', '20', '32', '64', '96', '1yd', '1.5yd', '2yd', '3yd', '4yd', '5yd', '6yd'.
    large_count : int, default 0
        Minimum number of observations a group needs to be used for fitting.
        Groups below the threshold have their mean time replaced by the prediction.
        With the default of 0 every group is used and nothing is replaced.
    train_frac : float, default 0.80
        Fraction of the fitting groups sampled into the training split of the
        hold-out evaluation.
    random_state : int, default 2
        Seed for the train/test sampling.

    Returns
    -------
    tuple
        ``(df_c, df_c_large, df_all, reg1_pred, reg2_pred, y1, y2,
        Y1_test, Y2_test, reg1_pred_test, reg2_pred_test)`` where

        * ``df_c`` - curb groups, including 'count' and the added 'pred' column;
        * ``df_c_large`` - curb groups with ``count >= large_count`` (no 'pred' column);
        * ``df_all`` - curb, inside and 'IC' groups with columns 'Inside/Curb', the bin
          columns, 'Time(Sec)', 'pred' and 'sum' (row total of the bin columns),
          restricted to rows whose 'sum' is positive; 'pred' is NaN for 'IC' rows;
        * ``reg1_pred`` / ``reg2_pred`` - predictions of the curb / inside model for
          every curb / inside group;
        * ``y1`` / ``y2`` - mean times of the curb / inside groups used for fitting
          (same length as the predictions only when ``large_count`` filters nothing);
        * ``Y1_test`` / ``Y2_test`` and ``reg1_pred_test`` / ``reg2_pred_test`` - held-out
          mean times and the matching predictions of models fitted on the training split.
    """
    vol_list = ['16', '20', '32', '64', '96', '1yd', '1.5yd', '2yd', '3yd', '4yd', '5yd', '6yd']
    target = 'Time(Sec)'

    # One row per (service type, bin combination) with its mean time and sample size.
    df_group = df.groupby(['Inside/Curb'] + vol_list)[target].agg(['mean', 'count']).reset_index()
    df_group = df_group.sort_values(['Inside/Curb']).rename(columns={'mean': target})
    for vol in vol_list:
        df_group[vol] = pd.to_numeric(df_group[vol], errors='coerce')
    # Drops groups with a non-numeric bin count or no usable mean time.
    df_group = df_group.dropna()

    # Explicit copies: 'pred' is added to these frames below, and assigning onto a
    # filtered slice would otherwise trigger chained-assignment warnings.
    service = df_group['Inside/Curb']
    df_c = df_group[service == 'C'].copy()
    df_i = df_group[service == 'I'].copy()
    df_ic = df_group[service == 'IC']

    # Taken before 'pred' is added, so these frames keep the original columns/values.
    df_c_large = df_c[df_c['count'] >= large_count]
    df_i_large = df_i[df_i['count'] >= large_count]
    y1 = df_c_large[target]
    y2 = df_i_large[target]

    # Models fitted on all qualifying groups, used to fill the regression table.
    reg1 = LinearRegression().fit(df_c_large[vol_list], y1)
    reg2 = LinearRegression().fit(df_i_large[vol_list], y2)
    reg1_pred = reg1.predict(df_c[vol_list])
    reg2_pred = reg2.predict(df_i[vol_list])

    # Hold-out evaluation on a reproducible random split.
    Y1_test, reg1_pred_test = _holdout_predictions(
        df_c_large, vol_list, target, train_frac, random_state)
    Y2_test, reg2_pred_test = _holdout_predictions(
        df_i_large, vol_list, target, train_frac, random_state)

    df_c['pred'] = reg1_pred
    df_i['pred'] = reg2_pred
    # Sparse groups: trust the model more than the observed mean.
    df_c.loc[df_c['count'] < large_count, target] = df_c['pred']
    df_i.loc[df_i['count'] < large_count, target] = df_i['pred']

    df_all = pd.concat([df_c, df_i, df_ic])
    df_all = df_all[['Inside/Curb'] + vol_list + [target, 'pred']].copy()
    df_all['sum'] = df_all[vol_list].sum(axis=1)
    # Keep only combinations that contain at least one bin.
    df_all = df_all[df_all['sum'] > 0]
    return df_c,df_c_large,df_all,reg1_pred,reg2_pred,y1,y2,Y1_test,Y2_test,reg1_pred_test,reg2_pred_test

In [None]:
# Fit the curb (1) and inside (2) models on the cleaned RSS data and unpack every result.
(
    df_c,
    df_c_large,
    df_all,
    reg1_pred,
    reg2_pred,
    y1,
    y2,
    Y1_test,
    Y2_test,
    reg1_pred_test,
    reg2_pred_test,
) = linear(df)

In [None]:
# Number of distinct curb bin combinations.
len(df_c)

117

In [None]:
# Number of curb combinations used for fitting; equal to len(df_c) while linear()
# uses a large_count threshold of 0, because the count filter then keeps every group.
len(df_c_large)

117

In [None]:
# Write the regression table to disk. `regression` is never defined in this notebook,
# so the original cell raised NameError on a fresh kernel; df_all is the combined
# table returned by linear().
# NOTE(review): df_all is presumably the table that was meant here - confirm.
df_all.to_excel('RSS_regression_table.xlsx', index=False)

In [None]:
## 

## Checking model performance

In [None]:
# In-sample root mean square error of the inside model (reg2).
rmse = np.sqrt(np.mean((reg2_pred - y2) ** 2))
print("Root mean square error {:.2f}".format(rmse))

In [None]:
# Inside model (reg2): in-sample accuracy, defined as 100 * (1 - mean absolute percentage error).
mape = np.mean(np.abs(reg2_pred - y2) / y2)
accuracy = 100 * (1 - mape)
print("Accuracy of the model {:.2f}".format(accuracy))

Accuracy of the model 81.23


In [None]:
# Side-by-side view of observed mean times and inside-model predictions.
final = pd.DataFrame({'Actual': y2}).assign(Prediction=reg2_pred)
final

Unnamed: 0,Actual,Prediction
138,158.0,156.362087
149,112.333333,126.500724
148,170.0,154.218093
150,114.0,169.160014
147,153.727273,108.840341
146,92.5,83.841434
145,273.0,208.830167
144,106.0,148.510494
143,98.0,123.511587
142,157.5,155.849018


In [None]:
# Curb model (reg1): in-sample accuracy, defined as 100 * (1 - mean absolute percentage error).
mape_2 = np.mean(np.abs(reg1_pred - y1) / y1)
accuracy_2 = 100 * (1 - mape_2)
print("Accuracy of the model {:.2f}".format(accuracy_2))

Accuracy of the model 74.52


In [None]:
# Side-by-side view of observed mean times and curb-model predictions.
final = pd.DataFrame({'Actual': y1}).assign(Prediction=reg1_pred)
final

Unnamed: 0,Actual,Prediction
0,39.035714,8.221372
87,76.0,109.736411
86,109.0,97.79966
85,87.666667,73.926159
84,118.0,102.105713
83,65.0,90.168963
82,49.0,78.232212
81,122.0,66.295462
79,79.5,94.475016
78,69.0,82.538265


In [None]:
## Out of sample testing

# linear() derives Y1_test / reg1_pred_test from the curb ('C') rows, so this is the
# curb model's held-out accuracy. The names and column labels use the C suffix
# (they previously said I, which mislabelled the result as the inside model).
sample_df_C=pd.DataFrame({'Actual_C':Y1_test,'Predic_C':reg1_pred_test})
mape_C=np.average(np.abs((reg1_pred_test-Y1_test))/Y1_test)
accuracy_C=(1-mape_C)*100
print("Accuracy of the model {:.2f}".format(accuracy_C))

Accuracy of the model 76.62


In [None]:
# linear() derives Y2_test / reg2_pred_test from the inside ('I') rows, so this is the
# inside model's held-out accuracy. The names and column labels use the I suffix
# (they previously said C, which mislabelled the result as the curb model).
sample_df_I=pd.DataFrame({'Actual_I':Y2_test,'Predic_I':reg2_pred_test})
mape_I=np.average(np.abs((reg2_pred_test-Y2_test))/Y2_test)
accuracy_I=(1-mape_I)*100
print("Accuracy of the model {:.2f}".format(accuracy_I))

Accuracy of the model 79.43


<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=938c6ad9-491d-4307-bf8a-c751a244ce4f' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>