# Purpose: This notebook combines the RSS and RGG collection data and writes the result to RGG_RSS_final.xlsx

In [None]:
# Start writing code here...
import warnings
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# NOTE(review): this suppresses every warning for the whole notebook, including
# any pandas deprecation or chained-assignment warnings raised in later cells.
warnings.filterwarnings('ignore')
# Show up to 100 columns and 200 rows when a frame is displayed.
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 200)

In [None]:
# Load the raw RSS and RGG collection workbooks with identical reader options.
_reader_options = {'engine': 'openpyxl', 'parse_dates': True}
df_RSS_collated = pd.read_excel('data/RSS_collated.xlsx', **_reader_options)
df_RGG_collated = pd.read_excel('data/RGG Master Data Collection All.xlsx', **_reader_options)

In [None]:
# List the RSS column names so they can be compared with the RGG sheet.
df_RSS_collated.columns

Index(['Commodity', 'Vehicle Type', 'Inside/Curb', '16 gal', '20 gal',
       '32 gal', '64 gal', '96 gal', 'Neighborhood', 'Meandor', 'Route',
       'Tipper', 'Key Code?', 'Day', '#Units', 'Time', 'Address #', 'Street'],
      dtype='object')

In [None]:
# The RSS sheet has no yard-container columns; add them filled with 0 so that
# both company frames expose the same set of container columns.
for yard_column in ('1 yrd', '1.5 yrd', '2 yrd', '3 yrd', '4 yrd', '5 yrd', '6 yrd'):
    df_RSS_collated[yard_column] = 0

In [None]:
# Making sure the columns match
# The RGG sheet has trailing spaces in some headers, names the time column
# 'Time(Sec)' and uses bare integers for the gallon-bin headers; map all of
# them onto the RSS column names.
_rgg_to_rss_names = {
    'Vehicle Type  ': 'Vehicle Type',
    'Neighborhood ': 'Neighborhood',
    'Meandor ': 'Meandor',
    'Time(Sec)': 'Time',
    16: '16 gal',
    20: '20 gal',
    32: '32 gal',
    64: '64 gal',
    96: '96 gal',
}
df_RGG_collated = df_RGG_collated.rename(columns=_rgg_to_rss_names)

In [None]:
# Confirm the RGG headers after renaming.
df_RGG_collated.columns

Index(['Date', 'Day', 'Route', 'Truck # ', 'Vehicle Type', 'Commodity',
       'Tipper', 'Sequence #', 'Address #', 'Apt.#', 'Street', 'Even/Odd',
       'Meandor', 'Inside/Curb', 'Time', 'Blocktime', '#Units',
       'Number of Stops', '16 gal', '20 gal', '32 gal', '64 gal', '96 gal',
       'CCAN', '1 yrd', '1.5 yrd', '2 yrd', '3 yrd', '4 yrd', '5 yrd', '6 yrd',
       'Cardboard Box', 'Trash Bags', 'Total Volume', 'Neighborhood',
       'Hill/Flat', 'Street Sweepng ', 'Notes', 'GlobalID', 'x', 'y',
       'Data Collector'],
      dtype='object')

In [None]:
# Creating a key column. The RGG sheet has no 'Key Code?' column, so we assume
# that no RGG stop had a key and fill the column with 0 to match the RSS schema.
df_RGG_collated['Key Code?']=0

### Appending RSS and RGG Data into a data frame (df_RGG_RSS)

In [None]:
# Keep only the columns that the RSS frame also has (container counts, stop
# attributes, route info and time); everything else in the RGG sheet is dropped.
# `.copy()` makes the subset an independent frame, so the column added to it in
# a later cell does not trigger pandas' chained-assignment warning.
_shared_columns = [
    'Address #', 'Street', 'Commodity', 'Vehicle Type', 'Inside/Curb',
    '16 gal', '20 gal', '32 gal', '64 gal', '96 gal',
    '1 yrd', '1.5 yrd', '2 yrd', '3 yrd', '4 yrd', '5 yrd', '6 yrd',
    'Neighborhood', 'Meandor', 'Route', 'Tipper', 'Key Code?', 'Day', '#Units', 'Time',
]
df_RGG_collated = df_RGG_collated[_shared_columns].copy()

In [None]:
# Check dtypes and null counts of the trimmed RGG frame before it is combined with RSS.
df_RGG_collated.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10573 entries, 0 to 10572
Data columns (total 25 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Address #     10283 non-null  object 
 1   Street        10487 non-null  object 
 2   Commodity     10573 non-null  object 
 3   Vehicle Type  10118 non-null  object 
 4   Inside/Curb   10273 non-null  object 
 5   16 gal        10572 non-null  float64
 6   20 gal        10573 non-null  int64  
 7   32 gal        10573 non-null  object 
 8   64 gal        10572 non-null  float64
 9   96 gal        10573 non-null  int64  
 10  1 yrd         10573 non-null  object 
 11  1.5 yrd       10573 non-null  int64  
 12  2 yrd         10573 non-null  int64  
 13  3 yrd         10573 non-null  int64  
 14  4 yrd         10573 non-null  int64  
 15  5 yrd         10573 non-null  int64  
 16  6 yrd         10573 non-null  int64  
 17  Neighborhood  116 non-null    object 
 18  Meandor       5461 non-nul

In [None]:
# Final Dataframe
# Stack the RSS and RGG observations and tag every row with its source company.
# `assign` leaves the two input frames untouched. `ignore_index=True` gives each
# combined row a unique label; without it both frames keep their own 0..n labels,
# and any later label-based `drop` removes rows from BOTH companies at once.
df_RGG_RSS = pd.concat(
    [df_RSS_collated.assign(Company='RSS'), df_RGG_collated.assign(Company='RGG')],
    ignore_index=True,
)

# '32 gal' arrives as text from the RGG sheet: coerce it to numbers (unparseable
# entries become NaN), then treat every missing gallon-bin count as 0.
gallon_bin_columns = ['16 gal', '20 gal', '32 gal', '64 gal', '96 gal']
df_RGG_RSS['32 gal'] = pd.to_numeric(df_RGG_RSS['32 gal'], errors='coerce')
df_RGG_RSS[gallon_bin_columns] = df_RGG_RSS[gallon_bin_columns].fillna(0)

# Non-numeric times become NaN.
df_RGG_RSS['Time'] = pd.to_numeric(df_RGG_RSS['Time'], errors='coerce')

# Recompute '#Units' from the gallon bins instead of trusting the sheet value.
# NOTE(review): yard containers ('1 yrd' ... '6 yrd') are not counted here, so a
# stop with only yard containers gets 0 units - confirm this is intended.
df_RGG_RSS['#Units'] = (
    df_RGG_RSS['16 gal']
    + df_RGG_RSS['20 gal']
    + df_RGG_RSS['32 gal']
    + df_RGG_RSS['64 gal']
    + df_RGG_RSS['96 gal']
)

# Harmonise the category spellings used by the two companies.
df_RGG_RSS['Meandor'] = df_RGG_RSS['Meandor'].replace({'N': 'No', 'Y': 'Yes'})
df_RGG_RSS['Commodity'] = df_RGG_RSS['Commodity'].replace(
    {'Recycle ': 'Recycle', 'R': 'Recycle', 'G': 'Garbage', 'GB': 'Garbage'}
)
df_RGG_RSS['Inside/Curb'] = (
    df_RGG_RSS['Inside/Curb'].str.strip().replace({'c': 'C', 'i': 'I', 'ic': 'IC', '': np.nan})
)
df_RGG_RSS['Vehicle Type'] = df_RGG_RSS['Vehicle Type'].str.strip().replace('HIEL', 'HEIL')

# Convert 'Tipper' to numbers only when every value parses; otherwise leave the
# column unchanged. This is what the deprecated errors='ignore' option did.
try:
    df_RGG_RSS['Tipper'] = pd.to_numeric(df_RGG_RSS['Tipper'])
except (ValueError, TypeError):
    pass



In [None]:
# Preview the first rows of the combined frame.
df_RGG_RSS.head()

Unnamed: 0,Commodity,Vehicle Type,Inside/Curb,16 gal,20 gal,32 gal,64 gal,96 gal,Neighborhood,Meandor,Route,Tipper,Key Code?,Day,#Units,Time,Address #,Street,1 yrd,1.5 yrd,2 yrd,3 yrd,4 yrd,5 yrd,6 yrd,Company
0,Recycle,S-HEIL,C,0.0,0,1.0,2.0,2,0,,912,2,,2,5.0,118.0,5128/5132,Geary St,0,0,0,0,0,0,0,RSS
1,Recycle,S-HEIL,C,0.0,0,0.0,0.0,1,0,,912,2,,2,1.0,59.0,5620,Geary St,0,0,0,0,0,0,0,RSS
2,Recycle,S-HEIL,I,0.0,0,0.0,0.0,1,0,,912,2,,2,1.0,86.0,1947,Clement St,0,0,0,0,0,0,0,RSS
3,Recycle,S-HEIL,C,0.0,0,1.0,0.0,1,0,,912,2,,2,2.0,41.0,1919,Clement St,0,0,0,0,0,0,0,RSS
4,Recycle,S-HEIL,C,0.0,0,1.0,0.0,0,0,,912,2,,2,1.0,31.0,1909,Clement St,0,0,0,0,0,0,0,RSS


# Rows to delete

In [None]:
# 16 gal compost bins --> according to Aijaz, these do not exist
# They were also responsible for inflating the RGG service time estimates for 16gal bins in the comparison done on 1.27
# NOTE(review): `== 1` only matches rows with exactly one 16 gal bin; rows with
# two or more would be kept - confirm whether `> 0` is what is wanted.
is_16gal_compost = (df_RGG_RSS['16 gal'] == 1) & (df_RGG_RSS['Commodity'] == 'Compost')

# If there are no units, there should be no time and the data is useless to us
has_no_units = df_RGG_RSS['#Units'] == 0

# Rows without a usable service time cannot be analysed either.
has_no_time = df_RGG_RSS['Time'].isna()

# Apply all three filters at once with a boolean mask. The previous version
# restarted from df_RGG_RSS for the second filter (discarding the first one)
# and dropped by index label, which is unsafe when labels repeat.
df_RGG_RSS_final = df_RGG_RSS.loc[~(is_16gal_compost | has_no_units | has_no_time)].copy()
len(df_RGG_RSS_final)

12826

# Outliers to Flag 

In [None]:
# Create an outlier column: 1 when the service time exceeds the cutoff, else 0.
# Rows are flagged rather than removed.
# Presumably seconds (the RGG sheet called this column 'Time(Sec)') - confirm.
OUTLIER_TIME_THRESHOLD = 500

# Assign the whole column in one step instead of `df['Outlier'].loc[...] = 1`,
# which writes through an intermediate Series (chained assignment) and may not
# reach the frame.
df_RGG_RSS_final['Outlier'] = (df_RGG_RSS_final['Time'] > OUTLIER_TIME_THRESHOLD).astype(int)

# Join Truck Data 

In [None]:
# Read the truck sheet and trim stray whitespace from its header names.
df_RSF_Trucks = pd.read_excel('data/RSF_Trucks.xlsx').rename(
    columns=lambda header: header.strip()
)
df_RSF_Trucks.columns


Index(['ID', 'Route', 'Company', 'Truck', 'Vehicle Type', 'Start Time',
       'Person', 'Commodity', 'Type', 'Capacity'],
      dtype='object')

In [None]:
# Create a subset that only includes route and truck info
# NOTE(review): if a route appears more than once in the truck sheet, the later
# left join on 'Route' will repeat the matching survey rows - confirm uniqueness.
df_trucks = df_RSF_Trucks[['Route','Truck']]


In [None]:
# Attach the truck type to every survey row via its route; rows whose route is
# missing from the truck sheet keep a NaN 'Truck'.
df_RGG_RSS_final = pd.merge(df_RGG_RSS_final, df_trucks, how='left', on='Route')


# Join Neighborhood Data 

In [None]:
# Discard the neighborhood column that came with the raw sheets; it is replaced
# by the route-based lookup joined below.
df_RGG_RSS_final = df_RGG_RSS_final.drop(columns='Neighborhood')

In [None]:
# Route -> neighborhood lookup table.
# Read it from the same relative 'data/' folder as the other workbooks instead
# of the absolute '/work/data/' path, so the notebook runs from any checkout.
df_Route_Neighborhood = pd.read_excel('data/Route_Neighborhood_1-1_Final-2.xlsx')
# Trim stray whitespace so identical names are not counted as different values.
df_Route_Neighborhood['Neighborhood'] = df_Route_Neighborhood['Neighborhood'].str.strip()
df_Route_Neighborhood['Neighborhood'].value_counts()

Pacific Heights       12
Mission               10
Financial District    10
Western Addition      10
Marina                 8
Inner Richmond         8
Outer Richmond         7
Russian Hill           7
SOMA                   7
North Beach            7
Haight Ashbury         6
Excelsior              6
Outer Sunset           6
Inner Sunset           6
Downtown               5
Castro                 5
Noe Valley             5
Ocean View             5
Parkside               4
West Twin Peaks        4
Bernal Heights         4
Bayview                3
Potrero Hill           3
Outer Mission          3
Presidio Heights       3
Nob Hill               2
Visitation Valley      2
Chinatown              2
Lakeshore              2
Glen Park              2
Presidio               1
Seacliff               1
Twin Peaks             1
Tresure Island         1
Downton                1
Visitacion Valley      1
Name: Neighborhood, dtype: int64

## Fix Downton and Visitacion Valley spelling errors

In [None]:
# Merge spelling variants so each neighborhood has one name. The replacement is
# limited to the 'Neighborhood' column so that no other column can be altered.
# NOTE(review): 'Visitacion Valley' is mapped onto the majority spelling in this
# sheet ('Visitation Valley'); the city's own spelling appears to be
# 'Visitacion Valley' - confirm which one downstream consumers expect.
_neighborhood_spelling_fixes = {
    'Downton': 'Downtown',
    'Visitacion Valley': 'Visitation Valley',
    'Tresure Island': 'Treasure Island',
}
df_Route_Neighborhood['Neighborhood'] = (
    df_Route_Neighborhood['Neighborhood'].replace(_neighborhood_spelling_fixes)
)
sorted(df_Route_Neighborhood['Neighborhood'].unique())

['Bayview',
 'Bernal Heights',
 'Castro',
 'Chinatown',
 'Downtown',
 'Excelsior',
 'Financial District',
 'Glen Park',
 'Haight Ashbury',
 'Inner Richmond',
 'Inner Sunset',
 'Lakeshore',
 'Marina',
 'Mission',
 'Nob Hill',
 'Noe Valley',
 'North Beach',
 'Ocean View',
 'Outer Mission',
 'Outer Richmond',
 'Outer Sunset',
 'Pacific Heights',
 'Parkside',
 'Potrero Hill',
 'Presidio',
 'Presidio Heights',
 'Russian Hill',
 'SOMA',
 'Seacliff',
 'Tresure Island',
 'Twin Peaks',
 'Visitation Valley',
 'West Twin Peaks',
 'Western Addition']

In [None]:
# Attach the neighborhood to every survey row via its route; routes missing
# from the lookup table get a NaN neighborhood.
df_RGG_RSS_final = pd.merge(df_RGG_RSS_final, df_Route_Neighborhood, how='left', on='Route')

In [None]:
# Inspect the rows for route 39 after the truck and neighborhood joins.
df_RGG_RSS_final.loc[df_RGG_RSS_final['Route']==39]

Unnamed: 0,Commodity,Vehicle Type,Inside/Curb,16 gal,20 gal,32 gal,64 gal,96 gal,Meandor,Route,Tipper,Key Code?,Day,#Units,Time,Address #,Street,1 yrd,1.5 yrd,2 yrd,3 yrd,4 yrd,5 yrd,6 yrd,Company,Outlier,Truck,Neighborhood
1326,Garbage,na,C,0.0,0,7.0,1.0,0,,39,na,,2,8.0,143.0,na,,0,0,0,0,0,0,0,RSS,0,,Visitation Valley
1327,Garbage,na,C,0.0,0,6.0,0.0,0,,39,na,,2,6.0,96.0,na,,0,0,0,0,0,0,0,RSS,0,,Visitation Valley
1328,Garbage,na,C,0.0,0,6.0,0.0,0,,39,na,,2,6.0,108.0,na,,0,0,0,0,0,0,0,RSS,0,,Visitation Valley
1329,Garbage,na,C,1.0,0,5.0,0.0,0,,39,na,,2,6.0,100.0,na,,0,0,0,0,0,0,0,RSS,0,,Visitation Valley
1330,Garbage,na,C,0.0,0,4.0,2.0,0,,39,na,,2,6.0,100.0,na,,0,0,0,0,0,0,0,RSS,0,,Visitation Valley
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4211,Garbage,na,C,0.0,0,4.0,0.0,0,,39,2,,4,4.0,63.0,na,,0,0,0,0,0,0,0,RSS,0,,Visitation Valley
4212,Garbage,na,C,2.0,0,2.0,1.0,0,,39,2,,4,5.0,90.0,na,,0,0,0,0,0,0,0,RSS,0,,Visitation Valley
4213,Garbage,na,C,2.0,0,3.0,0.0,0,,39,2,,4,5.0,120.0,na,,0,0,0,0,0,0,0,RSS,0,,Visitation Valley
4214,Garbage,na,C,0.0,0,6.0,2.0,0,,39,2,,4,8.0,114.0,na,,0,0,0,0,0,0,0,RSS,0,,Visitation Valley


In [None]:
# Distinct neighborhoods present in the route lookup table.
route_neighborhood = set(df_Route_Neighborhood['Neighborhood'].unique())

In [None]:
# Distinct neighborhoods that ended up on the merged survey rows.
neighborhood_df_final = set(df_RGG_RSS_final['Neighborhood'].unique())


In [None]:
# Neighborhoods in the lookup table that have no survey rows.
route_neighborhood.difference(neighborhood_df_final)

{'Bayview', 'Castro', 'Glen Park', 'Outer Mission', 'Seacliff', 'Twin Peaks'}

# Fix spelling error in Downtown Neighborhood

## Dealing with Address Data

In [None]:
import re

In [None]:
## Dealing with Address Data
# Keep only the first run of digits of every raw address, e.g. '5128/5132' -> '5128'.
# Addresses without any digit give None (missing values become the text 'nan',
# which has no digit).
_digit_run = re.compile(r'\d+')
list_address = df_RGG_RSS_final['Address #'].astype(str)
final_num = [
    found.group(0) if found else None
    for found in map(_digit_run.search, list_address)
]

df_RGG_RSS_final['Address_fixed'] = final_num


In [None]:
df_RGG_RSS_final['Address_Street']=df_RGG_RSS_final['Address_fixed'].str.lower()+'_'+df_RGG_RSS_final['Street'].str.lower().str.strip('st').str.strip('ave').str.strip('pl').str.strip()

In [None]:
df_RGG_RSS_final.drop(columns=['Address_fixed','Address #','Street'],inplace=True)

In [None]:

df_RGG_RSS_final

Unnamed: 0,Commodity,Vehicle Type,Inside/Curb,16 gal,20 gal,32 gal,64 gal,96 gal,Meandor,Route,Tipper,Key Code?,Day,#Units,Time,1 yrd,1.5 yrd,2 yrd,3 yrd,4 yrd,5 yrd,6 yrd,Company,Outlier,Truck,Neighborhood,Address_Street
0,Recycle,S-HEIL,C,0.0,0,1.0,2.0,2,,912,2,,2,5.0,118.0,0,0,0,0,0,0,0,RSS,0,SL,Inner Richmond,5128_geary
1,Recycle,S-HEIL,C,0.0,0,0.0,0.0,1,,912,2,,2,1.0,59.0,0,0,0,0,0,0,0,RSS,0,SL,Inner Richmond,5620_geary
2,Recycle,S-HEIL,I,0.0,0,0.0,0.0,1,,912,2,,2,1.0,86.0,0,0,0,0,0,0,0,RSS,0,SL,Inner Richmond,1947_clement
3,Recycle,S-HEIL,C,0.0,0,1.0,0.0,1,,912,2,,2,2.0,41.0,0,0,0,0,0,0,0,RSS,0,SL,Inner Richmond,1919_clement
4,Recycle,S-HEIL,C,0.0,0,1.0,0.0,0,,912,2,,2,1.0,31.0,0,0,0,0,0,0,0,RSS,0,SL,Inner Richmond,1909_clement
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12821,Compost,R-HEIL,I,0.0,0,0.0,1.0,0,,302,1.0,0,4,1.0,98.0,0,0,0,0,0,0,0,RGG,0,RL,Chinatown,1220_jon
12822,Compost,R-HEIL,I,0.0,0,0.0,3.0,0,,302,1.0,0,4,3.0,100.0,0,0,0,0,0,0,0,RGG,0,RL,Chinatown,1221_jon
12823,Compost,R-HEIL,C,0.0,0,0.0,2.0,0,,302,1.0,0,4,2.0,66.0,0,0,0,0,0,0,0,RGG,0,RL,Chinatown,1000_california
12824,Compost,R-HEIL,IC,0.0,0,0.0,2.0,0,,302,1.0,0,4,2.0,118.0,0,0,0,0,0,0,0,RGG,0,RL,Chinatown,250_eavenworth


# Add key data 

In [None]:
df_rss_key = pd.read_excel('/work/data/Recology SF Access Accounts  1 26 2021.xlsx',sheet_name='RSS Access')
df_rgg_key = pd.read_excel('/work/data/Recology SF Access Accounts  1 26 2021.xlsx',sheet_name='RGG Access')


In [None]:
df_full_key=df_rgg_key.append(df_rss_key)

In [None]:
df_full_key['Address_Street'] = df_full_key["Addr"].astype(str)+'_'+df_full_key['Street'].str.lower().str.strip('st').str.strip('ave').str.strip('pl').str.strip()

In [None]:
# Count how often each (address, code) pair occurs in the access list; the
# count ends up in the column labelled 0.
df_new = df_full_key.groupby(['Address_Street','Code']).size().to_frame().reset_index()

# Collapse to one row per address. Columns are selected with a list:
# `groupby(...)['a', 'b', 0]` (a tuple of keys) is no longer accepted by pandas.
# NOTE(review): `max` is taken per column, so 'Code' is the largest code at the
# address, not necessarily the most frequent one - confirm the intended pick.
df_new_key = df_new.groupby('Address_Street', as_index=False)[['Code', 0]].max()

# Attach the code to each survey row; `validate` documents that the lookup has
# exactly one row per address, which the groupby above guarantees.
df_final_merged = df_RGG_RSS_final.merge(
    df_new_key[['Address_Street', 'Code']],
    on='Address_Street',
    how='left',
    validate='many_to_one',
)

In [None]:
# The placeholder 'Key Code?' flag is superseded by the joined 'Code' column.
df_final_merged = df_final_merged.drop('Key Code?', axis='columns')
df_final_merged

Unnamed: 0,Commodity,Vehicle Type,Inside/Curb,16 gal,20 gal,32 gal,64 gal,96 gal,Meandor,Route,Tipper,Day,#Units,Time,1 yrd,1.5 yrd,2 yrd,3 yrd,4 yrd,5 yrd,6 yrd,Company,Outlier,Truck,Neighborhood,Address_Street,Code
0,Recycle,S-HEIL,C,0.0,0,1.0,2.0,2,,912,2,2,5.0,118.0,0,0,0,0,0,0,0,RSS,0,SL,Inner Richmond,5128_geary,
1,Recycle,S-HEIL,C,0.0,0,0.0,0.0,1,,912,2,2,1.0,59.0,0,0,0,0,0,0,0,RSS,0,SL,Inner Richmond,5620_geary,
2,Recycle,S-HEIL,I,0.0,0,0.0,0.0,1,,912,2,2,1.0,86.0,0,0,0,0,0,0,0,RSS,0,SL,Inner Richmond,1947_clement,
3,Recycle,S-HEIL,C,0.0,0,1.0,0.0,1,,912,2,2,2.0,41.0,0,0,0,0,0,0,0,RSS,0,SL,Inner Richmond,1919_clement,
4,Recycle,S-HEIL,C,0.0,0,1.0,0.0,0,,912,2,2,1.0,31.0,0,0,0,0,0,0,0,RSS,0,SL,Inner Richmond,1909_clement,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12821,Compost,R-HEIL,I,0.0,0,0.0,1.0,0,,302,1.0,4,1.0,98.0,0,0,0,0,0,0,0,RGG,0,RL,Chinatown,1220_jon,
12822,Compost,R-HEIL,I,0.0,0,0.0,3.0,0,,302,1.0,4,3.0,100.0,0,0,0,0,0,0,0,RGG,0,RL,Chinatown,1221_jon,
12823,Compost,R-HEIL,C,0.0,0,0.0,2.0,0,,302,1.0,4,2.0,66.0,0,0,0,0,0,0,0,RGG,0,RL,Chinatown,1000_california,
12824,Compost,R-HEIL,IC,0.0,0,0.0,2.0,0,,302,1.0,4,2.0,118.0,0,0,0,0,0,0,0,RGG,0,RL,Chinatown,250_eavenworth,


# Add new feature - Has Key
Does this stop have a key code or not? 

In [None]:
# True when an access code was found for the stop's address, False otherwise.
df_final_merged['Has Key'] = df_final_merged['Code'].notna()
df_final_merged

Unnamed: 0,Commodity,Vehicle Type,Inside/Curb,16 gal,20 gal,32 gal,64 gal,96 gal,Meandor,Route,Tipper,Day,#Units,Time,1 yrd,1.5 yrd,2 yrd,3 yrd,4 yrd,5 yrd,6 yrd,Company,Outlier,Truck,Neighborhood,Address_Street,Code,Has Key
0,Recycle,S-HEIL,C,0.0,0,1.0,2.0,2,,912,2,2,5.0,118.0,0,0,0,0,0,0,0,RSS,0,SL,Inner Richmond,5128_geary,,False
1,Recycle,S-HEIL,C,0.0,0,0.0,0.0,1,,912,2,2,1.0,59.0,0,0,0,0,0,0,0,RSS,0,SL,Inner Richmond,5620_geary,,False
2,Recycle,S-HEIL,I,0.0,0,0.0,0.0,1,,912,2,2,1.0,86.0,0,0,0,0,0,0,0,RSS,0,SL,Inner Richmond,1947_clement,,False
3,Recycle,S-HEIL,C,0.0,0,1.0,0.0,1,,912,2,2,2.0,41.0,0,0,0,0,0,0,0,RSS,0,SL,Inner Richmond,1919_clement,,False
4,Recycle,S-HEIL,C,0.0,0,1.0,0.0,0,,912,2,2,1.0,31.0,0,0,0,0,0,0,0,RSS,0,SL,Inner Richmond,1909_clement,,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12821,Compost,R-HEIL,I,0.0,0,0.0,1.0,0,,302,1.0,4,1.0,98.0,0,0,0,0,0,0,0,RGG,0,RL,Chinatown,1220_jon,,False
12822,Compost,R-HEIL,I,0.0,0,0.0,3.0,0,,302,1.0,4,3.0,100.0,0,0,0,0,0,0,0,RGG,0,RL,Chinatown,1221_jon,,False
12823,Compost,R-HEIL,C,0.0,0,0.0,2.0,0,,302,1.0,4,2.0,66.0,0,0,0,0,0,0,0,RGG,0,RL,Chinatown,1000_california,,False
12824,Compost,R-HEIL,IC,0.0,0,0.0,2.0,0,,302,1.0,4,2.0,118.0,0,0,0,0,0,0,0,RGG,0,RL,Chinatown,250_eavenworth,,False


In [None]:
# How many stops did / did not match an access code.
df_final_merged['Has Key'].value_counts()

False    10542
True      2284
Name: Has Key, dtype: int64

## Export Data

In [None]:
# Write the final data set to the working directory, without the row index.
df_final_merged.to_excel(excel_writer='RGG_RSS_final.xlsx', index=False)

In [None]:
# Confirm that '32 gal' has no missing values left.
df_final_merged['32 gal'].isnull().value_counts()

False    12826
Name: 32 gal, dtype: int64

In [None]:
# Neighborhoods present in the exported data set.
training_neighborhoods = set(df_final_merged['Neighborhood'])

In [None]:
# Neighborhoods listed in the route lookup table.
actual_neighborhoods = set(df_Route_Neighborhood['Neighborhood'])

In [None]:
# Neighborhoods in the lookup table that are absent from the exported data.
# (The misspelled name `afctual_neighborhoods` raised a NameError here.)
actual_neighborhoods.difference(training_neighborhoods)

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=938c6ad9-491d-4307-bf8a-c751a244ce4f' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>