In [12]:
import sys
import pandas as pd
sys.path.append("C:/Users/ping/MyDrive/py_files/python/py379/")
from myUtils import pickle_load, pickle_dump

# pandas display limits used for every frame shown in this notebook
pd.set_option('max_colwidth', 12)
pd.set_option('display.max_columns', 18)
pd.set_option('display.width', 1200)
pd.set_option('display.max_rows', 100)

# folder that holds the input CSV files; pickles are written to the same folder
INPUT_DIR = 'C:/Users/ping/OneDrive/Documents/jenn_bb_sales'
path_pickle_dump = f'{INPUT_DIR}/'

In [13]:
# Load the two Square item exports, merge them into one frame, and pickle it.
df1 = pd.read_csv(f'{INPUT_DIR}/items-2021-02-01-2021-10-02.csv')
df2 = pd.read_csv(f'{INPUT_DIR}/items-2021-09-06-2022-09-07.csv')
df = (
    pd.concat([df1, df2])
    .drop_duplicates()                 # the two exports cover overlapping dates
    .sort_values(by=['Date', 'Time'])  # chronological order
    .reset_index(drop=True)            # fresh 0..n-1 index after the sort
)
pickle_dump(df, path_pickle_dump, 'df_sq_download')
print(f'df.shape: {df.shape}')

df.shape: (18366, 27)


In [14]:
# Clean the raw Square export: keep the analysis columns, turn the money
# columns into floats, parse Date/Time, and drop the wholesale customer.
df = pickle_load(path_pickle_dump, 'df_sq_download')
df_shape_before_clean = df.shape
# select only these columns (explicit copy: the column assignments below
# modify this frame only)
df = df[['Date', 'Time', 'Category', 'Item', 'Qty',
       'Price Point Name', 'Gross Sales',
       'Discounts', 'Net Sales', 'Tax', 
       'Device Name', 'Notes', 'Event Type', 
       'Dining Option', 'Customer Name']].copy()
# money columns are read as strings; convert them from object to float
cols_to_clean = ['Gross Sales', 'Discounts', 'Net Sales', 'Tax']
for col in cols_to_clean:
  # Remove only the "$" symbol (and any thousands separators). Splitting on
  # "$" and keeping the tail would also discard a leading "-", turning a
  # negative amount such as "-$3.00" into +3.00.
  df[col] = df[col].str.replace('$', '', regex=False).str.replace(',', '', regex=False)
  df[col] = pd.to_numeric(df[col])  # convert from object to float
df = df.set_index('Date')  # set index to Date column
df.index = pd.to_datetime(df.index)  # convert Date string to datetime 
# convert Time string to a datetime.time (hour:minute:second)
df['Time'] = pd.to_datetime(df['Time'], format='%H:%M:%S').dt.time
# remove wholesale customer "Canyon Coffee"
print(f'df.shape before removing whole sale customer "Canyon Coffee": {df.shape}')
df = df[df['Customer Name'] != "Canyon Coffee"]
print(f'df.shape after removing whole sale customer "Canyon Coffee":  {df.shape}')
df_shape_after_clean = df.shape
pickle_dump(df, path_pickle_dump, 'df_clean')
print(f'df.shape before clean:  {df_shape_before_clean}')
print(f'df.shape after clean:   {df_shape_after_clean}')

df.shape before removing whole sale customer "Canyon Coffee": (18366, 14)
df.shape after removing whole sale customer "Canyon Coffee":  (18364, 14)
df.shape before clean:  (18366, 27)
df.shape after clean:   (18364, 14)


In [15]:
# reload the frame stored under the name 'df_clean' and report its shape
df = pickle_load(path_pickle_dump, 'df_clean')
print(f'df.shape: {df.shape}')

df.shape: (18364, 14)


In [16]:
# Show the hour/minute/second parts of the first row's Time value.
# .iloc[0] selects by position explicitly; a plain [0] on a Series whose
# index is not integer-based relies on a deprecated positional fallback.
first_time = df['Time'].iloc[0]
print(f'Time of fist row, hour: {first_time.hour}, minute: {first_time.minute}, second: {first_time.second}')

Time of fist row, hour: 17, minute: 31, second: 7


In [17]:
# total 'Gross Sales' per Date, as a Series keyed by Date
gross = df.groupby('Date')['Gross Sales'].sum()
# wrap the Series in a one-column DataFrame
df_gross = gross.to_frame()
# make sure the index is datetime before it is concatenated with other frames
df_gross.index = pd.to_datetime(df_gross.index)

In [18]:
# display the frame of summed 'Gross Sales' per Date
df_gross

Unnamed: 0_level_0,Gross Sales
Date,Unnamed: 1_level_1
2021-07-08,30.0
2021-07-09,859.0
2021-07-10,1205.0
2021-07-16,847.0
2021-07-17,1026.0
...,...
2022-08-21,935.5
2022-09-01,64.0
2022-09-02,400.0
2022-09-03,589.0


In [19]:
# Los Angeles temperature and precipitation data
# https://www.ncei.noaa.gov/cdo-web/datatools/findstation

# read the weather CSV and use its DATE column as the index
df_weather = pd.read_csv(f'{INPUT_DIR}/la_weather_2021-01-01_2022-09-28.csv').set_index("DATE")
# parse the DATE strings into datetimes before this frame is concatenated
df_weather.index = pd.to_datetime(df_weather.index)

In [20]:
# place df_gross and df_weather side by side; the inner join keeps only
# index values present in both frames
df_grSales_weather = pd.concat([df_gross, df_weather], axis=1, join='inner')
# recode the day-of-week abbreviations as integers, Sun=0 through Sat=6
dow_codes = {'Sun': 0, 'Mon': 1, 'Tue': 2, 'Wed': 3, 'Thu': 4, 'Fri': 5, 'Sat': 6}
df_grSales_weather['DOW'] = df_grSales_weather['DOW'].replace(dow_codes)
pickle_dump(df_grSales_weather, path_pickle_dump, 'df_grSales_weather')

In [21]:
# reload the frame stored under the name 'df_grSales_weather' and display it
df_grSales_weather = pickle_load(path_pickle_dump, 'df_grSales_weather')
df_grSales_weather

Unnamed: 0,Gross Sales,STATION,NAME,AWND,PGTM,PRCP,TAVG,TMAX,TMIN,WDF2,WDF5,WSF2,WSF5,WT01,WT02,WT08,DOW
2021-07-08,30.0,USW00093134,LOS ANGE...,1.34,,0.0,,84,68,270.0,270.0,8.1,12.1,,,,4
2021-07-09,859.0,USW00093134,LOS ANGE...,1.12,,0.0,,86,68,270.0,270.0,8.9,14.1,,,,5
2021-07-10,1205.0,USW00093134,LOS ANGE...,1.57,,0.0,,85,69,270.0,270.0,8.1,14.1,,,,6
2021-07-16,847.0,USW00093134,LOS ANGE...,1.79,,0.0,,82,67,270.0,260.0,8.1,15.0,,,1.0,5
2021-07-17,1026.0,USW00093134,LOS ANGE...,2.01,,0.0,,85,66,260.0,260.0,8.9,15.0,,,,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-08-21,935.5,USW00093134,LOS ANGE...,1.12,,0.0,,80,65,270.0,260.0,8.1,14.1,,,1.0,0
2022-09-01,64.0,USW00093134,LOS ANGE...,1.34,,0.0,,96,74,280.0,270.0,6.9,11.0,,,1.0,4
2022-09-02,400.0,USW00093134,LOS ANGE...,1.34,,0.0,,96,72,270.0,260.0,8.1,13.0,,,,5
2022-09-03,589.0,USW00093134,LOS ANGE...,1.12,,0.0,,98,75,280.0,280.0,8.9,14.1,,,,6
