#### Weather Data Definition  
STATION = Weather station ID  
NAME = Weather station location  
Date = Date  
AWND = Average daily wind speed (meters per second or miles per hour as per user preference)  
PGTM = Peak gust time (hours and minutes, i.e., HHMM)  
PRCP = Precipitation (mm or inches as per user preference, inches to hundredths on Daily Form pdf file)  
TMAX = Maximum temperature, F  
TMIN = Minimum temperature, F  
WDF2 = Direction of fastest 2-minute wind (degrees)  
WDF5 = Direction of fastest 5-second wind (degrees)  
WSF2 = Fastest 2-minute wind speed (miles per hour or meters per second as per user preference)  
WSF5 = Fastest 5-second wind speed (miles per hour or meters per second as per user preference)  
WT01 = Fog, ice fog, or freezing fog (may include heavy fog)  
WT02 = Heavy fog or heavy freezing fog (not always distinguished from fog)  
WT08 = Smoke or haze  

In [5]:
import sys
import pandas as pd
sys.path.append("C:/Users/ping/MyDrive/py_files/python/py379/")
from myUtils import pickle_load, pickle_dump
# Display settings for printed frames: narrow cells, up to 18 columns / 100 rows, wide console.
pd.set_option('max_colwidth', 12)
pd.set_option('display.max_columns', 18)
pd.set_option('display.width', 1200)
pd.set_option('display.max_rows', 100)

# Folder holding the pickled DataFrames; path_pickle_dump is the same folder with a trailing slash.
INPUT_DIR = 'C:/Users/ping/OneDrive/Documents/jenn_bb_sales'
path_pickle_dump = INPUT_DIR + '/'

In [6]:
# Fetch the object stored as 'df_grSale_weather_all' via the myUtils helper and report its shape.
# NOTE(review): pickle_load is defined in myUtils (not shown); presumably it unpickles a DataFrame — confirm.
df = pickle_load(path_pickle_dump, 'df_grSale_weather_all')
print('df.shape: ' + str(df.shape))

df.shape: (18366, 43)


In [7]:
# Trim df (in place) down to the sales and weather columns listed below.
cols_df = list(df.columns)

# Sales columns to retain.
cols_grSale_keep = [
  'Date', 'Time', 'Category', 'Item', 'Qty',
  'Gross Sales', 'Discounts', 'Net Sales', 'Tax',
  'Transaction ID', 'Event Type', 'Dining Option', 'Customer Name',
]
# Weather columns to retain.
cols_weather_keep = [
  'NAME', 'AWND', 'PRCP', 'TAVG', 'TMAX', 'TMIN',
  'WT01', 'WT02', 'WT08', 'DOW',
]
cols_keep = [*cols_grSale_keep, *cols_weather_keep]

# Everything present in df but not in the keep list gets dropped.
diff = set(cols_df).difference(cols_keep)
# Walk cols_df (not the set) so the drop list follows the frame's column order.
cols_drop = [name for name in cols_df if name in diff]
df.drop(columns=cols_drop, inplace=True)

print('df.columns:', df.columns)
print('df.shape:', df.shape)

df.columns: Index(['Date', 'Time', 'Category', 'Item', 'Qty', 'Gross Sales', 'Discounts', 'Net Sales', 'Tax', 'Transaction ID', 'Event Type', 'Dining Option', 'Customer Name', 'NAME', 'AWND', 'PRCP', 'TAVG', 'TMAX', 'TMIN', 'WT01', 'WT02', 'WT08', 'DOW'], dtype='object')
df.shape: (18366, 23)


In [8]:
# Clean the merged frame: derive TAVG, encode DOW as an integer, convert the
# currency columns to float, then persist the result as 'df_grSale_weather_cleaned'.

# calculate avg. temp. as the midpoint of TMAX and TMIN.
# Item assignment is used rather than `df.TAVG = ...`: attribute assignment only
# updates a column that already exists, otherwise it just sets a Python attribute.
df['TAVG'] = (df['TMAX'] + df['TMIN']) / 2

# replace Day-Of-Week from str to integer (Sun=0 ... Sat=6)
df['DOW'] = df['DOW'].replace(to_replace=['Sun', 'Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat'], value=[0, 1, 2, 3, 4, 5, 6])

# strip the $ sign, convert from object to float
_cols = ['Gross Sales', 'Discounts', 'Net Sales', 'Tax']
for _col in _cols:
  if pd.api.types.is_numeric_dtype(df[_col]):
    # Already converted (e.g. this cell was re-run); the .str accessor would raise on floats.
    continue
  # Remove only the '$' and any thousands separators so a leading minus sign survives:
  # taking the text after the '$' would turn "-$1.00" into +1.00.
  # regex=False is required because '$' as a regex is the end-of-string anchor.
  _amount = df[_col].str.replace('$', '', regex=False).str.replace(',', '', regex=False)
  df[_col] = pd.to_numeric(_amount)  # convert from object to float
pickle_dump(df, path_pickle_dump, 'df_grSale_weather_cleaned')

In [9]:
# Reload the object stored as 'df_grSale_weather_cleaned' and summarize it.
df = pickle_load(path_pickle_dump, 'df_grSale_weather_cleaned')
print(f'df.shape: {df.shape}')
# DataFrame.info() writes its report to stdout and returns None, so it is called
# directly; formatting it into an f-string would print a stray "df.info(): None".
df.info()

df.shape: (18366, 23)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18366 entries, 0 to 18365
Data columns (total 23 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Date            18366 non-null  object 
 1   Time            18366 non-null  object 
 2   Category        18366 non-null  object 
 3   Item            18366 non-null  object 
 4   Qty             18366 non-null  float64
 5   Gross Sales     18366 non-null  float64
 6   Discounts       18366 non-null  float64
 7   Net Sales       18366 non-null  float64
 8   Tax             18366 non-null  float64
 9   Transaction ID  18366 non-null  object 
 10  Event Type      18366 non-null  object 
 11  Dining Option   18043 non-null  object 
 12  Customer Name   11528 non-null  object 
 13  NAME            18366 non-null  object 
 14  AWND            18215 non-null  float64
 15  PRCP            18366 non-null  float64
 16  TAVG            18366 non-null  float64
 17  TMAX     

In [10]:
# https://datagy.io/pandas-groupby/
# For each column after the first two (Date, Time), print every distinct value
# alongside the number of rows holding it, highest count first.
for column in df.columns[2:]:
  s = df[column]
  grouped = s.groupby(s)  # group the column by its own values
  print(f'{column}')
  print('=' * 20)

  # distinct value -> count of rows in that value's group
  _dict = {key: grouped.get_group(key).count() for key in grouped.groups.keys()}

  # order the (value, count) pairs by count, descending
  _ranked = sorted(_dict.items(), key=lambda item: item[1], reverse=True)
  _dict = dict(_ranked)
  for k, v in _ranked:
    print(f'{k:<45}{v:10,.0f}')
  print('=' * 20, '\n')

Category
Food                                             17,383
None                                                411
Beverage                                            197
Merch                                               171
RPG                                                 147
Coffee & Tea                                         57

Item
Danish                                            2,193
Cinn Knot                                         1,014
danish                                              978
2x choc                                             863
Choc Croissant                                      793
Choc crx                                            775
crx                                                 733
Twice Baked                                         698
choc crx                                            687
Croissant                                           675
ccc                                                 623
Crx                              

In [11]:
# how many rows have one or more valid inputs in columns: WT01, WT02, WT08
_df = df[['WT01', 'WT02', 'WT08']]
# True for rows where at least one of the three weather-type flags is non-null.
_has_flag = _df.notnull().any(axis=1)
_idx = _df.index[_has_flag]  # index labels of those rows
# Select with the boolean mask: _idx holds labels, so feeding it to positional
# .iloc is only correct while the index is exactly 0..n-1.
_df_notnull = _df[_has_flag]
print(f'{_df_notnull.shape[0]} rows have one or more valid inputs in columns: WT01, WT02, WT08')
print(f'_df_notnull.shape: {_df_notnull.shape}')

8491 rows have one or more valid inputs in columns: WT01, WT02, WT08
_df_notnull.shape: (8491, 3)
