#### Weather Data Definition  
STATION = Weather station ID  
NAME = Weather station location  
Date = Date  
AWND = Average daily wind speed (meters per second or miles per hour as per user preference)  
PGTM = Peak gust time (hours and minutes, i.e., HHMM)  
PRCP = Precipitation (mm or inches as per user preference, inches to hundredths on Daily Form pdf file)  
TMAX = Maximum temperature, F  
TMIN = Minimum temperature, F  
WDF2 = Direction of fastest 2-minute wind (degrees)  
WDF5 = Direction of fastest 5-second wind (degrees)  
WSF2 = Fastest 2-minute wind speed (miles per hour or meters per second as per user preference)  
WSF5 = Fastest 5-second wind speed (miles per hour or meters per second as per user preference)  
WT01 = Fog, ice fog, or freezing fog (may include heavy fog)  
WT02 = Heavy fog or heavy freezing fog (not always distinguished from fog)  
WT08 = Smoke or haze  

In [None]:
# Notebook setup: extend the module search path so that myUtils can be
# imported, then configure pandas display options and the data directory.
import sys
import pandas as pd

sys.path.append("C:/Users/ping/MyDrive/py_files/python/py379/")
from myUtils import pickle_load, pickle_dump

# One call per display option (same options and values as a single combined call).
pd.set_option('max_colwidth', 12)
pd.set_option('display.max_columns', 18)
pd.set_option('display.width', 1200)
pd.set_option('display.max_rows', 100)

# Directory passed to pickle_load / pickle_dump; note the trailing slash.
INPUT_DIR = 'C:/Users/ping/OneDrive/Documents/jenn_bb_sales'
path_pickle_dump = INPUT_DIR + '/'

In [None]:
# Load the pickled DataFrame named 'df_item_sale_n_weather_raw' and report its shape.
_raw_name = 'df_item_sale_n_weather_raw'
df = pickle_load(path_pickle_dump, _raw_name)
print(f'df.shape: {df.shape}')

In [None]:
# Drop wholesale customers: remove every row whose 'Customer Name' is exactly
# 'Canyon Coffee'. Rows with a missing customer name compare unequal and are kept.
_is_wholesale = df['Customer Name'] == 'Canyon Coffee'
df = df[~_is_wholesale]
print(f'df.shape: {df.shape}')

In [None]:
# Normalize every column whose inferred cell type is 'string':
#   - strip leading and trailing whitespace
#   - convert the first character of each word to uppercase
# https://stackoverflow.com/questions/65756553/check-if-entire-pandas-object-column-is-a-string
# https://www.datasciencemadesimple.com/strip-space-column-pandas-dataframe-leading-trailing-2/
for column in df.columns:
    # infer_dtype returns a label such as 'string' or 'floating'
    if pd.api.types.infer_dtype(df[column]) != 'string':
        continue
    df[column] = df[column].str.strip().str.title()

In [None]:
# Evaluate the DataFrame's column labels; as the only expression in this
# notebook cell, the result is what the cell displays.
df.columns

In [None]:
'''Combine same items'''
df.Item = df.Item.replace(to_replace=['Double Chocolate Espresso Cookie', '2X Choc'], value='Double Chocolate Cookie')
df.Item = df.Item.replace(to_replace=['Choc Croissant', 'Choc Crx'], value='Chocolate Croissant')
df.Item = df.Item.replace(to_replace=['Crx'], value='Croissant')
df.Item = df.Item.replace(to_replace=['Ccc'], value='Chocolate Chip Cookie')
df.Item = df.Item.replace(to_replace=['Cinn Knot'], value='Cinnamon Knot')
df.Item = df.Item.replace(to_replace=['Bluerberry Muffin', 'Bb Muf'], value='Blueberry Muffin')

In [None]:
# Group the Item column by its own values, count the rows in each group, and
# print the items from most to least frequent.
s = df.Item
grouped = s.groupby(s)
_dict = {key: grouped.get_group(key).count() for key in grouped.groups}

# Stable sort on the negated count: descending by count, ties keep group order.
_dict = dict(sorted(_dict.items(), key=lambda pair: -pair[1]))
for k, v in _dict.items():
    print(f'{k:<45}{v:10,.0f}')

In [None]:
# Drop columns that are not needed: keep only the listed sales and weather
# columns and remove every other column from df in place.
cols_df = df.columns.tolist()
cols_grSale_keep = [
    'Date', 'Time', 'Category', 'Item', 'Qty',
    'Gross Sales', 'Discounts', 'Net Sales', 'Tax',
    'Transaction ID', 'Event Type', 'Dining Option', 'Customer Name',
]
cols_weather_keep = ['NAME', 'AWND', 'PRCP', 'TAVG', 'TMAX', 'TMIN', 'WT01', 'WT02', 'WT08', 'DOW']
cols_keep = [*cols_grSale_keep, *cols_weather_keep]
diff = set(cols_df).difference(cols_keep)
# Walk cols_df (not the set) so the columns to drop keep the DataFrame's order.
cols_drop = [name for name in cols_df if name in diff]
df.drop(columns=cols_drop, inplace=True)
print(f'df.columns: {df.columns}')
print(f'df.shape: {df.shape}')

In [None]:
# Calculate avg. temp. as the midpoint of the daily maximum and minimum.
# Bracket assignment is used on purpose: attribute-style assignment
# (df.TAVG = ...) only updates a column that already exists. When the column is
# absent, pandas sets a plain Python attribute on the DataFrame object and no
# 'TAVG' column is created.
df['TAVG'] = (df['TMAX'] + df['TMIN']) / 2

In [None]:
# Replace Day-Of-Week from str to integer (Sun=0 ... Sat=6); values that are
# not one of these seven labels are left unchanged.
_dow_to_int = {'Sun': 0, 'Mon': 1, 'Tue': 2, 'Wed': 3, 'Thu': 4, 'Fri': 5, 'Sat': 6}
df['DOW'] = df['DOW'].replace(_dow_to_int)

In [None]:
# Convert the money columns from text such as '$1.50' to float, then save the
# cleaned DataFrame as 'df_item_sale_n_weather_cleaned'.
_cols = ['Gross Sales', 'Discounts', 'Net Sales', 'Tax']
for _col in _cols:
    # Remove the currency symbol and thousands separators as literal characters.
    # Splitting on '$' and keeping the last piece would discard anything in
    # front of the symbol, so a value like '-$1.50' would lose its minus sign.
    _cleaned = df[_col].str.replace('$', '', regex=False).str.replace(',', '', regex=False)
    df[_col] = pd.to_numeric(_cleaned)  # convert from object to float
pickle_dump(df, path_pickle_dump, 'df_item_sale_n_weather_cleaned')

In [None]:
# Reload the cleaned DataFrame and report its shape and column summary.
df = pickle_load(path_pickle_dump, 'df_item_sale_n_weather_cleaned')
print(f'df.shape: {df.shape}')
# DataFrame.info() writes its report to stdout and returns None, so embedding it
# in an f-string printed the report followed by a stray "df.info(): None".
# Print the label first, then call info() directly.
print('df.info():')
df.info()

In [None]:
# Group and count the values in each column: for every column after the first
# two, print each distinct value with the number of rows holding it, most
# frequent first.
# https://datagy.io/pandas-groupby/
_rule = '=' * 20
for column in df.columns[2:]:  # skip columns: Date, Time (assumes they are the first two)
    s = df[column]
    grouped = s.groupby(s)
    print(f'Column: {column}')
    print(_rule)
    _dict = {key: grouped.get_group(key).count() for key in grouped.groups}

    # Stable sort on the negated count: descending by count, ties keep group order.
    _dict = dict(sorted(_dict.items(), key=lambda pair: -pair[1]))
    for k, v in _dict.items():
        print(f'{k:<45}{v:10,.0f}')
    print(_rule, '\n')

In [None]:
# How many rows have one or more valid inputs in columns: WT01, WT02, WT08
_df = df[['WT01', 'WT02', 'WT08']]
_has_any = _df.notnull().any(axis=1)  # True where at least one of the three is not null
_idx = _df.index[_has_any]            # index labels of those rows
# Select with the boolean mask rather than .iloc[_idx]: _idx holds index
# *labels*, while .iloc is positional. Once rows have been filtered out of df
# the labels no longer equal positions, so .iloc would pick the wrong rows or
# raise IndexError.
_df_notnull = _df[_has_any]
print(f'{_df_notnull.shape[0]} rows have one or more valid inputs in columns: WT01, WT02, WT08')
print(f'_df_notnull.shape: {_df_notnull.shape}')