In [2]:
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns

pd.set_option('display.max_columns', None)

In [3]:
# Load the raw sales file. 'Zip Code' and 'Item Number' are read with object dtype
# and 'Date' is parsed into datetimes during the read.
raw_sales_path = 'data/Iowa_Liquor_Sales.csv'
object_columns = {'Zip Code': object, 'Item Number': object}
full_raw = pd.read_csv(raw_sales_path, dtype=object_columns, parse_dates=['Date'])

KeyboardInterrupt: 

# Explore the data set

In [None]:
full_raw.shape

In [None]:
full_raw.dtypes

In [None]:
full_raw.head(3)

In [None]:
# The invoice column appears to hold only unique values (compare this count with the
# row count to confirm), so we probably do not need it as a feature.
full_raw['Invoice/Item Number'].nunique()

### Explore if prices differ between vendors and stores
- It turns out that Iowa is an alcoholic beverage control state and monopolizes liquor wholesale. Because of that monopoly, prices probably do not vary much. We also do not see different vendors selling the same product at the same time; when an item does have several vendors, it is perhaps because Iowa decided to buy that particular item from a different vendor.

In [None]:
# Subset of a single day of sales (05/30/2012)
mar = full_raw.loc[full_raw.Date == '05/30/2012']

# The same item shows up in many transactions, so count distinct items and vendors
p_sold_mar = mar['Item Number'].nunique()
v_sell_mar = mar['Vendor Number'].nunique()
print(f'Number of unique product sold that day: {p_sold_mar}')
print(f'Number of transactions that day: {mar.shape[0]}')
# so we want to keep transaction ID if need granularity
# (label fixed: this line reports vendors, not products)
print(f'Number of unique vendors selling that day: {v_sell_mar}')


In [None]:
mar.shape

In [None]:
# For each item, count the day's transactions per vendor, then list the
# (item, vendor) pairs with the most transactions first.
vendor_counts = mar.groupby(['Item Number']).agg({'Vendor Number': 'value_counts'})
vendor_counts = vendor_counts.rename(columns={'Vendor Number': 'Vendor Count'})
vendor_counts.reset_index().sort_values('Vendor Count', ascending=False)


In [None]:
# Distinct vendors for item 11788 on this day (the most sold item still has one vendor)
is_top_item = mar['Item Number'] == '11788'
mar.loc[is_top_item, 'Vendor Number'].unique()

#### Different way to get at the question: Are vendors selling same item?

In [None]:
# Is any item sold by more than one vendor?
# Pivot so each row is an item number and each column a vendor number; the number of
# null cells in a row is then the number of vendors that did NOT sell that item.
# describe() shows whether that count varies between items (none found for this day).
item_by_vendor = mar[['Item Number', 'Vendor Number', 'Bottles Sold']].pivot_table(
    index='Item Number', columns='Vendor Number')
null_vendor_cells = item_by_vendor.isnull().sum(axis=1)
null_vendor_cells.describe()

In [None]:
# Check the same for full data
# Dont want to run again because it loads copies the raw

# temp = full_raw[['Item Number', 'Vendor Number','Bottles Sold']].pivot_table(index='Item Number', columns= 'Vendor Number' ).isnull().sum(axis=1)
# temp.head()

In [None]:
full_raw['Item Number'].nunique()
# 9587 Unique items

In [None]:
full_raw['Vendor Number'].nunique()

#### Check for items that have multiple vendors

In [None]:
# An item has multiple vendors when more than one distinct vendor number appears in
# its transactions. Counting vendors per item directly replaces the item-by-vendor
# pivot (`temp`, which is only defined in a commented-out cell) and the hard-coded
# threshold 343 (= 344 vendors - 1 null allowed).
vendors_per_item = full_raw.groupby('Item Number')['Vendor Number'].nunique()
vendors_per_item.loc[vendors_per_item > 1]
# Originally recorded: 426 products that have multiple vendors out of 11594 items

In [None]:
# Look at one of the listed items above, see the different vendors
is_item_36447 = full_raw['Item Number'] == '36447'
full_raw.loc[is_item_36447, 'Vendor Number'].unique()
# array([154., 229., 402., 626.])

#### Check to see if vendors stratified over time

In [None]:
# Quick check: does the vendor of item 36447 change over time?
item_36447 = full_raw.loc[full_raw['Item Number'] == '36447', ['Vendor Number', 'Date']]
# Index the vendor column by sale date and order it chronologically
temp = item_36447.set_index('Date').sort_values('Date')
# Latest sales recorded for vendor 154
temp.loc[temp['Vendor Number'] == 154].tail()

In [None]:
# Looks like the next vendor starts in March 2016 after the time above
not_vendor_154 = temp['Vendor Number'] != 154
temp.loc[not_vendor_154]

## Preliminary EDA

In [None]:
# Most sold items of the day: total the numeric columns per item.
# numeric_only=True keeps the text and date columns (store names, addresses, Date)
# out of the sum; recent pandas versions no longer drop them silently.
mar_item = mar.groupby('Item Number').sum(numeric_only=True)
mar_item.sort_values('Bottles Sold', ascending=False).head()

In [None]:
# Very basic EDA, should think of more targeted way to look at the data
full_raw.hist('Volume Sold (Liters)', bins=30)

## Overall Seasonality

In [None]:
# New frame with the sale date as index (the 'Date' column itself is kept),
# so year and month can be read straight off the index.
sale_dates = pd.to_datetime(full_raw['Date'], format='%m/%d/%Y')
full_time = full_raw.set_index(sale_dates)

In [None]:
# Total sales in dollars per calendar year
yr_sale = full_time['Sale (Dollars)'].groupby(full_time.index.year).sum()
yr_sale.plot()

In [None]:
# Total sales in dollars per calendar month, pooled over all years
mo_sale = full_time['Sale (Dollars)'].groupby(full_time.index.month).sum()
mo_sale.plot()

In [None]:
# 2012 example: October spike
in_2012 = full_time.index.year == 2012
yr12 = full_time.loc[in_2012]

yr12['Sale (Dollars)'].groupby(yr12.index.month).sum().plot()

In [None]:
# 2017 example: more spikes in summer, October, and December
in_2017 = full_time.index.year == 2017
yr17 = full_time.loc[in_2017]

yr17['Sale (Dollars)'].groupby(yr17.index.month).sum().plot()

In [None]:
# 2020 example (original note, same wording as for 2017: more spikes in summer,
# October, and December -- TODO confirm against the plot)
in_2020 = full_time.index.year == 2020
yr20 = full_time.loc[in_2020]

yr20['Sale (Dollars)'].groupby(yr20.index.month).sum().plot()

In [None]:
# Plotting seasonality over the years: total sales per (year, month)
by_year_month = [full_time.index.year, full_time.index.month]
full_time.groupby(by_year_month)['Sale (Dollars)'].sum().plot()


In [None]:
# mo_facet = sns.FacetGrid(full_time, col='')

### Check whether State Bottle Cost and State Bottle Retail are multicollinear

In [None]:
markup = full_raw['State Bottle Cost']/full_raw['State Bottle Retail']

In [None]:
markup.dropna().describe()

In [None]:
# Histogram of the cost / retail ratio. NaN ratios are dropped first: with NaN in
# the data the automatically detected bin range is not finite and the call fails.
plt.hist(markup.dropna())

In [None]:
markup.hist(bins = 100)

### Clean Stores data

- Store ID
- Name
- City
- Zip
- Location
- County number
- County

In [None]:
full_raw.head(1)

In [None]:
store2 = full_raw.loc[:,['Store Number', 'Store Name', 'Address', 'City', 'Zip Code', 'Store Location', 'County Number', 'County']]

In [None]:
store2.head()

In [None]:
# `store` is only created in the next cell, so previewing it here raises NameError on
# a fresh kernel. Preview the raw store columns selected above instead.
store2.head()

In [None]:
# Store df: the store-level columns of the raw sales data
store_cols = ['Store Number', 'Store Name', 'Address', 'City',
              'Zip Code', 'Store Location', 'County Number', 'County']
store = full_raw.loc[:, store_cols]
# Lower-case the free-text columns
for text_col in ['Store Name', 'Address', 'City', 'County']:
    store.loc[:, text_col] = store[text_col].str.lower()

In [None]:
# Check for missingness in store: number of nulls per column.
# DataFrame.sum() reduces column-wise by default; np.sum(DataFrame) goes through an
# axis=None reduction whose column-wise result pandas has deprecated.
store.isnull().sum()

In [None]:
# Swap NaN for the literal string 'missing' so the NaN is not contagious
store = store.replace(np.nan, 'missing')

In [None]:
# See if there are redundancies: distinct values of each store attribute,
# to compare against the number of distinct store numbers (last line).
ns_name, ns_address, ns_city, ns_zip, ns_location, ns_county = (
    store[attribute].nunique()
    for attribute in ['Store Name', 'Address', 'City',
                      'Zip Code', 'Store Location', 'County']
)
print(f'Unique store names: {ns_name}')
print(f'Unique store address: {ns_address}')
print(f'Unique store city: {ns_city}')
print(f'Unique store zip: {ns_zip}')
print(f'Unique store location: {ns_location}')
print(f'Unique store county: {ns_county}')
store['Store Number'].nunique()

In [None]:
# Clean up store names: count how often each name is used for every store number
name_counts = store.groupby(['Store Number']).agg({'Store Name': 'value_counts'})
store_name = name_counts.rename(columns={'Store Name': 'Name Count'}).reset_index()
store_name.head()

In [None]:
# Rank the store names within each store number (1 = most frequent name).
# method='first' breaks ties by row order, so every store has exactly one rank-1
# name. The default method='average' gives tied names a fractional rank (e.g. 1.5),
# and a `Rank == 1` filter would then keep no name at all for such a store.
store_name['Rank'] = store_name.groupby('Store Number')['Name Count'].rank(
    method='first', ascending=False)
store_name.head()

In [None]:
# Keep only the top-ranked (most frequent) name per store number
is_top_name = store_name.Rank == 1
store_name_unique = store_name.loc[is_top_name].drop(columns=['Name Count', 'Rank'])

# Remove the raw name column from store ...
store = store.drop(columns=['Store Name'])

# ... and join the chosen name back on by store number
store = store.merge(store_name_unique, how='left', on='Store Number')
store.head()

In [None]:
# Clean up locations. A business could have moved over the years, so count how often
# each address occurs for every combination of store number and location fields.
location_keys = ['Store Number', 'County', 'County Number',
                 'City', 'Zip Code', 'Store Location']
address_counts = store.groupby(location_keys).agg({'Address': 'value_counts'})
store_loc = address_counts.rename(columns={'Address': 'Address Count'}).reset_index()
store_loc.head()

In [None]:
# Rank all location records within each store number (1 = most frequent address).
# method='first' breaks ties by row order, so every store has exactly one rank-1
# record. The default method='average' gives tied records a fractional rank, and a
# `Rank == 1` filter would then keep no location at all for such a store.
store_loc['Rank'] = store_loc.groupby('Store Number')['Address Count'].rank(
    method='first', ascending=False)
store_loc.head()

In [None]:

# Keep only the top-ranked (most frequent) location record per store number
is_top_location = store_loc.Rank == 1
store_loc_unique = store_loc.loc[is_top_location].drop(columns=['Address Count', 'Rank'])

# Remove the raw location columns from store ...
store = store.drop(columns=['Address', 'City', 'Zip Code', 'Store Location', 'County Number', 'County'])

# ... and join the chosen location back on by store number
store = store.merge(store_loc_unique, how='left', on='Store Number')
store.head()

In [None]:
store.drop_duplicates(inplace=True)

In [None]:
store.shape

In [None]:
# Store numbers that have at least one row with a null 'Store Location'
location_is_null = store2['Store Location'].isnull()
mis_loc_lst = store2.loc[location_is_null, 'Store Number'].unique().tolist()

In [None]:
# checking the missing in store location
def missing_loc(x, df=None):
    """Count the distinct non-null 'Store Location' values of each store in ``x``.

    Parameters
    ----------
    x : iterable
        Store numbers to look up.
    df : pandas.DataFrame, optional
        Frame with 'Store Number' and 'Store Location' columns. Defaults to the
        notebook-level ``full_raw``.

    Returns
    -------
    list of int
        One count per entry of ``x``, in the same order. The count is 0 for a store
        whose locations are all null or that does not occur in ``df``.
    """
    if df is None:
        df = full_raw
    wanted = list(x)
    # One grouped pass over the relevant rows instead of re-filtering the whole
    # frame once per store number.
    relevant = df.loc[df['Store Number'].isin(wanted)]
    n_locations = relevant.groupby('Store Number')['Store Location'].nunique()
    return n_locations.reindex(wanted, fill_value=0).tolist()
    
    
# full_raw.loc[full_raw['Store Number']==3908]['Store Location'].nunique()
# Distinct locations recorded for each store that has a missing location somewhere.
# (The original line had no assignment target, which is a syntax error.)
n_loc_missing_stores = missing_loc(mis_loc_lst)


In [None]:
price = full_raw[['Item Number', 'Date', 'State Bottle Retail', 'State Bottle Cost']]

In [None]:
price.groupby(['Item Number', 'Date']).agg({'State Bottle Retail':'mean','State Bottle Cost':'mean' }).shape

In [None]:
# Turn the 'missing' placeholder back into NaN and count nulls per column.
# DataFrame.sum() reduces column-wise by default; np.sum(DataFrame) goes through an
# axis=None reduction whose column-wise result pandas has deprecated.
store2 = store.replace('missing', np.nan)
store2.isnull().sum()

In [None]:
# np.sum(store.isnull()) # the old null without missing replacement

In [None]:
store2.drop_duplicates().shape

In [None]:
store3 = store2.drop_duplicates()

In [None]:
# Nulls per column after de-duplication (explicit column-wise DataFrame.sum()
# instead of np.sum's deprecated axis=None reduction)
store3.isnull().sum()

In [None]:
store.head()

In [None]:
store.drop_duplicates().shape

In [None]:
full_raw['Store Number'].nunique()

In [None]:
# Nulls per column in the raw data (explicit column-wise DataFrame.sum()
# instead of np.sum's deprecated axis=None reduction)
full_raw.isnull().sum()

In [None]:
full_raw.loc[full_raw['City'].isnull()]['Store Number'].nunique()

In [None]:
# Rows whose 'Store Location' is null. The commented-out suffix would instead count
# the distinct store numbers among those rows.
full_raw.loc[full_raw['Store Location'].isnull()]#['Store Number'].nunique()
# 279 stores (presumably the result of the commented-out nunique() variant)

In [None]:
full_raw['Store Number'].nunique()

In [None]:
# The original line called .replace on the list literal ['Address'], which has no
# such method. Assumed intent (TODO confirm): the raw 'Address' column with NaN
# shown as 'missing'.
full_raw['Address'].replace(np.nan, 'missing')

In [None]:
# Rows whose 'City' is null, with every NaN displayed as 'missing'. The commented-out
# suffix would instead count the distinct store numbers among those rows.
full_raw.loc[full_raw['City'].isnull()].replace(np.nan, 'missing')#['Store Number'].nunique()
# 120 Stores (presumably the result of the commented-out nunique() variant)

In [None]:
full_raw.loc[full_raw['Store Number']==4152]

In [None]:
full_raw.loc[full_raw['Store Number']==3908]

## Categories Cleaning

### Store type

In [None]:
store.head(1)

#### Create column of zeros as placeholder

In [None]:
store['Store Type'] = np.zeros(store.shape[0])

#### Categorize stores with string contains 
- order matters here

In [None]:
# (regex matched against the lower-cased store name, label) pairs.
# They are applied top to bottom and a later match overwrites an earlier one,
# so the order of this list matters.
store_type_patterns = [
    ("food|market|super valu|saver|groc",
     'Other Grocery or Convenience'),
    ("econ-o-mart|mart|quik|pit stop|quick|kwik|general store|convenience|gas|circle k|petro|stop|casey's|country store|yesway|kum|7-eleven|station|express|fill r up|fuel|new star| go |the boonedocks",
     'Convenience Store'),
    ("hy-vee|wal-mart|fareway store|super mar|big g|sac city food pride|sam's club|supermarket|shop n save|grocery|target|dahl's|costco|whole foods|jeff's|hometown|mepo",
     'Supermarket'),
    ("liquor|spirits|tobacco|beverage|smoke|bottle|distil|wine|bootleg|northside one stop|beer|cigar|distrib|booze|brew|snuff|sauce",
     'Liquor Tobacco Store'),
    ("walgreens|cvs|drug",
     'Drug Store'),
    ("casino",
     'Casino'),
]

# Build the labels in a separate object Series and assign the column once. The
# chained form `store['Store Type'].loc[mask] = ...` writes through an intermediate
# object (not guaranteed to reach `store`) and puts strings into a float column.
store_names = store['Store Name']
store_type = pd.Series('Other', index=store.index, dtype=object)  # 'Other' = no pattern matched
for pattern, label in store_type_patterns:
    # na=False: a null store name matches nothing instead of producing a NaN mask
    store_type.loc[store_names.str.contains(pattern, na=False)] = label

store['Store Type'] = store_type

In [None]:
store['Store Type'].unique()

#### Check leftover names to make more str.contains tags

In [None]:
# Leftover stores carry the label 'Other' once categorisation has run (the 0
# placeholder has been replaced by then), so filter on 'Other' rather than 0.
store.loc[store['Store Type'] == 'Other', 'Store Name'].unique()

#### Value count leftover store names to make tags for most common stores (different store id have same name)

In [None]:
# Leftover stores carry the label 'Other' once categorisation has run (the 0
# placeholder has been replaced by then), so filter on 'Other' rather than 0.
store.loc[store['Store Type'] == 'Other', 'Store Name'].value_counts()

#### Breakdown of category numbers

In [None]:
# Number of distinct store names in each store type.
# sort=False avoids ordering the group keys, which are only needed for lookup.
names_per_type = store.groupby('Store Type', sort=False)['Store Name'].nunique()

n_othergroc = names_per_type.get('Other Grocery or Convenience', 0)
n_conv = names_per_type.get('Convenience Store', 0)
n_super = names_per_type.get('Supermarket', 0)
n_liq = names_per_type.get('Liquor Tobacco Store', 0)
n_drug = names_per_type.get('Drug Store', 0)
n_casino = names_per_type.get('Casino', 0)
# The uncategorised remainder is labelled 'Other' by the categorisation step; the
# previous `== 0` test could no longer match anything and always reported 0.
n_remainder = names_per_type.get('Other', 0)

print(f'Number of Other Grocery and Convenience: {n_othergroc}')
print(f'Number of Convenience: {n_conv}')
print(f'Number of Supermarket: {n_super}')
print(f'Number of Liquor and Tobacco Stores: {n_liq}')
print(f'Number of Drug Stores: {n_drug}')
print(f'Number of Casinos: {n_casino}')
print(f'Number of Remainder: {n_remainder}')

In [None]:
store.head()

In [None]:
# Label any store still holding the 0 placeholder as 'Other'. The test must be on
# 'Store Type' (the original compared 'Store Name' with 0, which never matches), and
# a single .loc call on the frame replaces the chained assignment.
store.loc[store['Store Type'] == 0, 'Store Type'] = 'Other'

In [None]:
store.sample(20)

In [None]:
store.to_csv('data/store.csv',index_label=False)

In [None]:
store.shape

In [None]:
store_final=store.drop_duplicates()

In [None]:
store_final.shape

In [None]:

store_final.to_csv('data/store.csv',index_label=False)

In [None]:
store_final.shape

In [1]:
# Reload the cleaned store table written to data/store.csv.
# The import cell at the top must have been run first; the NameError recorded below
# comes from executing this cell in a kernel where `pd` was not yet defined.
store =pd.read_csv('data/store.csv')
store.head()

NameError: name 'pd' is not defined

In [None]:
# Load the cleaned product table. 'product_id' is later used as the merge key against
# 'Item Number', which is read as object (text) from the raw file; reading product_id
# as object too keeps both keys the same dtype (pandas refuses to merge an object key
# with a numeric one).
product = pd.read_csv('data/product_clean_categorized.csv',
                      dtype={'product_id': object})
product.head()

In [None]:
full_raw.columns

In [None]:
# Drop these raw columns, then left-join the cleaned store table on store number
raw_cols_to_drop = ['Invoice/Item Number', 'Store Name', 'Address', 'City',
                    'Zip Code', 'Store Location', 'County Number', 'County',
                    'Category Name', 'Item Description', 'Bottle Volume (ml)',
                    'Category', 'Volume Sold (Gallons)']
sales_slim = full_raw.drop(columns=raw_cols_to_drop)
temp = sales_slim.merge(store, how='left', on='Store Number')

In [None]:
temp.head()

In [None]:
temp = pd.merge(temp, product, how='left', left_on='Item Number', right_on='product_id')

In [None]:
temp.head()

In [None]:
# The original line was an unfinished assignment (`combined = `), a syntax error.
# Assumed intent (TODO confirm): a name for the sales + store + product join above.
combined = temp

In [None]:
# Total volume sold (liters) per store type and per month of the 'Date' column
by_month = pd.Grouper(key='Date', freq='M')
volume_only = temp.groupby(['Store Type', by_month])[['Volume Sold (Liters)']]
store_type_vol = volume_only.sum()

In [None]:
store_type_vol.head()

In [None]:
store_type_vol.reset_index(inplace=True)

In [None]:
store_type_vol_p = store_type_vol.pivot(index='Date', columns = 'Store Type', values = 'Volume Sold (Liters)')

In [None]:
store_type_vol_p.reset_index(inplace=True)

In [None]:
store_type_vol_p

### Plot to show over time volume sales by store type

In [None]:
# One line per store type over time. 'Date' is an ordinary column after
# reset_index, so it must be left out of y: otherwise the x column is also passed as
# a series to plot, mixing datetimes with the volume columns.
store_type_cols = [col for col in store_type_vol_p.columns if col != 'Date']
fig = px.line(store_type_vol_p, x='Date', y=store_type_cols)
fig

In [None]:
# Summary statistics per store type on a 100,000-row sample. random_state fixes the
# sample, so re-running the notebook reproduces the same table.
id_cols = ['Store Number', 'Vendor Number', 'County Number', 'category_id']
temp.drop(id_cols, axis=1).sample(100000, random_state=0).groupby('Store Type').describe()

In [None]:
# fig = px.bar(temp, x='Store Type', y='Volume Sold (Liters)')
# fig

In [None]:
# Median volume sale per store type, smallest first
median_vol_by_store_type = (
    temp[['Store Type', 'Volume Sold (Liters)']]
    .groupby('Store Type')
    .agg('median')
    .reset_index()
    .sort_values(by='Volume Sold (Liters)')
)
fig = px.bar(median_vol_by_store_type, x='Store Type', y='Volume Sold (Liters)')
fig

In [None]:
# Median volume sale per product category, smallest first
median_vol_by_category = (
    temp[['category_new', 'Volume Sold (Liters)']]
    .groupby('category_new')
    .agg('median')
    .reset_index()
    .sort_values(by='Volume Sold (Liters)')
)
fig = px.bar(median_vol_by_category, x='category_new', y='Volume Sold (Liters)')
fig

In [None]:
import plotly.graph_objects as go

# Median volume sale per product category, smallest first
median_vol_by_category = (
    temp[['category_new', 'Volume Sold (Liters)']]
    .groupby('category_new')
    .agg('median')
    .reset_index()
    .sort_values(by='Volume Sold (Liters)')
)
# go.Figure takes trace objects in `data`; it does not accept a DataFrame together
# with x/y column names (that is the plotly.express calling style), so build the
# Bar trace explicitly.
fig = go.Figure(data=[go.Bar(x=median_vol_by_category['category_new'],
                             y=median_vol_by_category['Volume Sold (Liters)'])])
fig

In [None]:
store_type_vol.head()

In [None]:
### What stores selling what products
# group by store type and product type count_values
#

In [None]:
# Total volume sold (liters) for every (store type, product category) pair
volume_by_pair = temp.groupby(['Store Type', 'category_new'])[['Volume Sold (Liters)']].sum()
store_prod = volume_by_pair.reset_index()

In [None]:
store_prod.head()

In [None]:
fig = px.bar(store_prod, x='Store Type', y='Volume Sold (Liters)', color='category_new', barmode='relative')

fig

In [None]:
fig = px.bar(store_prod, color='Store Type', y='Volume Sold (Liters)', x='category_new')
fig

In [None]:
# The original line was a method chain with no object in front of it (a syntax
# error). Assumed intent (TODO confirm): how often each address occurs per store
# number in the store table.
store.groupby('Store Number')['Address'].value_counts().rename('Address Count').reset_index()

In [None]:
# Stratified downsampling for boxplots with respect to categories
# Fusion of categories downsampling
# How to prove that you did not introduce bias


In [None]:
# like stratified k fold downsampling
# sample and write loops 
# distributions the same before and after downsampling