## EDA: Iowa Liquor Sales

In [48]:
# load packages

import pandas as pd
import numpy as np

pd.set_option("display.max_rows", 999)

In [3]:
# load data
iowa = pd.read_csv('iowa_clean.csv')

In [4]:
iowa.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22812222 entries, 0 to 22812221
Data columns (total 17 columns):
 #   Column                Dtype  
---  ------                -----  
 0   Unnamed: 0            int64  
 1   Date                  object 
 2   City                  object 
 3   Zip Code              int64  
 4   County                object 
 5   Item Number           int64  
 6   Item Description      object 
 7   Bottle Volume (ml)    int64  
 8   State Bottle Cost     float64
 9   State Bottle Retail   float64
 10  Bottles Sold          int64  
 11  Sale (Dollars)        float64
 12  Volume Sold (Liters)  float64
 13  Store Name            object 
 14  Category Name         object 
 15  Category              float64
 16  Vendor Name           object 
dtypes: float64(5), int64(5), object(7)
memory usage: 2.9+ GB


In [5]:
iowa['Date'] = pd.to_datetime(iowa['Date'], format='%Y/%m/%d')

### Objectives:
1. Create columns for profit/ price variations
2. What does it mean to be expensive?
3. Where are expensive and inexpensive bottles sold? 
4. How is liquor type distributed across Iowa?
5. Create individual year csv files for working within tableau row restrictions

### 1. Create columns for profit variations

In [13]:
# create profit columns

# profit / bottle
iowa['Profit / Item'] = iowa['State Bottle Retail'] - iowa['State Bottle Cost']

# profit / invoice
iowa['Profit / Invoice'] = iowa['Bottles Sold'] * iowa['Profit / Item']

# profit / ml
iowa['Profit / ml'] = iowa['Profit / Item'] / iowa['Bottle Volume (ml)']

# profit / invoice / liter
iowa['Profit / Invoice / Liter'] = iowa['Profit / Invoice'] / iowa['Volume Sold (Liters)']

# retail price per ml
iowa['Retail Price / ml'] = iowa['State Bottle Retail'] / iowa['Bottle Volume (ml)']

In [14]:
iowa.head().T

Unnamed: 0,0,1,2,3,4
Unnamed: 0,0,1,2,3,4
Date,2018-09-24 00:00:00,2018-09-24 00:00:00,2018-09-24 00:00:00,2018-09-24 00:00:00,2018-09-24 00:00:00
City,ADAIR,ADAIR,ADAIR,ADAIR,ADAIR
Zip Code,50002,50002,50002,50002,50002
County,ADAIR,ADAIR,ADAIR,ADAIR,ADAIR
Item Number,86691,25606,37994,36304,26826
Item Description,Jack Daniels Tennessee Fire Mini,Seagrams 7 Crown Bl Whiskey,Smirnoff 80prf,Hawkeye Vodka,Jack Daniels Old #7 Black Lbl
Bottle Volume (ml),500,750,375,375,750
State Bottle Cost,9.06,7.0,4.75,1.86,15.57
State Bottle Retail,13.59,10.5,7.13,2.79,23.36


### 3. Where are expensive bottles sold? 

We need to come up with a definition or criteria for what it means to be expensive. 

I believe to be expensive it should be both: higher overall retail price, and higher in retail price per ml.

Now what means something is higher in a category?

Let's use standard deviations.

### Label creation
expensive 1 std
very expensive 2 std

In [15]:
print('Price per ml\n','Inexpensive Threshold: ', np.mean(iowa['Retail Price / ml']),'\n',
      'Normal Range: ', np.mean(iowa['Retail Price / ml']),'-', np.mean(iowa['Retail Price / ml']) + np.std(iowa['Retail Price / ml']),'\n',
      'Expensive Threshold: ', np.mean(iowa['Retail Price / ml']) + np.std(iowa['Retail Price / ml']),'\n',
      'Very Expensive Threshold: ', np.mean(iowa['Retail Price / ml']) + (2*np.std(iowa['Retail Price / ml']))
     )

Price per ml
 Inexpensive Threshold:  0.02686969062943565 
 Normal Range:  0.02686969062943565 - 0.09871797460224166 
 Expensive Threshold:  0.09871797460224166 
 Very Expensive Threshold:  0.17056625857504767


In [16]:
print('Bottle Prices\n','Inexpensive Threshold: ', np.mean(iowa['State Bottle Retail']),'\n',
      'Normal Range: ', np.mean(iowa['State Bottle Retail']),'-', np.mean(iowa['State Bottle Retail']) + np.std(iowa['State Bottle Retail']),'\n',
      'Expensive Threshold: ', np.mean(iowa['State Bottle Retail']) + np.std(iowa['State Bottle Retail']),'\n',
      'Very Expensive Threshold: ', np.mean(iowa['State Bottle Retail']) + (2*np.std(iowa['State Bottle Retail']))
     )

Bottle Prices
 Inexpensive Threshold:  15.212815194955656 
 Normal Range:  15.212815194955656 - 31.385144827853033 
 Expensive Threshold:  31.385144827853033 
 Very Expensive Threshold:  47.55747446075041


In [17]:
# variables for label creation
inexp_ml = 0.02686969062943565
exp_ml = 0.09871797460224166
very_exp_ml = 0.17056625857504767

inexp_bottle = 15.212815194955656 
exp_bottle = 31.385144827853033 
very_exp_bottle = 47.55747446075041

In [18]:
def price(entry):
    
    #very expensive criteria
    if (entry['Retail Price / ml'] >= very_exp_ml) & (entry['State Bottle Retail'] >= very_exp_bottle):
        return 'Very expensive'
    
    # expensive criteria
    elif (entry['Retail Price / ml'] >= exp_ml) & (entry['State Bottle Retail'] >= exp_bottle):
        return 'Expensive'

    # inexpensive criteria
    elif (entry['Retail Price / ml'] <= inexp_ml) & (entry['State Bottle Retail'] <= inexp_bottle):
        return 'Inexpensive'
    
    # normal is everything else
    else:
        return 'Normal'

In [19]:
# create a new column for pricing category
iowa['Bottle Price Category'] = iowa.apply(lambda x: price(x), axis=1)

In [20]:
# How do people buy alcohol?
iowa['Bottle Price Category'].value_counts()

Inexpensive       13084290
Normal             9590484
Expensive           102458
Very expensive       34990
Name: Bottle Price Category, dtype: int64

### Observation:
1. I set the criteria for inexpensive to be below average. So about half should be included.
2. None of these categories are over 15M rows (tableau cut off)

In [42]:
# groupby exploration function

def grouping(df, col_one, col_two, col_three):
    
    # Group 1
    group = df.groupby([col_one, col_two]).agg({col_three : 'sum'})
    
    # turning group into percentages
    g_group = group.groupby(level=0).apply(lambda x: 100 * x / float(x.sum()))
    
    return g_group

In [43]:
county_BPC = grouping(iowa, 'County', 'Bottle Price Category', 'Bottles Sold')

In [52]:
county_BPC

Unnamed: 0_level_0,Unnamed: 1_level_0,Bottles Sold
County,Bottle Price Category,Unnamed: 2_level_1
ADAIR,Expensive,0.18563
ADAIR,Inexpensive,68.420424
ADAIR,Normal,31.371993
ADAIR,Very expensive,0.021953
ADAMS,Expensive,0.101086
ADAMS,Inexpensive,63.033714
ADAMS,Normal,36.854862
ADAMS,Very expensive,0.010338
ALLAMAKEE,Expensive,0.035917
ALLAMAKEE,Inexpensive,61.28906


### Observation:
1. Dallas is has the highest percentage of expensive bottles sold at 0.224%
2. Fremont has a massive outlier of 0.25% for Very expensive bottles sold

In [53]:
cate_type = pd.DataFrame(grouping(iowa, 'Category Name', 'Bottle Price Category', 'Bottles Sold'))
cate_type

Unnamed: 0_level_0,Unnamed: 1_level_0,Bottles Sold
Category Name,Bottle Price Category,Unnamed: 2_level_1
BOURBON WHISKY,Expensive,0.083582
BOURBON WHISKY,Inexpensive,35.273429
BOURBON WHISKY,Normal,64.634862
BOURBON WHISKY,Very expensive,0.008128
BRANDY,Expensive,0.052541
BRANDY,Inexpensive,56.944153
BRANDY,Normal,42.913085
BRANDY,Very expensive,0.09022
CANADIAN WHISKY,Expensive,0.033829
CANADIAN WHISKY,Inexpensive,52.929789


### Observation
1. Some categories do not have expensive or very expensive
2. mezcal doesn't have inexpensive
3. mezcal 1.03% Expensive
4. high proof beer is only expensive and very expensive
5. Iowa Local has the highest Very Expensive 0.7%

In [75]:
# group by zip 
# avg bottle prices
# largest
pd.DataFrame(iowa.groupby(['City','Zip Code'])['State Bottle Retail'].mean()).nlargest(20, 'State Bottle Retail')

Unnamed: 0_level_0,Unnamed: 1_level_0,State Bottle Retail
City,Zip Code,Unnamed: 2_level_1
ALLERTON,50008,44.43818
EARLING,51530,24.93544
LUXEMBURG,52056,22.221368
SHUEYVILLE,52338,22.067589
CEDAR RAPIDS,52401,21.073991
ST LUCAS,52166,20.67686
MORAVIA,52571,20.184365
WHITTEMORE,50598,19.873916
VAN HORNE,52346,19.843253
PLEASANT HILL,50438,19.678333


In [73]:
# group by zip
# avg bottle prices
# smallest
pd.DataFrame(iowa.groupby(['Zip Code', 'City'])['State Bottle Retail'].mean()).nsmallest(20, 'State Bottle Retail')

Unnamed: 0_level_0,Unnamed: 1_level_0,State Bottle Retail
Zip Code,City,Unnamed: 2_level_1
50044,BUSSEY,7.314627
50336,BOONE,10.09825
52303,CEDAR RAPIDS,10.354822
52004,DUBUQUE,10.744518
52223,DELHI,10.78
52248,KEOTA,10.796196
52049,GARNAVILLO,10.808351
51005,AURELIA,10.828124
50206,NEWTON,10.847404
50529,DAKOTA CITY,10.910359


### Observation:
1. Bottle Price per zip  varies from $44 to $7 
2. Overall average bottle price is $14

Thoughts something might be wrong with the 50008 invoices.

<b> Dallas who has the highest % of all bottles labeled expensive is 2nd

### 3. How is liquor Type distributed across the state?

In [103]:
# type explore
pd.DataFrame(iowa.groupby(['Category Name']).agg({'Bottles Sold':'sum', 'State Bottle Retail':'mean', 'Store Name':'nunique', 'County':'nunique', 'City':'nunique','Zip Code': 'nunique', })).sort_values(by='Bottles Sold',ascending=False)

Unnamed: 0_level_0,Bottles Sold,State Bottle Retail,Store Name,County,City,Zip Code
Category Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
VODKA,60813562,12.304692,2532,99,451,495
CANADIAN WHISKY,26677962,14.748443,2517,99,450,495
RUM,23996598,13.192256,2522,99,452,496
WHISKY,18742294,19.177461,2539,99,451,494
FLAVORED WHISKY,17281360,17.579011,2499,99,449,494
SCHNAPPS,13900906,10.980206,2480,99,447,492
BRANDY,12911726,13.725677,2477,99,446,491
BOURBON WHISKY,10925819,18.713763,2496,99,450,494
LIQUEUR,10898070,18.164068,2502,99,450,495
TEQUILA,10645078,21.70905,2502,99,449,494


### Observation
1. Whisky has the most different stores at: 2539
2. around 2500 means ubiquitous
3. of ubiquitous categories tequila has the highest average retail price
4. Mezcal is only represented in 456 different stores,while has \$30 avg per bottle.
5. Scotch has avg is \$28.89 and is in 1950 stores
6. High alcohol Beer has the fewest retail locations, and the average price is by far the highest. I believe all high alcohol beer is in a keg form.

In [102]:
# which store buys the most per category
iowa.groupby(['Category Name'])['Store Name'].value_counts

<bound method SeriesGroupBy.value_counts of <pandas.core.groupby.generic.SeriesGroupBy object at 0x7fe44ad41af0>>

In [91]:
mezcal = iowa[iowa['Category Name'] == 'MEZCAL']

In [95]:
mezcal.groupby(['Store Name'])['Bottles Sold'].sum().nlargest()

Store Name
Central city 2                                     2966
Hy-vee #3 / bdi / des moines                       2850
Benz distributing                                  2545
Central city liquor, inc.                          1493
Hy-vee wine and spirits / iowa city                1483
John's grocery                                      976
Hy-vee food store / urbandale                       907
Ingersoll liquor and beverage                       678
Hy-vee food store / coralville                      653
Wilkie liquors                                      651
Hy-vee wine and spirits / bettendorf                618
Cyclone liquors                                     590
Hy-vee #3 food and drugstore                        508
Hy-vee # 6/ des moines                              484
Bootleggin' barzini's fin                           440
Bootlegging barzinis                                435
Hy-vee #2 / coralville                              411
Happy's wine & spirits               

In [97]:
# Des Moines sales by zip
DM_sales_zip = iowa[iowa['City'] == 'DES MOINES']
DM_sales_zip = DM_sales_zip.groupby('Zip Code').agg({'Volume Sold (Liters)': 'sum', 'Sale (Dollars)': 'sum'}).sort_values(by='Volume Sold (Liters)', ascending=False)
DM_sales_zip

Unnamed: 0_level_0,Volume Sold (Liters),Sale (Dollars)
Zip Code,Unnamed: 1_level_1,Unnamed: 2_level_1
50314,6383718.14,119029200.0
50320,6232585.65,107893600.0
50315,2077120.69,32046510.0
50317,2046424.418,31632260.0
50310,1369804.01,19681800.0
50321,1278194.751,20109260.0
50311,779200.738,12244430.0
50316,753468.658,13215160.0
50312,632946.972,12482840.0
50313,559632.155,8107857.0


### 2. Create CSV files for Tableau

In [98]:
# creating csv files for individual year subsets

# range of years
for year in np.arange(2012,2022):
    
    #subsetting of years from 'Date' column
   name = iowa.loc[(iowa['Date'] > (str(year-1)+'-12-31')) & (iowa['Date'] < (str(year+1)+'-01-01'))]

    # assign df subset to a variable
   vars()['iowa_' + str(year)]=name

    # save that subset as a csv file
   vars()['iowa_' + str(year)].to_csv(('iowa_' + str(year) + '.csv'))

In [99]:
# Bottle Price Category split

inexpensive = iowa[iowa['Bottle Price Category'] == 'Inexpensive']
normal_expensive = iowa[iowa['Bottle Price Category'] != 'Inexpensive']

In [100]:
inexpensive.to_csv('inexpensive.csv',index=False)
normal_expensive.to_csv('expensive.csv',index=False)

In [101]:
iowa.to_csv('iowa_eda.csv',index=False)