# Pre Processing Iowa Liquor Database

1. Dealing with nan's
2. Expand the date column
3. One Hot Encoding categorical data
3. Rank stores by rtd & mezcal
4. Test Train Split
5. StandardScaler

In [1]:
# libraries
import pandas as pd
import numpy as np

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split

pd.set_option("display.max_rows", 99)

In [2]:
# load data
iowa = pd.read_csv('iowa_eda.csv')

In [3]:
iowa.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4807658 entries, 0 to 4807657
Data columns (total 23 columns):
 #   Column                    Dtype  
---  ------                    -----  
 0   Date                      object 
 1   Address                   object 
 2   City                      object 
 3   Zip Code                  int64  
 4   County                    object 
 5   Category Name             object 
 6   Item Number               int64  
 7   Item Description          object 
 8   Bottle Volume (ml)        int64  
 9   State Bottle Cost         float64
 10  State Bottle Retail       float64
 11  Bottles Sold              int64  
 12  Sale (Dollars)            float64
 13  Volume Sold (Liters)      float64
 14  Store Name                object 
 15  Category                  float64
 16  Vendor Name               object 
 17  Profit / Item             float64
 18  Profit / Invoice          float64
 19  Profit / ml               float64
 20  Profit / Invoice / Liter

In [4]:
iowa.head()

Unnamed: 0,Date,Address,City,Zip Code,County,Category Name,Item Number,Item Description,Bottle Volume (ml),State Bottle Cost,...,Volume Sold (Liters),Store Name,Category,Vendor Name,Profit / Item,Profit / Invoice,Profit / ml,Profit / Invoice / Liter,Retail Price / ml,Bottle Price Category
0,2018-09-04,804 Nile Kinnick Dr,ADEL,50003,DALLAS,VODKA,34821,Svedka Vodka,1750,13.0,...,525.0,Fareway stores #124 / adel,1032230.0,"CONSTELLATION WINE COMPANY, INC.",6.5,1950.0,0.003714,3.714286,0.011143,Normal
1,2018-09-04,804 Nile Kinnick Dr,ADEL,50003,DALLAS,BOURBON WHISKY,27408,Jim Beam Honey Mini,500,5.4,...,6.0,Fareway stores #124 / adel,1011500.0,JIM BEAM BRANDS,2.7,32.4,0.0054,5.4,0.0162,Inexpensive
2,2018-09-04,804 Nile Kinnick Dr,ADEL,50003,DALLAS,RYE WHISKY,27025,Bulleit 95 Rye,750,16.0,...,9.0,Fareway stores #124 / adel,1081500.0,DIAGEO AMERICAS,8.0,96.0,0.010667,10.666667,0.032,Normal
3,2018-09-04,804 Nile Kinnick Dr,ADEL,50003,DALLAS,SCOTCH,10008,Scoresby Rare Scotch,1750,10.5,...,10.5,Fareway stores #124 / adel,1012300.0,DIAGEO AMERICAS,5.25,31.5,0.003,3.0,0.009,Normal
4,2018-09-04,409 Nile Kinnick Dr,ADEL,50003,DALLAS,FLAVORED WHISKY,64858,Fireball Cinnamon Whiskey Mini Dispenser,3000,30.0,...,12.0,Kum & go #2093 / adel,1081600.0,SAZERAC COMPANY INC,15.0,60.0,0.005,5.0,0.015,Normal


### Preprocessing 1: dealing with nan's

In [5]:
# dealing with nans
iowa.isna().sum()

Date                           0
Address                        0
City                           0
Zip Code                       0
County                         0
Category Name               5726
Item Number                    0
Item Description               0
Bottle Volume (ml)             0
State Bottle Cost              2
State Bottle Retail            2
Bottles Sold                   0
Sale (Dollars)                 2
Volume Sold (Liters)           0
Store Name                     0
Category                       0
Vendor Name                    1
Profit / Item                  2
Profit / Invoice               2
Profit / ml                    2
Profit / Invoice / Liter       2
Retail Price / ml              2
Bottle Price Category          0
dtype: int64

#### Category will need to be dealt with. The rest are so few they will be dropped.

In [6]:
# fill category name by grouping them by their category number
iowa['Category Name'] = iowa.groupby('Category')['Category Name'].ffill().bfill()

In [7]:
iowa.isna().sum()

Date                        0
Address                     0
City                        0
Zip Code                    0
County                      0
Category Name               0
Item Number                 0
Item Description            0
Bottle Volume (ml)          0
State Bottle Cost           2
State Bottle Retail         2
Bottles Sold                0
Sale (Dollars)              2
Volume Sold (Liters)        0
Store Name                  0
Category                    0
Vendor Name                 1
Profit / Item               2
Profit / Invoice            2
Profit / ml                 2
Profit / Invoice / Liter    2
Retail Price / ml           2
Bottle Price Category       0
dtype: int64

In [8]:
# remove the rest
iowa = iowa.dropna()

### Pre Processing 2: Expand the Date column

In [9]:
# create columns for date info
iowa[['Date_year', 'Date_month', 'Date_day']] = iowa['Date'].str.split("-", expand = True)

In [10]:
# fix the data type. for the new columns because they are strings of numbers
iowa[['Date_year', 'Date_month', 'Date_day']] = iowa[['Date_year', 'Date_month', 'Date_day']].apply(pd.to_numeric)

### Pre Processing 3: Rank entries

In [30]:
ranking

Unnamed: 0,Store Name,Bottles Sold,Item Description
0,Central city 2,2966,12
1,Hy-vee #3 / bdi / des moines,2852,27
2,"Central city liquor, inc.",1493,25
3,Hy-vee food store / urbandale,907,9
4,Ingersoll liquor and beverage,678,13
5,Hy-vee # 6/ des moines,484,18
6,Hy-vee food store / fleur / dsm,389,8
7,Tequila's liquor store,359,18
8,Hy-vee wine and spirits / wdm,311,5
9,Hy-vee #4 / wdm,276,7


In [31]:
###
# find out which stores carry the most mezcal products
###

# create a df of only the mezcal entries
mezcal = iowa[iowa['Category Name'] == 'MEZCAL']

# take the mezcal entries, group them by store name and order them by highest variety of products
mezcal_bottles = pd.DataFrame(mezcal.groupby('Store Name')['Bottles Sold'].sum().sort_values(ascending=False))

# take the mezcal entries, group them by store name and order them by highest variety of products
mezcal_variety = pd.DataFrame(mezcal.groupby('Store Name')['Item Description'].nunique().sort_values(ascending=False))

# access the grouped columns
mezcal_bottles = mezcal_bottles.reset_index()
mezcal_variety = mezcal_variety.reset_index()

# join the individual tables
ranking = pd.merge(mezcal_bottles, mezcal_variety, on='Store Name')


####
# find out which stores dont carry the mezcal products
###


# create a df of no mezcal
no_mezcal = iowa[iowa['Category Name'] != 'MEZCAL']

# take the mezcal entries, get the store names
no_mezcal = pd.DataFrame(no_mezcal['Store Name'].unique())

#rename the store name column
no_mezcal = no_mezcal.rename(columns={0:'Store Name'})

# merge the two df's about mezcal
ranking = pd.concat([ranking, no_mezcal], ignore_index=True)

# fill nan's created in the join
ranking = ranking.fillna(0)

# rank them and store that in 
ranking['mezcal_bottles_rank'] = ranking['Bottles Sold'].rank(ascending = 0)
ranking['mezcal_variety_rank'] = ranking['Item Description'].rank(ascending = 0)

# create a ranking average, add them together divide by 2
ranking['total_ranking'] = (ranking['mezcal_bottles_rank'] + ranking['mezcal_variety_rank']) / 2 


In [33]:
ranking.sort_values('total_ranking')

Unnamed: 0,Store Name,Bottles Sold,Item Description,mezcal_bottles_rank,mezcal_variety_rank,total_ranking
1,Hy-vee #3 / bdi / des moines,2852.0,27.0,2.0,1.0,1.50
2,"Central city liquor, inc.",1493.0,25.0,3.0,2.0,2.50
0,Central city 2,2966.0,12.0,1.0,6.5,3.75
5,Hy-vee # 6/ des moines,484.0,18.0,6.0,3.5,4.75
4,Ingersoll liquor and beverage,678.0,13.0,5.0,5.0,5.00
...,...,...,...,...,...,...
223,Scott's foods,0.0,0.0,300.5,300.5,300.50
222,Casey's general store #3422 / norwal,0.0,0.0,300.5,300.5,300.50
221,Kum & go #240 / north ave norwalk,0.0,0.0,300.5,300.5,300.50
219,Kum & go #28 / norwalk,0.0,0.0,300.5,300.5,300.50


### Pre Processing 4: One Hot Encoding categorical data

In [14]:
# create numerical data for categorical data

one_hot = OneHotEncoder(sparse=False)

# create a df for the one hotted columns
iowa_one_hot = pd.DataFrame(one_hot.fit_transform(iowa[['City', 'County', 'Bottle Price Category', 'Zip Code']]))

iowa_one_hot.columns = one_hot.get_feature_names_out(['City', 'County', 'Bottle Price Category', 'Zip Code'])

# merge the one hot columns to the rest of the df
one_hot_iowa = pd.concat([iowa, iowa_one_hot])

ValueError: Length mismatch: Expected axis has 1 elements, new values have 105 elements

### Pre Processing 4: Test Train Split

### Pre Processing 5: Standard Scaler

In [None]:
scaler = StandardScaler()
scaler.fit_transform(iowa[['Bottle Volume (ml)', 'State Bottle Cost', 'State Bottle Retail', 'Bottles Sold', 'Sale (Dollars)', 'Volume Sold (Liters)', 'Profit / Item', 'Profit / ml']])
