# Pre Processing Iowa Liquor Database

1. Dealing with nan's
2. Expand the date column
3. Rank stores by rtd & mezcal
3. Create function for liquor searching
3. One Hot Encoding categorical data
4. Test Train Split
5. StandardScaler

In [20]:
# libraries
import pandas as pd
import numpy as np

from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.model_selection import train_test_split

pd.set_option("display.max_rows", 999)

In [2]:
# load data
iowa = pd.read_csv('iowa_eda.csv')

In [3]:
iowa.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4807658 entries, 0 to 4807657
Data columns (total 23 columns):
 #   Column                    Dtype  
---  ------                    -----  
 0   Date                      object 
 1   Address                   object 
 2   City                      object 
 3   Zip Code                  int64  
 4   County                    object 
 5   Category Name             object 
 6   Item Number               int64  
 7   Item Description          object 
 8   Bottle Volume (ml)        int64  
 9   State Bottle Cost         float64
 10  State Bottle Retail       float64
 11  Bottles Sold              int64  
 12  Sale (Dollars)            float64
 13  Volume Sold (Liters)      float64
 14  Store Name                object 
 15  Category                  float64
 16  Vendor Name               object 
 17  Profit / Item             float64
 18  Profit / Invoice          float64
 19  Profit / ml               float64
 20  Profit / Invoice / Liter

In [5]:
iowa.head()

Unnamed: 0,Date,Address,City,Zip Code,County,Category Name,Item Number,Item Description,Bottle Volume (ml),State Bottle Cost,...,Volume Sold (Liters),Store Name,Category,Vendor Name,Profit / Item,Profit / Invoice,Profit / ml,Profit / Invoice / Liter,Retail Price / ml,Bottle Price Category
0,2018-09-04,804 Nile Kinnick Dr,ADEL,50003,DALLAS,VODKA,34821,Svedka Vodka,1750,13.0,...,525.0,Fareway stores #124 / adel,1032230.0,"CONSTELLATION WINE COMPANY, INC.",6.5,1950.0,0.003714,3.714286,0.011143,Normal
1,2018-09-04,804 Nile Kinnick Dr,ADEL,50003,DALLAS,BOURBON WHISKY,27408,Jim Beam Honey Mini,500,5.4,...,6.0,Fareway stores #124 / adel,1011500.0,JIM BEAM BRANDS,2.7,32.4,0.0054,5.4,0.0162,Inexpensive
2,2018-09-04,804 Nile Kinnick Dr,ADEL,50003,DALLAS,RYE WHISKY,27025,Bulleit 95 Rye,750,16.0,...,9.0,Fareway stores #124 / adel,1081500.0,DIAGEO AMERICAS,8.0,96.0,0.010667,10.666667,0.032,Normal
3,2018-09-04,804 Nile Kinnick Dr,ADEL,50003,DALLAS,SCOTCH,10008,Scoresby Rare Scotch,1750,10.5,...,10.5,Fareway stores #124 / adel,1012300.0,DIAGEO AMERICAS,5.25,31.5,0.003,3.0,0.009,Normal
4,2018-09-04,409 Nile Kinnick Dr,ADEL,50003,DALLAS,FLAVORED WHISKY,64858,Fireball Cinnamon Whiskey Mini Dispenser,3000,30.0,...,12.0,Kum & go #2093 / adel,1081600.0,SAZERAC COMPANY INC,15.0,60.0,0.005,5.0,0.015,Normal


### Preprocessing 1: dealing with nan's

In [6]:
# dealing with nans
iowa.isna().sum()

Date                           0
Address                        0
City                           0
Zip Code                       0
County                         0
Category Name               5726
Item Number                    0
Item Description               0
Bottle Volume (ml)             0
State Bottle Cost              2
State Bottle Retail            2
Bottles Sold                   0
Sale (Dollars)                 2
Volume Sold (Liters)           0
Store Name                     0
Category                       0
Vendor Name                    1
Profit / Item                  2
Profit / Invoice               2
Profit / ml                    2
Profit / Invoice / Liter       2
Retail Price / ml              2
Bottle Price Category          0
dtype: int64

#### Category will need to be dealt with. The rest are so few they will be dropped.

In [7]:
# fill category name by grouping them by their category number
iowa['Category Name'] = iowa.groupby('Category')['Category Name'].ffill().bfill()

In [8]:
iowa.isna().sum()

Date                        0
Address                     0
City                        0
Zip Code                    0
County                      0
Category Name               0
Item Number                 0
Item Description            0
Bottle Volume (ml)          0
State Bottle Cost           2
State Bottle Retail         2
Bottles Sold                0
Sale (Dollars)              2
Volume Sold (Liters)        0
Store Name                  0
Category                    0
Vendor Name                 1
Profit / Item               2
Profit / Invoice            2
Profit / ml                 2
Profit / Invoice / Liter    2
Retail Price / ml           2
Bottle Price Category       0
dtype: int64

In [9]:
# remove the rest
iowa = iowa.dropna()

### Pre Processing 2: Expand the Date column

In [10]:
# create columns for date info
iowa[['Date_year', 'Date_month', 'Date_day']] = iowa['Date'].str.split("-", expand = True)

In [11]:
# fix the data type. for the new columns because they are strings of numbers
iowa[['Date_year', 'Date_month', 'Date_day']] = iowa[['Date_year', 'Date_month', 'Date_day']].apply(pd.to_numeric)

### Pre Processing 3: Rank entries

In [12]:
###
# find out which stores carry the most mezcal products
###

# create a df of only the mezcal entries
mezcal = iowa[iowa['Category Name'] == 'MEZCAL']

# take the mezcal entries, group them by store name and order them by highest variety of products
mezcal_bottles = pd.DataFrame(mezcal.groupby('Store Name')['Bottles Sold'].sum().sort_values(ascending=False))

# take the mezcal entries, group them by store name and order them by highest variety of products
mezcal_variety = pd.DataFrame(mezcal.groupby('Store Name')['Item Description'].nunique().sort_values(ascending=False))

# access the grouped columns
mezcal_bottles = mezcal_bottles.reset_index()
mezcal_variety = mezcal_variety.reset_index()

# join the individual tables
ranking = pd.merge(mezcal_bottles, mezcal_variety, on='Store Name')


####
# find out which stores dont carry the mezcal products
###


# create a df of no mezcal
no_mezcal = iowa[iowa['Category Name'] != 'MEZCAL']

# take the mezcal entries, get the store names
no_mezcal = pd.DataFrame(no_mezcal['Store Name'].unique())

#rename the store name column
no_mezcal = no_mezcal.rename(columns={0:'Store Name'})

# merge the two df's about mezcal
ranking = pd.concat([ranking, no_mezcal], ignore_index=True)

# fill nan's created in the join
ranking = ranking.fillna(0)

# rank them and store that in 
ranking['mezcal_sales_rank'] = ranking['Bottles Sold'].rank(ascending = 0)
ranking['mezcal_variety_rank'] = ranking['Item Description'].rank(ascending = 0)

# create a ranking average, add them together divide by 2
ranking['total_ranking'] = (ranking['mezcal_sales_rank'] * ranking['mezcal_variety_rank'])


In [14]:
# create a dictionary for mapping 
rank_map_dict = dict(zip(ranking['Store Name'], ranking['total_ranking']))

In [15]:
# map over the ranking value
iowa['total_ranking'] = iowa['Store Name'].replace(rank_map_dict.keys(), list(map(str, rank_map_dict.values())), regex=True)

In [16]:
iowa.shape

(4807655, 27)

In [17]:
iowa.head(50)

Unnamed: 0,Date,Address,City,Zip Code,County,Category Name,Item Number,Item Description,Bottle Volume (ml),State Bottle Cost,...,Profit / Item,Profit / Invoice,Profit / ml,Profit / Invoice / Liter,Retail Price / ml,Bottle Price Category,Date_year,Date_month,Date_day,total_ranking
0,2018-09-04,804 Nile Kinnick Dr,ADEL,50003,DALLAS,VODKA,34821,Svedka Vodka,1750,13.0,...,6.5,1950.0,0.003714,3.714286,0.011143,Normal,2018,9,4,90300.25
1,2018-09-04,804 Nile Kinnick Dr,ADEL,50003,DALLAS,BOURBON WHISKY,27408,Jim Beam Honey Mini,500,5.4,...,2.7,32.4,0.0054,5.4,0.0162,Inexpensive,2018,9,4,90300.25
2,2018-09-04,804 Nile Kinnick Dr,ADEL,50003,DALLAS,RYE WHISKY,27025,Bulleit 95 Rye,750,16.0,...,8.0,96.0,0.010667,10.666667,0.032,Normal,2018,9,4,90300.25
3,2018-09-04,804 Nile Kinnick Dr,ADEL,50003,DALLAS,SCOTCH,10008,Scoresby Rare Scotch,1750,10.5,...,5.25,31.5,0.003,3.0,0.009,Normal,2018,9,4,90300.25
4,2018-09-04,409 Nile Kinnick Dr,ADEL,50003,DALLAS,FLAVORED WHISKY,64858,Fireball Cinnamon Whiskey Mini Dispenser,3000,30.0,...,15.0,60.0,0.005,5.0,0.015,Normal,2018,9,4,90300.25
5,2018-09-04,804 Nile Kinnick Dr,ADEL,50003,DALLAS,VODKA,34008,Absolut Swedish Vodka 80 Prf,1750,19.99,...,10.0,60.0,0.005714,5.714286,0.017137,Normal,2018,9,4,90300.25
6,2018-09-04,804 Nile Kinnick Dr,ADEL,50003,DALLAS,VODKA,36306,Hawkeye Vodka,750,3.34,...,1.67,20.04,0.002227,2.226667,0.00668,Inexpensive,2018,9,4,90300.25
7,2018-09-04,409 Nile Kinnick Dr,ADEL,50003,DALLAS,FLAVORED WHISKY,64864,Fireball Cinnamon Whisky,375,5.33,...,2.67,32.04,0.00712,7.12,0.021333,Inexpensive,2018,9,4,90300.25
8,2018-09-04,804 Nile Kinnick Dr,ADEL,50003,DALLAS,RUM,43336,Captain Morgan Original Spiced,750,9.06,...,4.53,217.44,0.00604,6.04,0.01812,Inexpensive,2018,9,4,90300.25
9,2018-09-04,804 Nile Kinnick Dr,ADEL,50003,DALLAS,FLAVORED VODKA,41704,Uv Red (cherry) Vodka,750,6.5,...,3.25,39.0,0.004333,4.333333,0.013,Inexpensive,2018,9,4,90300.25


### Pre Processing Part 4: Create a function to support liquor buying

the function 'des_moines_liquor_search()' takes a desired liquor type input and returns the top 10 ranked stores

In [18]:
## define a function to help liquor buyers find alcohol

def des_moines_liquor_search():
    
    # ascii art from 'https://patorjk.com/software/taag/#p=display&h=3&f=Cyberlarge&t=Liquor%20Finder'

    print("""
                                Welcome to the Des Moines Area liquor finder.
#############################################################################################################
            _____  _____  _     _  _____   ______      _______ _____ __   _ ______  _______  ______
     |        |   |   __| |     | |     | |_____/      |______   |   | \  | |     \ |______ |_____/
     |_____ __|__ |____\| |_____| |_____| |    \_      |       __|__ |  \_| |_____/ |______ |    \_

#############################################################################################################

            Please enter the style of alcohol you are looking for from this list.""",
          """
          
          """,
iowa['Category Name'].unique())
    
    # user input for desired liquor type
    gold = input('Liquor type: ').upper()

    # create a df of only the liquor entries
    liquor = iowa[iowa['Category Name'] == gold]

    # take the mezcal entries, group them by store name and order them by highest variety of products
    liquor_bottles = pd.DataFrame(liquor.groupby('Store Name')['Bottles Sold'].sum().sort_values(ascending=False))

    # take the mezcal entries, group them by store name and order them by highest variety of products
    liquor_variety = pd.DataFrame(liquor.groupby('Store Name')['Item Description'].nunique().sort_values(ascending=False))

    # access the grouped columns
    liquor_bottles = liquor_bottles.reset_index()
    liquor_variety = liquor_variety.reset_index()

    # join the individual tables
    ranking = pd.merge(liquor_bottles, liquor_variety, on='Store Name')

    # rank them and store that in 
    ranking[gold + '_sales_rank'] = ranking['Bottles Sold'].rank(ascending = 0)
    ranking[gold + '_variety_rank'] = ranking['Item Description'].rank(ascending = 0)

    # create a ranking average, add them together divide by 2
    ranking['total_ranking'] = (ranking[gold + '_sales_rank'] * ranking[gold + '_variety_rank'])
    
    # create top 10 results
    
    # sort by the new rankings and reset the index so best result is 0
    sample = ranking.sort_values(by=['total_ranking']).reset_index()
    
    # keep only top 10 results and the store name
    sample = sample.iloc[0:10,1]
    
    # ranking number won't be meaningful to citizens so we will use index as rank because they are sorted
    #sample.index = sample.index + 1
    
    # return best stores for that liquor type
    indy = 0
    
    print("""
    
    These are the best options for """+gold,"""
    
    """)
    for x in sample:    
        #### access the address
        indy += 1
        address_bool = (iowa['Store Name'] == x)

        address = iowa[address_bool].reset_index()

        address.index = address.index + indy

        print(address[['Store Name','Address','City']].head(1),"""
            """)


### Pre Processing 5: One Hot Encoding categorical data

In [23]:
# create numerical data for categorical data

one_hot = OneHotEncoder(sparse=False)

# create a df for the one hotted columns
iowa_one_hot = pd.DataFrame(one_hot.fit_transform(iowa[['Bottle Price Category', 'City', 'County', 'Zip Code', 'Category Name']]))

iowa_one_hot.columns = one_hot.get_feature_names_out(['Bottle Price Category', 'City', 'County', 'Zip Code', 'Category Name'])

# merge the one hot columns to the rest of the df
one_hot_iowa = iowa.join(iowa_one_hot)

In [25]:
one_hot_iowa.shape

(4807655, 156)

### Pre Processing 6: Remove the mezcal rows

In [36]:
# create a dataframe for test/train/split that doesnt carry the invoices with mezcal
iowa_mezcal_free = one_hot_iowa[one_hot_iowa['Category Name'] != 'MEZCAL']

### Pre Processing 7: Test Train Split

In [40]:
# what categories do I want in each split
print(list(iowa_mezcal_free.columns))

['Date', 'Address', 'City', 'Zip Code', 'County', 'Category Name', 'Item Number', 'Item Description', 'Bottle Volume (ml)', 'State Bottle Cost', 'State Bottle Retail', 'Bottles Sold', 'Sale (Dollars)', 'Volume Sold (Liters)', 'Store Name', 'Category', 'Vendor Name', 'Profit / Item', 'Profit / Invoice', 'Profit / ml', 'Profit / Invoice / Liter', 'Retail Price / ml', 'Bottle Price Category', 'Date_year', 'Date_month', 'Date_day', 'total_ranking', 'Bottle Price Category_Expensive', 'Bottle Price Category_Inexpensive', 'Bottle Price Category_Normal', 'Bottle Price Category_Very expensive', 'City_ADEL', 'City_ALTOONA', 'City_ANKENY', 'City_BEVINGTON', 'City_BONDURANT', 'City_CARLISLE', 'City_CASEY', 'City_CLIVE', 'City_CUMMING', 'City_DALLAS CENTER', 'City_DE SOTO', 'City_DES MOINES', 'City_DEXTER', 'City_EARLHAM', 'City_GRANGER', 'City_GRIMES', 'City_GUTHRIE CENTER', 'City_INDIANOLA', 'City_JOHNSTON', 'City_MARTENSDALE', 'City_MILO', 'City_MITCHELLVILLE', 'City_NEW VIRGINIA', 'City_NORWALK

In [48]:
# drop columns
iowa_mezcal_free = iowa_mezcal_free.drop(columns=['Address', 'City', 'Zip Code', 'County', 'Category Name',
                                                 'Item Description','Store Name','Category','Vendor Name',
                                                 'Bottle Price Category', 'Date'])

In [49]:
# create X and y

# dependent variable
iowa_y = iowa_mezcal_free['total_ranking']

# independent variable
iowa_X = iowa_mezcal_free.drop(columns=['total_ranking'])

In [50]:
# split into a training and testing data sets.

X_train, X_test, y_train, y_test = train_test_split(iowa_X, iowa_y, test_size=0.2, random_state=99)

### Pre Processing 8: Standard Scaler

In [51]:
# standardize the data so they are all in the same format

# assign the standard scaler to a variable
scaler = StandardScaler()

#fit the scaler ON ONLY THE TRAINING SET
scaler.fit(X_train[['Bottle Volume (ml)', 'State Bottle Cost', 'State Bottle Retail', 
                    'Bottles Sold', 'Sale (Dollars)', 'Volume Sold (Liters)', 'Profit / Item',
                    'Profit / Invoice', 'Profit / ml', 'Profit / Invoice / Liter', 'Retail Price / ml']])

# process both the independent and dependent data through the standard scaler
# assigning them to new variables

X_train[['Bottle Volume (ml)', 'State Bottle Cost', 'State Bottle Retail', 
                    'Bottles Sold', 'Sale (Dollars)', 'Volume Sold (Liters)', 'Profit / Item',
                    'Profit / Invoice', 'Profit / ml', 'Profit / Invoice / Liter', 'Retail Price / ml']] = scaler.transform(X_train[['Bottle Volume (ml)', 'State Bottle Cost', 'State Bottle Retail', 
                    'Bottles Sold', 'Sale (Dollars)', 'Volume Sold (Liters)', 'Profit / Item',
                    'Profit / Invoice', 'Profit / ml', 'Profit / Invoice / Liter', 'Retail Price / ml']])


X_test[['Bottle Volume (ml)', 'State Bottle Cost', 'State Bottle Retail', 
                    'Bottles Sold', 'Sale (Dollars)', 'Volume Sold (Liters)', 'Profit / Item',
                    'Profit / Invoice', 'Profit / ml', 'Profit / Invoice / Liter', 'Retail Price / ml']] = scaler.transform(X_test[['Bottle Volume (ml)', 'State Bottle Cost', 'State Bottle Retail', 
                    'Bottles Sold', 'Sale (Dollars)', 'Volume Sold (Liters)', 'Profit / Item',
                    'Profit / Invoice', 'Profit / ml', 'Profit / Invoice / Liter', 'Retail Price / ml']])

In [54]:
# save progress
iowa.to_csv('iowa_pre_pro.csv',index=False)

one_hot_iowa.to_csv('iowa_one_hotted.csv',index=False)

X_test.to_csv('X_test.csv',index=False)

X_train.to_csv('X_train.csv',index=False)

y_test.to_csv('y_test.csv',index=False)

y_train.to_csv('y_train.csv',index=False)