In [1]:
# load packages

import pandas as pd
import numpy as np

In [2]:
# CSV file is from "https://data.iowa.gov/Sales-Distribution/Iowa-Liquor-Sales/m3tr-qhgy"

# create variable to store dataset

#### sample of full dataset for github
iowaLiquorSales = pd.read_csv('iowa_sample.csv')

In [3]:
# take a look at the data

iowaLiquorSales.head()

Unnamed: 0,Date,Store Number,Store Name,Address,City,Zip Code,Store Location,County Number,County,Category,...,Vendor Name,Item Number,Item Description,Bottle Volume (ml),State Bottle Cost,State Bottle Retail,Bottles Sold,Sale (Dollars),Volume Sold (Liters),Type
0,2017-04-05,3390,okoboji avenue liquor,1610 OKOBOJI AVENUE,MILFORD,51351,POINT (-95.149955 43.331525),30,DICKINSON,1032200,...,DIAGEO AMERICAS,64716,Ciroc Peach,750,16.49,24.74,12,332.88,9.0,VODKA
1,2016-10-19,5222,leo1 / cedar rapids,1500 1ST AVE NE,CEDAR RAPIDS,52402,POINT (-91.652082 41.988229),57,LINN,1031200,...,EANDJ GALLO WINERY,40597,New Amsterdam Red Berry,200,2.49,3.74,24,80.64,4.8,VODKA
2,2017-04-05,2606,hy-vee wine and spirits / humboldt,1011 13TH ST NORTH,HUMBOLDT,50548,POINT (-94.227116 42.733221),46,HUMBOLDT,1031100,...,PHILLIPS BEVERAGE,37348,Phillips Vodka,1750,7.6,11.4,6,68.4,10.5,VODKA
3,2017-04-05,3901,target store t-0878 / fort dodge,2910 1 AVE S,FORT DODGE,50501,POINT (-94.157366 42.505335),94,WEBSTER,1011600,...,INFINIUM SPIRITS,27102,Templeton 4YR Rye,750,18.09,27.14,6,162.84,4.5,RYE
4,2016-10-17,3917,smokin' joe's #2 tobacco and liquor outlet,1606 W LOCUST ST,DAVENPORT,52804,POINT (-90.599037 41.538254),82,SCOTT,1012100,...,DIAGEO AMERICAS,10789,Crown Royal Vanilla,200,4.72,7.08,2,14.16,0.4,WHISKY


In [4]:
# and it's size

iowaLiquorSales.shape

(1000, 22)

In [5]:
# checking for missing
iowaLiquorSales.isna().sum()

Date                    0
Store Number            0
Store Name              0
Address                 0
City                    0
Zip Code                0
Store Location          0
County Number           0
County                  0
Category                0
Category Name           0
Vendor Number           0
Vendor Name             0
Item Number             0
Item Description        0
Bottle Volume (ml)      0
State Bottle Cost       0
State Bottle Retail     0
Bottles Sold            0
Sale (Dollars)          0
Volume Sold (Liters)    0
Type                    0
dtype: int64

##### Are the NaN values specific to a certain location? Would dropping them under represent an area
##### zip code is a consistent location tracker
##### iowa has 1055 zip codes


In [6]:
# ZIP CODE
iowaLiquorSales['Zip Code'].nunique()

216

In [7]:
# zip code type
iowaLiquorSales['Zip Code'].dtypes

dtype('int64')

In [8]:
# change zipcode to a number

# this one is an error but let me know I had a non-numerical entry
#iowaLiquorSales['Zip Code'] = iowaLiquorSales['Zip Code'].astype('float64')

In [9]:
# how many have that random entry
iowaLiquorSales[iowaLiquorSales['Zip Code'] == '712-2']

Unnamed: 0,Date,Store Number,Store Name,Address,City,Zip Code,Store Location,County Number,County,Category,...,Vendor Name,Item Number,Item Description,Bottle Volume (ml),State Bottle Cost,State Bottle Retail,Bottles Sold,Sale (Dollars),Volume Sold (Liters),Type


###### 7940 entries, and at a quick glance it is the same location. 

In [10]:
# Let's see if it is only the cityb of Dunlap

# create a dummy dataframe of only the odd zip code entries
df = iowaLiquorSales[iowaLiquorSales['Zip Code'] == '712-2']

# group the dummy dataframe by city, and I chose sum to print the 'df'
df.groupby(by='City').sum()

Unnamed: 0_level_0,Store Number,Zip Code,County Number,Category,Vendor Number,Item Number,Bottle Volume (ml),State Bottle Cost,State Bottle Retail,Bottles Sold,Sale (Dollars),Volume Sold (Liters)
City,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1


###### all the odd zip code entries are from the city of dunlap
Dunlap which has a actual zip code of 51529 and turns out their area code is 712
###### change all 712 zip codes to 51529

In [11]:
# replace all of those entries to the correct zip code
iowaLiquorSales['Zip Code'] = iowaLiquorSales['Zip Code'].replace({'712-2': 51529})

In [12]:
# examine different zip codes
iowaLiquorSales['Zip Code'].unique()

array([51351, 52402, 50548, 50501, 52804, 51012, 50316, 50273, 50266,
       50317, 50702, 52761, 51331, 50212, 50112, 52314, 52807, 50311,
       50125, 50060, 52753, 50010, 52627, 52241, 51040, 51105, 51503,
       52001, 52802, 50613, 50265, 51566, 50401, 50248, 52556, 50310,
       50208, 52002, 52405, 50158, 51108, 52327, 50322, 50701, 51501,
       52205, 50574, 50588, 50021, 50023, 50543, 51054, 52240, 51401,
       52317, 50601, 52806, 50616, 52501, 50461, 51104, 51055, 50320,
       50263, 52175, 51346, 50314, 50659, 50131, 52040, 51529, 50022,
       51632, 52411, 52333, 51106, 50703, 52732, 52577, 52246, 52722,
       50595, 50309, 50321, 50315, 50312, 50585, 51250, 50511, 50801,
       50517, 51248, 52544, 50583, 50707, 50313, 51537, 51360, 52404,
       50006, 50009, 51103, 50219, 50682, 50622, 50677, 50047, 52003,
       52353, 50014, 51601, 50111, 50138, 52342, 50142, 52655, 52233,
       50213, 52403, 50436, 51442, 52632, 51546, 50628, 50036, 51534,
       50651, 52733,

##### some float, some are int, some are str and one nan

In [13]:
# remove that one null entry by saving notna values
iowaLiquorSales = iowaLiquorSales[iowaLiquorSales['Zip Code'].notna()]

In [14]:
# converting all zip codes to numeric
iowaLiquorSales['Zip Code'] = iowaLiquorSales['Zip Code'].astype('int64')

In [15]:
# how are data types for the whole dataframe
iowaLiquorSales.dtypes

Date                     object
Store Number              int64
Store Name               object
Address                  object
City                     object
Zip Code                  int64
Store Location           object
County Number             int64
County                   object
Category                  int64
Category Name            object
Vendor Number           float64
Vendor Name              object
Item Number               int64
Item Description         object
Bottle Volume (ml)        int64
State Bottle Cost       float64
State Bottle Retail     float64
Bottles Sold              int64
Sale (Dollars)          float64
Volume Sold (Liters)    float64
Type                     object
dtype: object

In [16]:
# now that zip code has been cleaned
# we can groupby zipcode to see if the missing gps locations are in the same place

df = iowaLiquorSales[['Store Location', 'Zip Code']]

In [17]:
# get rid of the null values in dummy dataframe
df['Store Location'] = df['Store Location'].fillna(0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Store Location'] = df['Store Location'].fillna(0)


In [18]:
# focus on just the missing locations
dff = df[df['Store Location'] == 0]

In [19]:
# how many missing values are there per zip code
dff.value_counts()


Series([], dtype: int64)

In [20]:
# missing store location values are spread out enough to dropna()

In [21]:
# drop null values
iowa_liq_sales = iowaLiquorSales.dropna()

In [22]:
# drop unneeded columns
iowa_liq_sales = iowa_liq_sales.drop(columns=['Invoice/Item Number','Pack','Volume Sold (Gallons)'])

KeyError: "['Invoice/Item Number' 'Pack' 'Volume Sold (Gallons)'] not found in axis"

In [None]:
# current state
iowa_liq_sales.head()

In [None]:
# check column data types
iowa_liq_sales.dtypes

In [None]:
# CLEAN UP DATA ENTRIES

#DATE COLUMN

iowa_liq_sales['Date'].describe(datetime_is_numeric=True)

In [None]:
# 'Date' column is also an object type. let's change that to date time with pandas.to_datetime()
iowa_liq_sales['Date'] = pd.to_datetime(iowa_liq_sales['Date'])

In [None]:
# STORE NUMBER COLUMN

#Store Number column
iowa_liq_sales['Store Number'].head()

In [None]:
# how many different store numbers are there?
iowa_liq_sales['Store Number'].nunique()

In [None]:
# STORE NAME COLUMN

iowa_liq_sales['Store Name'].describe()

In [None]:
# there's 2830 unique store names
# examine a sample
iowa_liq_sales['Store Name'].head(25)

In [None]:
# There are more Store names than numbers, let's see if we can clean this up

In [None]:
# clean up some
iowa_liq_sales['Store Name'] = iowa_liq_sales['Store Name'].str.lower()

In [None]:
iowa_liq_sales['Store Name'] = iowa_liq_sales['Store Name'].str.replace('&','and')

In [None]:
iowa_liq_sales['Store Name'] = iowa_liq_sales['Store Name'].str.replace('# ','#')

In [None]:
iowa_liq_sales['Store Name'] = iowa_liq_sales['Store Name'].str.replace('  ',' ')

In [None]:
iowa_liq_sales['Store Name'].nunique()

In [None]:
# from 2830 to 2622  want 2526

In [None]:
# ADDRESS

iowa_liq_sales['Address'].describe()

In [None]:
# take a look at address's
iowa_liq_sales['Address'].head(25)

In [None]:
# clean up addresses to limit duplicates
iowa_liq_sales['Address'] = iowa_liq_sales['Address'].str.upper()

In [None]:
iowa_liq_sales['Address'] = iowa_liq_sales['Address'].str.replace(',','')

In [None]:
iowa_liq_sales['Address'] = iowa_liq_sales['Address'].str.replace('.','')

In [None]:
iowa_liq_sales['Address'] = iowa_liq_sales['Address'].str.replace('  ',' ')

In [None]:
# look at the new look
iowa_liq_sales['Address'].head(25)

In [None]:
iowa_liq_sales['Address'].nunique()

In [None]:
# CITY
iowa_liq_sales["City"].describe()

In [None]:
# check for any incorrect entries
iowa_liq_sales['City'].unique()

###### some are all caps let's standardize that

In [None]:
# convert all cities to upper case
iowa_liq_sales['City'] = iowa_liq_sales['City'].str.upper()

In [None]:
# compare to earlier unique values
iowa_liq_sales['City'].nunique()

In [None]:
# further exploration distinct combinations

(~iowa_liq_sales.duplicated(['Store Number','Store Name', 'Zip Code'])).sum(), (~iowa_liq_sales.duplicated(['Store Number','Store Name', 'Address'])).sum()

In [None]:
# STORE LOCATION

#I have not used gps data before but these seem to be in the correct format.

In [None]:
# COUNTY NUMBER

iowa_liq_sales['County Number'] = iowa_liq_sales['County Number'].astype('int64')

In [None]:
# a quick google search of: number of counties in iowa, 99 is correct

iowa_liq_sales['County Number'].nunique()

In [None]:
# COUNTY

# we know it is supposed to be 99
iowa_liq_sales['County'].nunique()

In [None]:
# examine
iowa_liq_sales['County'].head(25)

In [None]:
# make all county names capital
iowa_liq_sales['County'] = iowa_liq_sales['County'].str.upper()

In [None]:
# did we catch 'em all?
iowa_liq_sales['County'].nunique()

In [None]:
iowa_liq_sales['County'].unique()

#### okay we have 4 too many, a read through: buena vist probably is buena vista, cerro gord is cerro gordo, o'brien and obrien, and pottawatta is pottawattamie

In [None]:
# swap back and forth
iowa_liq_sales['County'] = iowa_liq_sales['County'].str.replace('OBRIEN',"O'BRIEN")
iowa_liq_sales['County'] = iowa_liq_sales['County'].str.replace('VISTA','VIST')
iowa_liq_sales['County'] = iowa_liq_sales['County'].str.replace('CERRO GORDO','CERRO GORD')
iowa_liq_sales['County'] = iowa_liq_sales['County'].str.replace('POTTAWATTAMIE','POTTAWATTA')

iowa_liq_sales['County'] = iowa_liq_sales['County'].str.replace('VIST','VISTA')
iowa_liq_sales['County'] = iowa_liq_sales['County'].str.replace('CERRO GORD','CERRO GORDO')
iowa_liq_sales['County'] = iowa_liq_sales['County'].str.replace('POTTAWATTA','POTTAWATTAMIE')

In [None]:
# did we catch 'em all?
iowa_liq_sales['County'].unique()

In [None]:
iowa_liq_sales['County'].nunique()

In [None]:
# CATEGORY
iowa_liq_sales['Category'].nunique()

In [None]:
# take a look
iowa_liq_sales['Category'].unique()

In [None]:
iowa_liq_sales['Category'].dtype

In [None]:
# let's remove the trailing period
iowa_liq_sales['Category'] = iowa_liq_sales['Category'].astype('int64')

In [None]:
# CATEGORY NAME
iowa_liq_sales['Category Name'].nunique()

###### this is greater than the amount of category numbers

In [None]:
# so take a look
iowa_liq_sales['Category Name'].unique()

In [None]:
# try standardizing letter case
iowa_liq_sales['Category Name'] = iowa_liq_sales['Category Name'].str.upper()

In [None]:
# we want 109
iowa_liq_sales['Category Name'].nunique()

In [None]:
iowa_liq_sales['Category Name'].unique()

In [None]:
# I see a few double spaces
iowa_liq_sales['Category Name'] = iowa_liq_sales['Category Name'].str.replace('  ', ' ')

In [None]:
# we want 109
iowa_liq_sales['Category Name'].nunique()

In [None]:
# after reading through the list it was 'COCKTAILS /RTD' and 'COCKTAILS / RTD'
# however also reading through the list these might be tax distinctions rather than specific alchohol groups
iowa_liq_sales['Category Name'] = iowa_liq_sales['Category Name'].str.replace('COCKTAILS /RTD', 'COCKTAILS / RTD')

In [None]:
# we want 109
iowa_liq_sales['Category Name'].nunique()

In [None]:
# checking to see if these combinations line up
iowa_liq_sales.groupby('Category')['Category Name'].unique().head(25)

In [None]:
# create a function to create less specific column for liquor type

def type(x):
    #search through values to apply a new label
    if "IOWA" in x:
        return "IOWA LOCAL"
    elif "TEMPORARY" in x:
        return "SPECIAL PACKAGING"
    elif "HOLIDAY" in x:
        return "SPECIAL PACKAGING"
    elif "DELISTED" in x:
        return "DELISTED"
    elif "LIQUEUR" in x:
        return "LIQUEUR"
    elif "ROCK & RYE" in x:
        return "LIQUEUR"
    elif "AMARETTO" in x:
        return "LIQUEUR"
    elif "SCHNAPPS" in x:
        return "LIQUEUR"
    elif 'TRIPLE SEC' in x:
        return "LIQUEUR"
    elif 'SLOE' in x:
        return "LIQUEUR"
    elif "CREME" in x:
        return 
    elif "RYE" in x:
        return "RYE"
    elif "RUM" in x:
        return "RUM"
    elif "SCOTCH" in x:
        return "SCOTCH"
    elif "BOURBON" in x:
        return "BOURBON"
    elif "TEQUILA" in x:
        return "TEQUILA" 
    elif "MEZCAL" in x:
        return "MEZCAL" 
    elif "VODKA" in x:
        return "VODKA"
    elif "GIN" in x:
        return "GIN"
    elif "BRANDIES" in x:
        return "BRANDY"
    elif "WHISK" in x:
        return "WHISKY"
    elif "NEUTRAL" in x:
        return "NEUTRAL GRAIN"
    elif "SPECIAL" in x:
        return "SPECIALTY"
    elif "COCKTAIL" in x:
        return 'COCKTAILS / RTD'
    elif "HIGH PROOF BEER" in x:
        return 'HIGH PROOF BEER'
    
    else:
        return x

In [None]:
# apply that function to create a new column
iowa_liq_sales['Type'] = iowa_liq_sales['Category Name'].apply(lambda x: type(x))

In [None]:
iowa_liq_sales['Type'].unique()

In [None]:
iowa_liq_sales[iowa_liq_sales['Type'] == "AMERICAN ALCOHOL"]['Item Description'].unique()

In [None]:
# VENDOR NUMBER
iowa_liq_sales['Vendor Number'].nunique()

In [None]:
# VENDOR NAME
iowa_liq_sales['Vendor Name'].nunique()

In [None]:
#check for obvious duplicates
iowa_liq_sales['Vendor Name'].unique()

In [None]:
#standardize 
iowa_liq_sales['Vendor Name'] = iowa_liq_sales['Vendor Name'].str.upper()

In [None]:
# clean up some more
iowa_liq_sales['Vendor Name'] = iowa_liq_sales['Vendor Name'].str.replace('LLC','')

In [None]:
iowa_liq_sales['Vendor Name'] = iowa_liq_sales['Vendor Name'].str.replace('INC','')

In [None]:
iowa_liq_sales['Vendor Name'] = iowa_liq_sales['Vendor Name'].str.replace(',','')

In [None]:
iowa_liq_sales['Vendor Name'] = iowa_liq_sales['Vendor Name'].str.replace('.','')

In [None]:
iowa_liq_sales['Vendor Name'] = iowa_liq_sales['Vendor Name'].str.rstrip()

In [None]:
iowa_liq_sales['Vendor Name'] = iowa_liq_sales['Vendor Name'].str.replace('-',' ')

In [None]:
iowa_liq_sales['Vendor Name'] = iowa_liq_sales['Vendor Name'].str.replace('&',' AND ')

In [None]:
iowa_liq_sales['Vendor Name'] = iowa_liq_sales['Vendor Name'].str.replace('LTD','')

In [None]:
iowa_liq_sales['Vendor Name'] = iowa_liq_sales['Vendor Name'].str.replace('CORPORATION','CORP')

In [None]:
iowa_liq_sales['Vendor Name'] = iowa_liq_sales['Vendor Name'].str.replace('SAZERAC NORTH AMERICA','SAZERAC COMPANY')

In [None]:
iowa_liq_sales['Vendor Name'] = iowa_liq_sales['Vendor Name'].str.replace('/',' AND ')

In [None]:
iowa_liq_sales['Vendor Name'] = iowa_liq_sales['Vendor Name'].str.replace('  ','')

In [None]:
iowa_liq_sales['Vendor Name'] = iowa_liq_sales['Vendor Name'].str.replace('COMPANY','CORP')

In [None]:
iowa_liq_sales['Vendor Name'].unique()

In [None]:
iowa_liq_sales['Vendor Name'].nunique()

In [None]:
# ITEM NUMBER

# getting rid of a letter
iowa_liq_sales['Item Number'] = iowa_liq_sales['Item Number'].str.replace('x904631','904631')

In [None]:
# changing item number to numeric

# error there's a floatna type
#iowa_liq_sales['Item Number'] = iowa_liq_sales['Item Number'].astype('int64')

In [None]:
# remove that one null entry by saving notna values
iowa_liq_sales = iowa_liq_sales[iowa_liq_sales['Item Number'].notna()]

In [None]:
iowa_liq_sales['Item Number'] = iowa_liq_sales['Item Number'].astype('int64')

In [None]:
iowa_liq_sales['Item Number'].nunique()

In [None]:
# ITEM DESCRIPTION

iowa_liq_sales['Item Description'].nunique()

In [None]:
# BOTTLE VOLUME (mL)
iowa_liq_sales['Bottle Volume (ml)'].nunique()

In [None]:
iowa_liq_sales['Bottle Volume (ml)'].unique()

In [None]:
# STATE BOTTLE COST
iowa_liq_sales['State Bottle Cost'].describe()

In [None]:
# STATE BOTTLE RETAIL
iowa_liq_sales['State Bottle Retail'].describe()

In [None]:
iowa_liq_sales['State Bottle Retail'].nunique()

In [None]:
# BOTTLES SOLD
iowa_liq_sales['Bottles Sold']

In [None]:
# SALE (DOLLARS)
iowa_liq_sales['Sale (Dollars)']

In [None]:
# VOLUME SOLD (LITERS)
iowa_liq_sales['Volume Sold (Liters)'].nunique()

In [None]:
iowa_liq_sales.head()

In [None]:
# save progress as csv

#iowa_liq_sales.to_csv('iowa_clean.csv', index=False)

In [None]:
# create internet sized sample
iowa_sample = iowa_liq_sales.sample(1000)

In [None]:
# save the sample as a csv

#iowa_sample.to_csv('iowa_sample.csv', index=False)