In [1]:
#import dependencies
import pandas as pd
from sqlalchemy import create_engine
from fuzzywuzzy import fuzz, process
import numpy as np
import sqlalchemy
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy import create_engine, func, inspect, distinct, MetaData, Table



In [2]:
#read raw data
df=pd.read_csv('last_quarter_2018.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,invoice/item number,date,store number,store name,address,city,zip code,store location,county number,...,item number,item description,pack,bottle volume (ml),state bottle cost,state bottle retail,bottles sold,sale (dollars),volume sold (liters),volume sold (gallons)
0,1957252,INV-14868800024,2018-10-05,4102,Fareway Stores #386 / Ames,619 Burnett Ave,Ames,50010.0,"619 Burnett Ave\nAmes 50010\n(42.027426, -93.6...",85.0,...,19066,Jim Beam,12,750,10.5,15.75,6,94.44,4.5,1.18
1,1957642,INV-14868800025,2018-10-05,4102,Fareway Stores #386 / Ames,619 Burnett Ave,Ames,50010.0,"619 Burnett Ave\nAmes 50010\n(42.027426, -93.6...",85.0,...,26826,Jack Daniels Old #7 Black Lbl,12,750,15.57,23.36,3,70.08,2.25,0.59
2,1958233,INV-14868800026,2018-10-05,4102,Fareway Stores #386 / Ames,619 Burnett Ave,Ames,50010.0,"619 Burnett Ave\nAmes 50010\n(42.027426, -93.6...",85.0,...,86670,Jack Daniels Tennessee Honey,12,750,15.57,23.36,2,46.72,1.5,0.39
3,1960866,INV-14868800027,2018-10-05,4102,Fareway Stores #386 / Ames,619 Burnett Ave,Ames,50010.0,"619 Burnett Ave\nAmes 50010\n(42.027426, -93.6...",85.0,...,37996,Smirnoff 80prf,12,750,8.25,12.38,6,74.28,4.5,1.18
4,1961241,INV-14868800028,2018-10-05,4102,Fareway Stores #386 / Ames,619 Burnett Ave,Ames,50010.0,"619 Burnett Ave\nAmes 50010\n(42.027426, -93.6...",85.0,...,17206,Cedar Ridge Bourbon,6,750,18.1,27.15,3,81.45,2.25,0.59


In [3]:
#get shape of database
df.shape

(820406, 25)

In [4]:
#drop old index
df=df.drop(['Unnamed: 0'], axis=1)

In [5]:
#change column headers to replace spaces with '_'
columns=df.columns
new_columns=[]
for column in columns:
    new_columns.append(column.replace(" ", "_"))

In [6]:
#rename column headers with column headers with '_'
df.columns=new_columns
df.head()

Unnamed: 0,invoice/item_number,date,store_number,store_name,address,city,zip_code,store_location,county_number,county,...,item_number,item_description,pack,bottle_volume_(ml),state_bottle_cost,state_bottle_retail,bottles_sold,sale_(dollars),volume_sold_(liters),volume_sold_(gallons)
0,INV-14868800024,2018-10-05,4102,Fareway Stores #386 / Ames,619 Burnett Ave,Ames,50010.0,"619 Burnett Ave\nAmes 50010\n(42.027426, -93.6...",85.0,STORY,...,19066,Jim Beam,12,750,10.5,15.75,6,94.44,4.5,1.18
1,INV-14868800025,2018-10-05,4102,Fareway Stores #386 / Ames,619 Burnett Ave,Ames,50010.0,"619 Burnett Ave\nAmes 50010\n(42.027426, -93.6...",85.0,STORY,...,26826,Jack Daniels Old #7 Black Lbl,12,750,15.57,23.36,3,70.08,2.25,0.59
2,INV-14868800026,2018-10-05,4102,Fareway Stores #386 / Ames,619 Burnett Ave,Ames,50010.0,"619 Burnett Ave\nAmes 50010\n(42.027426, -93.6...",85.0,STORY,...,86670,Jack Daniels Tennessee Honey,12,750,15.57,23.36,2,46.72,1.5,0.39
3,INV-14868800027,2018-10-05,4102,Fareway Stores #386 / Ames,619 Burnett Ave,Ames,50010.0,"619 Burnett Ave\nAmes 50010\n(42.027426, -93.6...",85.0,STORY,...,37996,Smirnoff 80prf,12,750,8.25,12.38,6,74.28,4.5,1.18
4,INV-14868800028,2018-10-05,4102,Fareway Stores #386 / Ames,619 Burnett Ave,Ames,50010.0,"619 Burnett Ave\nAmes 50010\n(42.027426, -93.6...",85.0,STORY,...,17206,Cedar Ridge Bourbon,6,750,18.1,27.15,3,81.45,2.25,0.59


In [7]:
# rename columns with units in brackets
df=df.rename(columns={'bottle_volume_(ml)':'bottle_volume_ml',
                   'sale_(dollars)': 'sale_dollars',
                   'volume_sold_(liters)': 'volume_sold_liters',
                   'volume_sold_(gallons)': 'volume_sold_gallons'})

In [8]:
#import data on Iowa counties to get FIPS county id number (used for geojson)
df_counties=pd.read_csv('Iowa_county_FIPS_numbers.csv')
df_counties=df_counties.rename(columns={'FIPS Code': 'FIPS_county_code',
                            'County Name': 'county_name'})
df_counties.head()

Unnamed: 0,FIPS_county_code,county_name
0,19001,Adair County
1,19003,Adams County
2,19005,Allamakee County
3,19007,Appanoose County
4,19009,Audubon County


In [9]:
#remove 'County' from county names
new=df_counties['county_name'].str.split(' County', n=1, expand=True)
df_counties['county_name']=new[0]
df_counties.head()

Unnamed: 0,FIPS_county_code,county_name
0,19001,Adair
1,19003,Adams
2,19005,Allamakee
3,19007,Appanoose
4,19009,Audubon


In [10]:
#use string matching to match county names between data table and county table with cut off score of 80
temp_list=[]
county_names=df_counties['county_name'].tolist()
for county in df['county'].unique().tolist():
    try:
        result, score = process.extractOne(county, county_names, scorer=fuzz.token_set_ratio, score_cutoff=80)
        temp_list.append({'data_county': county, 'county_name': result, 'Score':score})
    except:
        pass
county_fix = pd.DataFrame(temp_list)
county_fix

# create dictionary for county corrections
county_corrections=county_fix.set_index('data_county').to_dict()['county_name']
county_corrections

#replace county in data table with correct county names
for k,v in county_corrections.items():
    df['county'].loc[df["county"]==k] = v

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [11]:
#merge raw data and county information
df_liquor_sales=pd.merge(df, df_counties, left_on='county', right_on='county_name', how='left')

In [12]:
#drop old county name from raw data
df_liquor_sales=df_liquor_sales.drop(['county'], axis=1)

In [13]:
#view dataframe
df_liquor_sales.head()

Unnamed: 0,invoice/item_number,date,store_number,store_name,address,city,zip_code,store_location,county_number,category,...,pack,bottle_volume_ml,state_bottle_cost,state_bottle_retail,bottles_sold,sale_dollars,volume_sold_liters,volume_sold_gallons,FIPS_county_code,county_name
0,INV-14868800024,2018-10-05,4102,Fareway Stores #386 / Ames,619 Burnett Ave,Ames,50010.0,"619 Burnett Ave\nAmes 50010\n(42.027426, -93.6...",85.0,1011200.0,...,12,750,10.5,15.75,6,94.44,4.5,1.18,19169.0,Story
1,INV-14868800025,2018-10-05,4102,Fareway Stores #386 / Ames,619 Burnett Ave,Ames,50010.0,"619 Burnett Ave\nAmes 50010\n(42.027426, -93.6...",85.0,1011400.0,...,12,750,15.57,23.36,3,70.08,2.25,0.59,19169.0,Story
2,INV-14868800026,2018-10-05,4102,Fareway Stores #386 / Ames,619 Burnett Ave,Ames,50010.0,"619 Burnett Ave\nAmes 50010\n(42.027426, -93.6...",85.0,1011400.0,...,12,750,15.57,23.36,2,46.72,1.5,0.39,19169.0,Story
3,INV-14868800027,2018-10-05,4102,Fareway Stores #386 / Ames,619 Burnett Ave,Ames,50010.0,"619 Burnett Ave\nAmes 50010\n(42.027426, -93.6...",85.0,1031100.0,...,12,750,8.25,12.38,6,74.28,4.5,1.18,19169.0,Story
4,INV-14868800028,2018-10-05,4102,Fareway Stores #386 / Ames,619 Burnett Ave,Ames,50010.0,"619 Burnett Ave\nAmes 50010\n(42.027426, -93.6...",85.0,1011200.0,...,6,750,18.1,27.15,3,81.45,2.25,0.59,19169.0,Story


In [14]:
#add month column
months=[]
for i in df_liquor_sales['date'].tolist():
    if (i.split('-')[1])=='10':
        months.append('Oct 2018')
    elif (i.split('-')[1])=='11':
        months.append('Nov 2018')
    elif (i.split('-')[1])=='12':
        months.append('Dec 2018')
    elif (i.split('-')[1])=='01':
        months.append('Jan 2019')
df_liquor_sales['month']=months

In [15]:
#add primary key
df_liquor_sales['id']=df_liquor_sales.index+1

In [16]:
#load raw data into dev database
engine_mysql = create_engine('mysql://root:root@localhost/iowa_alcohol?charset=utf8')
df_liquor_sales.to_sql('last_quarter', con=engine_mysql, if_exists='replace', index=False)
engine_mysql.execute('ALTER TABLE `last_quarter` ADD PRIMARY KEY (`id`);')

<sqlalchemy.engine.result.ResultProxy at 0x1ec80e382b0>

In [24]:
#transform 'last_quarter' table from dev environment for loading into production database
#create session with mysql database
session_mysql = Session(engine_mysql)
inspector_mysql = inspect(engine_mysql)

# reflect 'last quarter' database into a new model
Base=automap_base()
# reflect the tables
Base.prepare(engine_mysql, reflect=True)

In [18]:
Base.classes.keys()

['last_quarter']

In [19]:
#populate it in a variable
last_quarter=Base.classes.last_quarter

In [25]:
#view column headers
inspector_mysql=inspect(engine_mysql)
columns=inspector_mysql.get_columns('last_quarter')
for column in columns:
    print(column['name'], column['type'])

invoice/item_number TEXT
date TEXT
store_number BIGINT(20)
store_name TEXT
address TEXT
city TEXT
zip_code DOUBLE
store_location TEXT
county_number DOUBLE
category DOUBLE
category_name TEXT
vendor_number DOUBLE
vendor_name TEXT
item_number BIGINT(20)
item_description TEXT
pack BIGINT(20)
bottle_volume_ml BIGINT(20)
state_bottle_cost DOUBLE
state_bottle_retail DOUBLE
bottles_sold BIGINT(20)
sale_dollars DOUBLE
volume_sold_liters DOUBLE
volume_sold_gallons DOUBLE
FIPS_county_code DOUBLE
county_name TEXT
month TEXT
id BIGINT(20)


In [26]:
sel=[last_quarter.category_name,
     func.sum(last_quarter.bottles_sold)]

#query to get the liquor categories that have the least amount of bottles sold
results=session_mysql.query(*sel).\
        group_by(last_quarter.category_name).\
        order_by(func.sum(last_quarter.bottles_sold)).limit(33).all()

In [27]:
#put the liquor categories that sold the least bottles in a list
categories_to_remove=[]
for r in results:
    categories_to_remove.append(r[0])
    
categories_to_remove

['Imported Distilled Spirits Specialty',
 'Cocktails / RTD',
 'American Vodka',
 'Imported Cordials & Liqueur',
 'American Cordials & Liqueurs',
 'American Distilled Spirits Specialty',
 'American Sloe Gins',
 'Mezcal',
 'Iowa Distillery Whiskies',
 'Bottled in Bond Bourbon',
 'Flavored Gin',
 'Corn Whiskies',
 'Imported Vodka',
 'Neutral Grain Spirits Flavored',
 'Single Barrel Bourbon Whiskies',
 None,
 'American Distilled Spirit Specialty',
 'Aged Dark Rum',
 'Neutral Grain Spirits',
 'Special Order Items',
 'Gold Rum',
 'Single Malt Scotch',
 'Straight Rye Whiskies',
 'Coffee Liqueurs',
 'Imported Distilled Spirit Specialty',
 'Triple Sec',
 'Imported Dry Gins',
 'Scotch Whiskies',
 'American Cordials & Liqueur',
 'Imported Schnapps',
 'Irish Whiskies',
 'Cocktails /RTD',
 'Imported Flavored Vodka']

In [28]:
#write a query to sum bottles sold, sale, and volume sold grouped by month, county, and liquor category
sel2 = [last_quarter.month,
       last_quarter.category_name,
       func.sum(last_quarter.bottles_sold),
       func.sum(last_quarter.sale_dollars),
       func.sum(last_quarter.volume_sold_liters),
       last_quarter.FIPS_county_code,
       last_quarter.county_name]

results2 = session_mysql.query(*sel2).\
          group_by(last_quarter.month, last_quarter.county_name, last_quarter.category_name).all()

print(results2)

[('Oct 2018', 'Straight Bourbon Whiskies', Decimal('4143'), Decimal('74656.3800000000'), Decimal('3518.0600000000'), Decimal('19169.0000000000'), 'Story'), ('Oct 2018', 'Tennessee Whiskies', Decimal('2167'), Decimal('57302.7800000000'), Decimal('1986.5900000000'), Decimal('19169.0000000000'), 'Story'), ('Oct 2018', 'American Vodkas', Decimal('15381'), Decimal('144227.8999999998'), Decimal('15567.5500000000'), Decimal('19169.0000000000'), 'Story'), ('Oct 2018', 'Imported Flavored Vodka', Decimal('926'), Decimal('17845.0800000000'), Decimal('775.8400000000'), Decimal('19169.0000000000'), 'Story'), ('Oct 2018', 'Imported Vodkas', Decimal('1613'), Decimal('30462.6000000000'), Decimal('1617.8900000000'), Decimal('19169.0000000000'), 'Story'), ('Oct 2018', 'Straight Rye Whiskies', Decimal('406'), Decimal('10028.4000000000'), Decimal('303.8700000000'), Decimal('19169.0000000000'), 'Story'), ('Oct 2018', 'Whiskey Liqueur', Decimal('4372'), Decimal('54637.0400000000'), Decimal('3317.6600000000'

In [29]:
#put results from query into lists
month=[]
category=[]
total_bottle_sold=[]
total_sale=[]
total_volume_l=[]
county_code=[]
county_name=[]


for r in results2:
    month.append(r[0])
    category.append(r[1])
    total_bottle_sold.append(r[2])
    total_sale.append(r[3])
    total_volume_l.append(r[4])
    county_code.append(r[5])
    county_name.append(r[6])

In [30]:
#put results in dataframe
df = pd.DataFrame(data={'month': month,
                        'category': category,
                        'total_bottle_sold': total_bottle_sold,
                        'total_sale': total_sale,
                        'total_volume_l': total_volume_l,
                        'county_code': county_code,
                        'county_name': county_name})

In [31]:
#remove liquor categories that had the fewest bottles sold
df=df[~df['category'].isin(categories_to_remove)]

In [32]:
#ensure that dataframe is less than 10,000 rows
df.shape

(7759, 7)

In [33]:
#ensure that only top 20 liquor categories are present in the dataframe
len(df['category'].unique())

20

In [34]:
#create an id
df['id']=df.index+1

In [35]:
df.head()

Unnamed: 0,month,category,total_bottle_sold,total_sale,total_volume_l,county_code,county_name,id
0,Oct 2018,Straight Bourbon Whiskies,4143,74656.38,3518.06,19169.0,Story,1
1,Oct 2018,Tennessee Whiskies,2167,57302.78,1986.59,19169.0,Story,2
2,Oct 2018,American Vodkas,15381,144227.8999999998,15567.55,19169.0,Story,3
4,Oct 2018,Imported Vodkas,1613,30462.6,1617.89,19169.0,Story,5
6,Oct 2018,Whiskey Liqueur,4372,54637.04,3317.66,19169.0,Story,7


In [36]:
#load into prod environment (local computer)
engine_postgres = create_engine('postgresql://postgres:root@localhost:5432/IowaAlcohol')
df.to_sql('summary', con=engine_postgres, if_exists='replace', index=False)
engine_postgres.execute('ALTER TABLE summary ADD PRIMARY KEY (id);')

<sqlalchemy.engine.result.ResultProxy at 0x1ec810782b0>

In [37]:
#load into prod environment (heroku postgres)
engine_heroku=create_engine('postgres://imguojbpzabmit:48f1b95385566f8b4e57d227ed344a2644ae16ef78f8a72a4b96e9fddcb26ab7@ec2-54-243-128-95.compute-1.amazonaws.com:5432/d73fss4tvu5upc')
df.to_sql('summary', con=engine_heroku, if_exists='replace', index=False)
engine_heroku.execute('ALTER TABLE summary ADD PRIMARY KEY (id);')

<sqlalchemy.engine.result.ResultProxy at 0x1ec811d2898>