In [1]:
import pandas as pd
import numpy as np
import sqlite3
# Show every column when a wide DataFrame is displayed (None removes the column limit).
pd.set_option('display.max_columns', None)

In [31]:
# Load the raw sales export.
# 'Zip Code' and 'Item Number' are kept as object dtype (not parsed as numbers);
# 'Date' is parsed into datetime64.
raw = pd.read_csv(
    'data/Iowa_Liquor_Sales.csv',
    parse_dates=['Date'],
    dtype={'Zip Code': object, 'Item Number': object},
)

### Store Data Frame

In [32]:
# Store df: keep only the store-describing columns of the raw sales data
store_columns = ['Store Number', 'Store Name', 'Address', 'City', 'Zip Code',
                 'Store Location', 'County Number', 'County']
store = raw.loc[:, store_columns]

# lower-case the free-text columns
for text_column in ['Store Name', 'Address', 'City', 'County']:
    store.loc[:, text_column] = store[text_column].str.lower()

In [33]:
# Swap every NaN for the literal string 'missing' so that missing values
# behave like ordinary values in the steps that follow.
store = store.replace(np.nan, 'missing')

##### Deal with varying store names

In [34]:
# For every store number, count how often each spelling of the store name occurs.
# Result columns: 'Store Number', 'Store Name', 'Name Count'.
name_counts = store.groupby(['Store Number'])['Store Name'].value_counts()
store_name = name_counts.rename('Name Count').reset_index()

In [35]:
# Rank the store names within each store number, most frequent name first.
# method='first' breaks ties by order of appearance, so exactly one name per
# store receives rank 1. With the default method ('average'), names tied for
# the top count all get a fractional rank such as 1.5, so no row satisfies the
# `Rank == 1` filter and the store ends up without a name after the merge.
store_name['Rank'] = (
    store_name.groupby('Store Number')['Name Count']
    .rank(method='first', ascending=False)
)

In [36]:
# Keep only the top-ranked (most frequent) name of each store number
top_name_rows = store_name[store_name['Rank'] == 1]
store_name_unique = top_name_rows.drop(columns=['Name Count', 'Rank'])

# Replace the per-row store names with the selected name:
# drop the old column, then left-join the chosen name on 'Store Number'
store = store.drop(columns=['Store Name']).merge(
    store_name_unique, how='left', on='Store Number'
)

#### Deal with varying locations for each store number

In [37]:
# A business could have moved over the years, so one store number can carry
# several locations. Count how often each address occurs for every combination
# of store number and the remaining location columns.
location_keys = ['Store Number', 'County', 'County Number', 'City',
                 'Zip Code', 'Store Location']
address_counts = store.groupby(location_keys)['Address'].value_counts()
store_loc = address_counts.rename('Address Count').reset_index()

In [38]:
# Rank all location rows within each store number, most frequent first.
# method='first' breaks ties by order of appearance, so exactly one location
# per store receives rank 1. With the default method ('average'), locations
# tied for the top count all get a fractional rank such as 1.5, so no row
# satisfies the `Rank == 1` filter and the store loses its location in the merge.
store_loc['Rank'] = (
    store_loc.groupby('Store Number')['Address Count']
    .rank(method='first', ascending=False)
)

In [39]:
# Keep only the top-ranked (most frequent) location of each store number
top_location_rows = store_loc[store_loc['Rank'] == 1]
store_loc_unique = top_location_rows.drop(columns=['Address Count', 'Rank'])

# Replace the per-row location columns with the selected location:
# drop the old columns, then left-join the chosen location on 'Store Number'
location_columns = ['Address', 'City', 'Zip Code', 'Store Location',
                    'County Number', 'County']
store = store.drop(columns=location_columns).merge(
    store_loc_unique, how='left', on='Store Number'
)

In [40]:
# Collapse rows that are identical in every column, keeping the first occurrence
store = store.drop_duplicates(keep='first')

## Adding Store Types

In [41]:
# Start every store with a 0.0 placeholder in 'Store Type'
store['Store Type'] = np.zeros(len(store))

#### Categorize stores with string contains 
- order matters here: the rules are applied top to bottom, and a later rule overwrites the label set by an earlier one

In [42]:
# Ordered (regex, label) rules matched against the lower-cased store name.
# The rules are applied top to bottom and a later match overwrites an earlier
# one, so the order of this list is significant.
STORE_TYPE_RULES = [
    ("food|market|super valu|saver|groc",
     'Other Grocery or Convenience'),
    ("econ-o-mart|mart|quik|pit stop|quick|kwik|general store|convenience|gas|circle k|petro|stop|casey's|country store|yesway|kum|7-eleven|station|express|fill r up|fuel|new star| go |the boonedocks",
     'Convenience Store'),
    ("hy-vee|wal-mart|fareway store|super mar|big g|sac city food pride|sam's club|supermarket|shop n save|grocery|target|dahl's|costco|whole foods|jeff's|hometown|mepo",
     'Supermarket'),
    ("liquor|spirits|tobacco|beverage|smoke|bottle|distil|wine|bootleg|northside one stop|beer|cigar|distrib|booze|brew|snuff|sauce",
     'Liquor Tobacco Store'),
    ("walgreens|cvs|drug",
     'Drug Store'),
    ("casino",
     'Casino'),
]

# The placeholder column holds floats; switch it to object dtype so the string
# labels can be written without relying on an implicit dtype change.
store['Store Type'] = store['Store Type'].astype(object)

for pattern, label in STORE_TYPE_RULES:
    # na=False: a missing store name matches no rule instead of producing a
    # NaN in the boolean mask.
    matches = store['Store Name'].str.contains(pattern, na=False)
    # Single-step .loc assignment on the DataFrame. The chained form
    # store['Store Type'].loc[mask] = ... writes to an intermediate Series,
    # raises SettingWithCopyWarning, and is not guaranteed to reach `store`.
    store.loc[matches, 'Store Type'] = label

# Anything still carrying the 0 placeholder matched no rule
store.loc[store['Store Type'] == 0, 'Store Type'] = 'Other'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


### Liquor Price Data Frame

In [22]:
# Columns needed for the price table: item, day, and the two state prices
price_columns = ['Item Number', 'Date', 'State Bottle Retail', 'State Bottle Cost']
price = raw[price_columns]

In [23]:
# One row per (item, date): average the retail price and the cost over all
# sales of that item on that day
daily_price = price.groupby(['Item Number', 'Date'])[
    ['State Bottle Retail', 'State Bottle Cost']
].mean()
price = daily_price.reset_index()

In [30]:
# Drop fully identical rows. The call parentheses are required: without them
# `price` is rebound to the bound method itself instead of a DataFrame.
price = price.drop_duplicates()

In [37]:
# Preview the first rows of the price table
price.head()

Unnamed: 0,Item Number,Date,State Bottle Retail,State Bottle Cost
0,100001,2016-10-03,12.0,8.0
1,100001,2016-10-04,12.0,8.0
2,100001,2016-10-05,12.0,8.0
3,100001,2016-10-06,12.0,8.0
4,100001,2016-10-07,12.0,8.0


### SQLite

In [38]:
# Column dtypes of the price table, for reference when declaring the SQLite columns
price.dtypes

Item Number                    object
Date                   datetime64[ns]
State Bottle Retail           float64
State Bottle Cost             float64
dtype: object

**Note for Xiaogang:** this is the table that needs a two-column (composite) key, and I am not sure how to do this.

In [61]:
# Create the product_price table in liquor.db.
# - IF NOT EXISTS makes the cell safe to re-run; to rebuild the table from
#   scratch, run `DROP TABLE IF EXISTS product_price;` first.
# - Identifiers are double-quoted (standard SQL); single quotes denote string
#   literals and are only accepted as identifiers by SQLite for compatibility.
# - The price columns are declared REAL, SQLite's floating-point type.
# - The composite primary key allows one row per (Item Number, Date).
sql_query = '''CREATE TABLE IF NOT EXISTS product_price(
                   "Item Number" varchar(50),
                   "Date" date,
                   "State Bottle Retail" REAL,
                   "State Bottle Cost" REAL,
                   PRIMARY KEY("Item Number", "Date"));'''

conn = sqlite3.connect('liquor.db')
try:
    conn.execute(sql_query)
    conn.commit()
finally:
    # Always release the connection, even when the statement fails
    conn.close()

#### Add contents to product_price table

In [14]:
# Append the rows of `price` to the existing product_price table.
# if_exists='append' keeps the table definition (and its primary key) intact;
# rows whose key is already present in the table raise sqlite3.IntegrityError.
conn = sqlite3.connect('liquor.db')
try:
    price.to_sql('product_price', conn, if_exists='append', index=False)
finally:
    # Close the connection even when the insert fails, so it is not leaked
    conn.close()

NameError: name 'price' is not defined

### Stores table in Sqlite

In [48]:
# Column dtypes of the store table, for reference when declaring the SQLite columns
store.dtypes

Store Number       int64
Store Name        object
County            object
County Number     object
City              object
Zip Code          object
Store Location    object
Address           object
Store Type        object
dtype: object

In [49]:
# Create the stores table in liquor.db.
# - "Store Number" is declared INTEGER PRIMARY KEY (the original type name was
#   misspelled as ITEGER).
# - IF NOT EXISTS makes the cell safe to re-run; to rebuild the table from
#   scratch, run `DROP TABLE IF EXISTS stores;` first.
# - Identifiers are double-quoted (standard SQL); single quotes denote string
#   literals and are only accepted as identifiers by SQLite for compatibility.
sql_query = '''CREATE TABLE IF NOT EXISTS stores(
                   "Store Number" INTEGER PRIMARY KEY,
                   "Store Name" varchar(250),
                   "County" varchar(250),
                   "County Number" varchar(250),
                   "City" varchar(250),
                   "Zip Code" varchar(250),
                   "Store Location" varchar(250),
                   "Address" varchar(250),
                   "Store Type" varchar(250));'''

conn = sqlite3.connect('liquor.db')
try:
    conn.execute(sql_query)
    conn.commit()
finally:
    # Always release the connection, even when the statement fails
    conn.close()

In [50]:
# Append the rows of `store` to the existing stores table.
# if_exists='append' keeps the table definition (and its primary key) intact;
# a "Store Number" that is duplicated in `store` or already present in the
# table raises sqlite3.IntegrityError.
conn = sqlite3.connect('liquor.db')
try:
    store.to_sql('stores', conn, if_exists='append', index=False)
finally:
    # Close the connection even when the insert fails, so it is not leaked
    conn.close()

IntegrityError: UNIQUE constraint failed: stores.Store Number