In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import gc  # Garbage collection. We will use it a lot.

from tqdm.notebook import trange, tqdm




In [None]:
import os
# Print the full path of every file shipped with the input dataset.
input_paths = (
    os.path.join(folder, leaf)
    for folder, _, leaves in os.walk('../input/iowa-liquor-sales')
    for leaf in leaves
)
for input_path in input_paths:
    print(input_path)

In [None]:
def get_data_from_records():
    """
    This function may dowmload all records from Iowa.gov;
    but it is too slow so it times out. You can get full csv from the same site though.
    """
    # make sure to install these packages before running:
    !pip install sodapy

    from sodapy import Socrata

    # Unauthenticated client only works with public data sets. Note 'None'
    # in place of application token, and no username or password:
    client = Socrata("data.iowa.gov", None)

    # Example authenticated client (needed for non-public datasets):
    # client = Socrata(data.iowa.gov,
    #                  MyAppToken,
    #                  userame="user@example.com",
    #                  password="AFakePassword")

    # First 2000 results, returned as JSON from API / converted to Python list of
    # dictionaries by sodapy.
    results = client.get_all("m3tr-qhgy")

    # Convert to pandas DataFrame
    results_df = pd.DataFrame.from_records(results)
    
    return results_df

In [None]:
def basic_preprocessing(df):
    """
    Basic preprocessing of the original Iowa Liquor Sales dataframe:
    - normalize all object (string) columns: lowercase, trimmed, inner
      whitespace collapsed to single spaces
    - split 'Store Name' into meaningful parts:
      'Store Name', 'Store Subname' (text after the last " / ") and
      'Store SubNumber' (text after the last " #")

    The dataframe is modified in place and also returned.

    NOTE: the splitting part needs improvement:
    - some names have multiple "/", need to eyeball such strings
    - some do not have # before the number, possible solution .str.extract(pat=r'(\d+$)')
    """
    def rsplit_once(values, separator):
        """Split each string at the last `separator`; always return (head, tail)."""
        parts = values.str.rsplit(pat=separator, n=1, expand=True)
        # expand=True produces only ONE column when no value contains the
        # separator (easy to hit on a sampled chunk); force both columns so the
        # tail is simply all-missing instead of raising on assignment.
        parts = parts.reindex(columns=[0, 1])
        return parts[0], parts[1]

    # get object columns
    object_column_list = list(df.dtypes[df.dtypes == object].index)

    # lowercase, strip and collapse whitespace in every object column
    for object_column in object_column_list:
        df.loc[:, object_column] = (
            df.loc[:, object_column].str.lower().str.strip().str.split().str.join(' ')
        )
        gc.collect()

    # split Store Name into [Store Name, Store Subname, Store SubNumber];
    # the sub-number is extracted from what is left after removing the subname
    df['Store Name'], df['Store Subname'] = rsplit_once(df['Store Name'], " / ")
    df['Store Name'], df['Store SubNumber'] = rsplit_once(df['Store Name'], " #")

    return df

def get_number_to_name_dict(df, name_to_number_dict):
    """
    For every column pair (name_column, id_column) in name_to_number_dict
    creates a one-to-one mapping of id_number to the longest name string.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every name and id column listed in name_to_number_dict.
    name_to_number_dict : dict
        Maps name_column -> id_column.

    Returns
    -------
    dict
        {name_column: {id_number: longest_name}}.
    """
    def longest_name_by_id(df, number_column, name_column):
        """
        For the given (id_column, name_column) pair map every id_number to its
        longest name; equally long names are resolved by taking the
        lexicographically greatest one, so the result is deterministic.
        """
        # get all unique pairs of (id_number, name),
        # usually there are multiple names for every id;
        # missing values are replaced by the '#' placeholder
        pairs = df.loc[:, [number_column, name_column]].drop_duplicates().fillna('#')

        # Order by (length, name): first alphabetically, then a STABLE sort on
        # length, so the last row of every id group is its longest name.
        # (A plain .max() would pick the alphabetically last name instead.)
        pairs = pairs.sort_values(name_column)
        pairs = pairs.sort_values(
            name_column,
            key=lambda names: names.astype(str).str.len(),
            kind='mergesort',
        )

        return pairs.groupby(number_column)[name_column].last().to_dict()

    # for every pair of name_column, id_column
    # add dict record: {name_column: number_to_name_dict}
    return {
        name: longest_name_by_id(df, number_column, name)
        for name, number_column in name_to_number_dict.items()
    }

# Maps every descriptive name column to the id column it is looked up by
# (name_column -> id_column). Insertion order is the processing order.
name_to_number_dict = dict([
    ('Store Name', 'Store Number'),
    ('Store Subname', 'Store Number'),
    ('County', 'County Number'),
    ('Vendor Name', 'Vendor Number'),
    ('Category Name', 'Category'),
])

In [None]:
# Only these columns are read from the CSV; everything else is skipped at load time.
useful_columns = [
    # invoice
    'Invoice/Item Number',
    'Date',
    # store and location
    'Store Number',
    'Store Name',
    'Address',
    'City',
    'Zip Code',
    'County Number',
    'County',
    # product classification and vendor
    'Category',
    'Category Name',
    'Vendor Number',
    'Vendor Name',
    # item
    'Item Number',
    'Item Description',
    'Pack',
    'Bottle Volume (ml)',
    # prices and sold amounts
    'State Bottle Cost',
    'State Bottle Retail',
    'Bottles Sold',
    'Sale (Dollars)',
    'Volume Sold (Liters)',
]

# Load the raw dataset with 'Date' parsed as datetime.
# The 10% sampling is done in a later cell, not here.
FILE_NAME = '../input/iowa-liquor-sales/Iowa_Liquor_Sales.csv'
raw_df = pd.read_csv(
    FILE_NAME,
    usecols=useful_columns,
    parse_dates=['Date'],
)

In [None]:
# Display the most recent value of the 'Date' column in the loaded data.
raw_df['Date'].max()

In [None]:
# Fixed seed so that "Restart & Run All" always produces the same chunks.
SAMPLING_SEED = 42
# Number of (almost) equally sized random parts the row positions are split into.
N_PARTS = 10

samples_idx = np.array_split(
    np.random.default_rng(SAMPLING_SEED).permutation(raw_df.shape[0]),  # permuted row positions
    N_PARTS
)

# keep only the rows of the first part (about 1/N_PARTS of the data);
# the remaining parts are processed later, one at a time
raw_df = raw_df.iloc[samples_idx[0], :]
gc.collect()

In [None]:
raw_df = basic_preprocessing(raw_df)

# get a dict to map id_number to longest name
number_to_name_dict = get_number_to_name_dict(raw_df, name_to_number_dict)

# map names to longest names based on id_numbers
for name, number_column in name_to_number_dict.items():
    canonical_name = raw_df[number_column].map(number_to_name_dict[name])
    # an id without a dictionary entry (e.g. a missing id) maps to NaN;
    # keep the row's own name in that case instead of erasing it
    raw_df[name] = canonical_name.fillna(raw_df[name])

# save the processed chunk (creates/overwrites the file, header included)
raw_df.to_csv('iowa_processed.csv')

In [None]:
for idx in tqdm(samples_idx[1:]):

    # The whole file is re-read for every chunk and then reduced to the chunk's
    # rows. (A skiprows lambda was tried as an alternative but was not faster.)
    raw_df = pd.read_csv(
        FILE_NAME,
        parse_dates=['Date'],
        usecols=useful_columns
    )

    raw_df = raw_df.iloc[idx, :]
    gc.collect()  # release the full frame before preprocessing the chunk

    # same as for the first chunk: lowercase, split, map using the EXISTING dicts
    raw_df = basic_preprocessing(raw_df)
    for name, number_column in name_to_number_dict.items():
        canonical_name = raw_df[number_column].map(number_to_name_dict[name])
        # The dicts were built from the first chunk only, so ids that first
        # appear in this chunk have no entry and map to NaN; keep the row's own
        # name for them instead of silently erasing it.
        raw_df[name] = canonical_name.fillna(raw_df[name])

    # append to the file created for the first chunk (no second header row)
    raw_df.to_csv('iowa_processed.csv', mode='a', header=False)

In [None]:
# Calendar month and year of every sale (index-aligned with raw_df).
sale_dates = pd.to_datetime(raw_df['Date'])
raw_df['month'] = sale_dates.dt.month
raw_df['year'] = sale_dates.dt.year

# Mean volume and sale amount per (category, year, month).
# The two value columns are selected BEFORE aggregating: averaging every column
# would also hit the text columns, which raises on pandas >= 2.0.
df_month_category = (
    raw_df
    .groupby(['Category Name', 'year', 'month'])[['Volume Sold (Liters)', 'Sale (Dollars)']]
    .mean()
    .reset_index()
)

# One value per calendar month, January..December, keyed by month number 1..12.
iowa_temp_month_high_avg = [29.1,35.4,48.2,61.3,72.3,81.8,86.0,83.9,75.9,63.5,46.7,33.1]
iowa_temp_month_high_avg_dict = dict(zip(range(1, 13), iowa_temp_month_high_avg))
df_month_category['mnth_avg_temp'] = df_month_category['month'].map(iowa_temp_month_high_avg_dict)
df_month_category

In [None]:
# Summary statistics of the month numbers available for each year.
months_by_year = df_month_category.groupby('year')['month']
year2020 = months_by_year.describe()
year2020

In [None]:
# Yearly mean of the monthly per-category averages.
# Only the two value columns are aggregated: a blanket .mean() would also try
# to average the text column 'Category Name', which raises on pandas >= 2.0.
# Column order is ['Volume Sold (Liters)', 'Sale (Dollars)'].
df_year = df_month_category.groupby('year')[['Volume Sold (Liters)', 'Sale (Dollars)']].mean()
df_year

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

plt.rc('font', size=12)
fig, ax = plt.subplots(figsize=(10, 6))
# 'Sale (Dollars)' contains a space and parentheses, so it cannot be reached
# with attribute access; it must be selected with brackets.
ax.plot(df_month_category['year'], df_month_category['Sale (Dollars)'])
ax.set_xlabel('year')
ax.set_ylabel('Sale (Dollars)')
ax.set_title('Sale (Dollars) through the years')
ax.grid(True)

In [None]:
# Year whose mean sale is compared against the minimum over all years.
YEAR_TO_CHECK = 2020

minSale = df_year['Sale (Dollars)'].min()

# Look the year up by its label rather than by a hard-coded row position:
# a positional lookup silently picks another year (or fails) whenever a year
# is missing from the sample.
if YEAR_TO_CHECK not in df_year.index:
    print(f'No data for year {YEAR_TO_CHECK} in this sample.')
elif minSale == df_year.loc[YEAR_TO_CHECK, 'Sale (Dollars)']:
    print('Year 2020 has a lowest sale point for alcohol since 2012.')
else:
    print('Year 2020 does not has a lowest sale point for alcohol since 2012.')

In [None]:
# One year-by-month heatmap of the mean sale amount per category.
category_names_list = list(df_month_category['Category Name'].unique())
for category_name in category_names_list:
    category_mask = df_month_category['Category Name'] == category_name
    # pivot() arguments must be passed by keyword on pandas >= 2.0
    pivot_data = df_month_category[category_mask].pivot(
        index='year', columns='month', values='Sale (Dollars)'
    )
    fig, ax = plt.subplots()
    # one inch of height per year row
    fig.set_size_inches(12, pivot_data.shape[0])
    # draw explicitly on the axes created for this category
    sns.heatmap(pivot_data, ax=ax).set_title(
        category_name.upper(), pad=20, fontdict={'fontsize': 20, 'fontweight': 'medium'}
    )