Now we can demonstrate the power of our approach by collecting the entire set of Vodka sales data from nearly 99% of the liquor sales data in Iowa over the past 12 years. This will be done in a single Jupyter cell and later output to a .txt file. This file can then easily be loaded into Tableau for further analysis.

In [1]:
import pandas as pd
import mysql.connector

In [2]:
#connect to mysql server
#NOTE: the credentials below are placeholders; avoid hard-coding real secrets in a notebook
connection = mysql.connector.connect(
    host="host",
    user="root",
    password="not_my_pass",
    database="mysql_database"
)

#range of years to gather data (2012 through 2023, the end of range() is exclusive),
#for this we will get the entire set of Vodka sales
year_range = range(2012, 2024)

#initialize an empty list to store one df per year
data_frames = []

#the try/finally blocks guarantee the cursor and connection are released
#even if a query or the df conversion fails part way through the loop
try:
    #create a cursor to execute SQL commands
    cursor = connection.cursor()
    try:
        #iterate through each year in range
        for year in year_range:

            #get name of liquor sales table in mysql database
            table_name = f"liquor_sales_{year}"

            #SQL query to select Vodka sales rows
            #NOTE: table names cannot be passed as query parameters; building the
            #name with an f-string is only acceptable because year is an int from year_range
            select_query = f"SELECT * FROM {table_name} WHERE Vodka = 1"

            #execute query
            cursor.execute(select_query)

            #gather rows that match the condition into list of tuples
            selected_rows = cursor.fetchall()

            #get column names from the cursor (first field of each description entry)
            column_names = [desc[0] for desc in cursor.description]

            #convert these rows to a df
            vodka_dataframe = pd.DataFrame(selected_rows, columns=column_names)

            #append df to the list
            data_frames.append(vodka_dataframe)
    finally:
        #close cursor
        cursor.close()
finally:
    #close connection
    connection.close()

#concatenate all dfs into a single df with a fresh 0..n-1 index
combined_vodka_df = pd.concat(data_frames, ignore_index=True)


In [3]:
#number of rows gathered across all yearly tables
len(combined_vodka_df.index)

6847248

In [4]:
#print a summary of the combined df: index range, column names and dtypes
combined_vodka_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6847248 entries, 0 to 6847247
Data columns (total 38 columns):
 #   Column               Dtype  
---  ------               -----  
 0   id                   int64  
 1   Date                 object 
 2   Store_Number         int64  
 3   Store_Name           object 
 4   Address              object 
 5   City                 object 
 6   Zip_Code             int64  
 7   Store_Location       object 
 8   County_Number        float64
 9   County               object 
 10  Category             float64
 11  Category_Name        object 
 12  Vendor_Number        float64
 13  Vendor_Name          object 
 14  Item_Number          int64  
 15  Item_Description     object 
 16  Pack                 int64  
 17  Bottle_Volume_ml     int64  
 18  State_Bottle_Cost    float64
 19  State_Bottle_Retail  float64
 20  Bottles_Sold         int64  
 21  Sale_Dollars         float64
 22  Volume_Sold_Liters   float64
 23  Volume_Sold_Gallons  float64
 24

In [5]:
#preview the first rows of the combined df
combined_vodka_df.head()

Unnamed: 0,id,Date,Store_Number,Store_Name,Address,City,Zip_Code,Store_Location,County_Number,County,...,Whisky,Rum,Liqueur,Tequila,Gin,Brandy,Schnapps,Scotch,Specialty,Special_Order
0,2,2012-06-25,2353,CRESCO LIQUOR STORE,708 2ND AVE SE,CRESCO,52136,POINT (-92.106529 43.371131),45.0,HOWARD,...,0,0,0,0,0,0,0,0,0,0
1,3,2012-02-20,3695,MRS. B'S LIQUOR,623 S MARION ST,REMSEN,51050,POINT (-95.977022 42.808951),75.0,PLYMOUTH,...,0,0,0,0,0,0,0,0,0,0
2,4,2012-01-09,4226,HARTIG DRUG #12 / WAUKON,21 W MAIN ST,WAUKON,52172,POINT (-91.476658 43.269279),3.0,ALLAMAKEE,...,0,0,0,0,0,0,0,0,0,0
3,8,2012-04-09,4430,KUM & GO #87 / FORT DODGE,1601 FIFTH AVENUE S,FORT DODGE,50501,POINT (-94.175805 42.501112),94.0,WEBSTER,...,0,0,0,0,0,0,0,0,0,0
4,12,2012-10-29,3531,OSAGE LIQUORS,508 MAIN ST,OSAGE,50461,POINT (-92.814028 43.284116),66.0,MITCHELL,...,0,0,0,0,0,0,0,0,0,0


Since we don't need the boolean columns corresponding to non-vodka data, we can simply drop them.

In [6]:
#indicator columns for the other liquor types, not needed in a vodka-only dataset
columns_to_drop = [
    'Whisky', 'Rum', 'Liqueur', 'Tequila', 'Gin',
    'Brandy', 'Schnapps', 'Scotch', 'Specialty', 'Special_Order',
]

#build a new df without those columns (combined_vodka_df itself is not modified)
final_vodka_df = combined_vodka_df.drop(columns_to_drop, axis=1)

In [7]:
#print a summary of final_vodka_df: index range, column names and dtypes
final_vodka_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6847248 entries, 0 to 6847247
Data columns (total 28 columns):
 #   Column               Dtype  
---  ------               -----  
 0   id                   int64  
 1   Date                 object 
 2   Store_Number         int64  
 3   Store_Name           object 
 4   Address              object 
 5   City                 object 
 6   Zip_Code             int64  
 7   Store_Location       object 
 8   County_Number        float64
 9   County               object 
 10  Category             float64
 11  Category_Name        object 
 12  Vendor_Number        float64
 13  Vendor_Name          object 
 14  Item_Number          int64  
 15  Item_Description     object 
 16  Pack                 int64  
 17  Bottle_Volume_ml     int64  
 18  State_Bottle_Cost    float64
 19  State_Bottle_Retail  float64
 20  Bottles_Sold         int64  
 21  Sale_Dollars         float64
 22  Volume_Sold_Liters   float64
 23  Volume_Sold_Gallons  float64
 24

In [8]:
#write the df out as tab separated text, without the row index column
#NOTE: a .xlsx sheet cannot hold this much data, hence the plain text format
output_path = 'final_vodka_data.txt'
final_vodka_df.to_csv(output_path, sep='\t', index=False)