Now that we have finally preprocessed all of our data, it is time to load everything into our database. Because our dataset is rather large, I chose not to store everything in a single table, but rather to create one table per year, mirroring the yearly files we have been working with so far.

In [1]:
import pandas as pd
import glob
import mysql.connector

In [2]:
#collect the names of every version 4 csv file matched by the pattern
files_list = glob.glob('V4*.csv')

#build a dictionary mapping each file name to the dataframe read from it
data_frames = dict(zip(files_list, map(pd.read_csv, files_list)))

Now we have to set up the connection to the MySQL database and load in the data. The loading into the MySQL database is split into two runs (2012–2018 and 2019–2023) to avoid crashing the kernel.

In [4]:

#connect to mysql server
#NOTE: host, user, password and database below are placeholder values
connection = mysql.connector.connect(
    host="host",
    user="root",
    password="not_my_pass",
    database="mysql_database"
)

#cursor to execute SQL commands
cursor = connection.cursor()

#get range of years to insert into database
#NOTE putting the full range here was liable to crash the kernel
year_range = range(2012, 2019)

#column names and MySQL types shared by every yearly table
#NOTE: rows are inserted by position, so this order is assumed to match the
#column order of the V4 csv files -- verify against the files if they change
table_columns = [
    ("Date", "DATE"),
    ("Store_Number", "INT"),
    ("Store_Name", "VARCHAR(100)"),
    ("Address", "VARCHAR(255)"),
    ("City", "VARCHAR(100)"),
    ("Zip_Code", "INT"),
    ("Store_Location", "VARCHAR(255)"),
    ("County_Number", "FLOAT"),
    ("County", "VARCHAR(100)"),
    ("Category", "FLOAT"),
    ("Category_Name", "VARCHAR(255)"),
    ("Vendor_Number", "FLOAT"),
    ("Vendor_Name", "VARCHAR(255)"),
    ("Item_Number", "INT"),
    ("Item_Description", "VARCHAR(255)"),
    ("Pack", "INT"),
    ("Bottle_Volume_ml", "INT"),
    ("State_Bottle_Cost", "FLOAT"),
    ("State_Bottle_Retail", "FLOAT"),
    ("Bottles_Sold", "INT"),
    ("Sale_Dollars", "FLOAT"),
    ("Volume_Sold_Liters", "FLOAT"),
    ("Volume_Sold_Gallons", "FLOAT"),
    ("Longitude", "FLOAT"),
    ("Latitude", "FLOAT"),
    ("Imports", "INT"),
    ("Vodka", "INT"),
    ("Whisky", "INT"),
    ("Rum", "INT"),
    ("Liqueur", "INT"),
    ("Tequila", "INT"),
    ("Gin", "INT"),
    ("Brandy", "INT"),
    ("Schnapps", "INT"),
    ("Scotch", "INT"),
    ("Specialty", "INT"),
    ("Special_Order", "INT"),
]

#derive the DDL column list, the insert column list and the placeholders from
#the single definition above so the three can never drift apart
column_definitions = ",\n            ".join(
    f"{name} {sql_type}" for name, sql_type in table_columns
)
column_names = ", ".join(name for name, _ in table_columns)
placeholders = ", ".join(["%s"] * len(table_columns))

#define a chunk size to insert tuples into database
#NOTE: Not doing this will crash kernel
chunk_size = 10000  # Adjust this as needed

try:
    for year in year_range:
        #get dataframe from dictionary based on year
        dataframe = data_frames[f'V4_Iowa_Liquor_Data_{year}.csv']

        #write query to create table in mysql based on year
        #NOTE: the query is built as a formatted string first and then
        #passed to cursor.execute
        create_table_query = f'''CREATE TABLE IF NOT EXISTS liquor_sales_{year} (
            id INT AUTO_INCREMENT PRIMARY KEY,
            {column_definitions}
        )'''

        #create liquor table using formatted string
        cursor.execute(create_table_query)

        #define the MySQL insertion query
        insert_query = (
            f'INSERT INTO liquor_sales_{year} ({column_names}) '
            f'VALUES ({placeholders})'
        )

        #convert 'Date' column to datetime and then to MySQL suitable format
        #NOTE: date must be in .strftime format to be useable in MySQL
        dataframe['Date'] = pd.to_datetime(dataframe['Date'], errors='coerce')
        dataframe['Date'] = dataframe['Date'].dt.strftime('%Y-%m-%d')

        total_rows = len(dataframe)
        inserted_rows = 0

        #loop to insert data into database
        #NOTE: tuples are built one chunk at a time instead of for the whole
        #dataframe up front, so only chunk_size rows are held as Python
        #objects at any moment
        for i in range(0, total_rows, chunk_size):
            rows = dataframe.iloc[i:i + chunk_size]
            #missing values (NaN, and dates that failed to parse) are sent
            #as None so MySQL stores NULL instead of receiving a float NaN
            rows = rows.astype(object).where(rows.notna(), None)
            chunk = list(rows.itertuples(index=False, name=None))
            cursor.executemany(insert_query, chunk)
            inserted_rows += len(chunk)
            print(f"Inserted {inserted_rows} out of {total_rows} rows for DataFrame V4_Iowa_Liquor_Data_{year}.csv.")

        #commit this year's rows explicitly; the next CREATE TABLE would
        #implicitly commit them in MySQL anyway, so this only makes it visible
        connection.commit()
finally:
    #always release the cursor and the connection, even if an insert failed
    cursor.close()
    connection.close()


Inserted 10000 out of 2066511 rows for DataFrame V4_Iowa_Liquor_Data_2012.csv.
Inserted 20000 out of 2066511 rows for DataFrame V4_Iowa_Liquor_Data_2012.csv.
Inserted 30000 out of 2066511 rows for DataFrame V4_Iowa_Liquor_Data_2012.csv.
Inserted 40000 out of 2066511 rows for DataFrame V4_Iowa_Liquor_Data_2012.csv.
Inserted 50000 out of 2066511 rows for DataFrame V4_Iowa_Liquor_Data_2012.csv.
Inserted 60000 out of 2066511 rows for DataFrame V4_Iowa_Liquor_Data_2012.csv.
Inserted 70000 out of 2066511 rows for DataFrame V4_Iowa_Liquor_Data_2012.csv.
Inserted 80000 out of 2066511 rows for DataFrame V4_Iowa_Liquor_Data_2012.csv.
Inserted 90000 out of 2066511 rows for DataFrame V4_Iowa_Liquor_Data_2012.csv.
Inserted 100000 out of 2066511 rows for DataFrame V4_Iowa_Liquor_Data_2012.csv.
Inserted 110000 out of 2066511 rows for DataFrame V4_Iowa_Liquor_Data_2012.csv.
Inserted 120000 out of 2066511 rows for DataFrame V4_Iowa_Liquor_Data_2012.csv.
Inserted 130000 out of 2066511 rows for DataFrame

In [5]:


#connect to mysql server
#NOTE: host, user, password and database below are placeholder values
connection = mysql.connector.connect(
    host="host",
    user="root",
    password="not_my_pass",
    database="mysql_database"
)

#cursor to execute SQL commands
cursor = connection.cursor()

#get range of years to insert into database
#NOTE putting the full range here was liable to crash the kernel
year_range = range(2019, 2024)

#column names and MySQL types shared by every yearly table
#NOTE: rows are inserted by position, so this order is assumed to match the
#column order of the V4 csv files -- verify against the files if they change
table_columns = [
    ("Date", "DATE"),
    ("Store_Number", "INT"),
    ("Store_Name", "VARCHAR(100)"),
    ("Address", "VARCHAR(255)"),
    ("City", "VARCHAR(100)"),
    ("Zip_Code", "INT"),
    ("Store_Location", "VARCHAR(255)"),
    ("County_Number", "FLOAT"),
    ("County", "VARCHAR(100)"),
    ("Category", "FLOAT"),
    ("Category_Name", "VARCHAR(255)"),
    ("Vendor_Number", "FLOAT"),
    ("Vendor_Name", "VARCHAR(255)"),
    ("Item_Number", "INT"),
    ("Item_Description", "VARCHAR(255)"),
    ("Pack", "INT"),
    ("Bottle_Volume_ml", "INT"),
    ("State_Bottle_Cost", "FLOAT"),
    ("State_Bottle_Retail", "FLOAT"),
    ("Bottles_Sold", "INT"),
    ("Sale_Dollars", "FLOAT"),
    ("Volume_Sold_Liters", "FLOAT"),
    ("Volume_Sold_Gallons", "FLOAT"),
    ("Longitude", "FLOAT"),
    ("Latitude", "FLOAT"),
    ("Imports", "INT"),
    ("Vodka", "INT"),
    ("Whisky", "INT"),
    ("Rum", "INT"),
    ("Liqueur", "INT"),
    ("Tequila", "INT"),
    ("Gin", "INT"),
    ("Brandy", "INT"),
    ("Schnapps", "INT"),
    ("Scotch", "INT"),
    ("Specialty", "INT"),
    ("Special_Order", "INT"),
]

#derive the DDL column list, the insert column list and the placeholders from
#the single definition above so the three can never drift apart
column_definitions = ",\n            ".join(
    f"{name} {sql_type}" for name, sql_type in table_columns
)
column_names = ", ".join(name for name, _ in table_columns)
placeholders = ", ".join(["%s"] * len(table_columns))

#define a chunk size to insert tuples into database
#NOTE: Not doing this will crash kernel
chunk_size = 10000  # Adjust this as needed

try:
    for year in year_range:
        #get dataframe from dictionary based on year
        dataframe = data_frames[f'V4_Iowa_Liquor_Data_{year}.csv']

        #write query to create table in mysql based on year
        #NOTE: the query is built as a formatted string first and then
        #passed to cursor.execute
        create_table_query = f'''CREATE TABLE IF NOT EXISTS liquor_sales_{year} (
            id INT AUTO_INCREMENT PRIMARY KEY,
            {column_definitions}
        )'''

        #create liquor table using formatted string
        cursor.execute(create_table_query)

        #define the MySQL insertion query
        insert_query = (
            f'INSERT INTO liquor_sales_{year} ({column_names}) '
            f'VALUES ({placeholders})'
        )

        #convert 'Date' column to datetime and then to MySQL suitable format
        #NOTE: date must be in .strftime format to be useable in MySQL
        dataframe['Date'] = pd.to_datetime(dataframe['Date'], errors='coerce')
        dataframe['Date'] = dataframe['Date'].dt.strftime('%Y-%m-%d')

        total_rows = len(dataframe)
        inserted_rows = 0

        #loop to insert data into database
        #NOTE: tuples are built one chunk at a time instead of for the whole
        #dataframe up front, so only chunk_size rows are held as Python
        #objects at any moment
        for i in range(0, total_rows, chunk_size):
            rows = dataframe.iloc[i:i + chunk_size]
            #missing values (NaN, and dates that failed to parse) are sent
            #as None so MySQL stores NULL instead of receiving a float NaN
            rows = rows.astype(object).where(rows.notna(), None)
            chunk = list(rows.itertuples(index=False, name=None))
            cursor.executemany(insert_query, chunk)
            inserted_rows += len(chunk)
            print(f"Inserted {inserted_rows} out of {total_rows} rows for DataFrame V4_Iowa_Liquor_Data_{year}.csv.")

        #commit this year's rows explicitly; the next CREATE TABLE would
        #implicitly commit them in MySQL anyway, so this only makes it visible
        connection.commit()
finally:
    #always release the cursor and the connection, even if an insert failed
    cursor.close()
    connection.close()


Inserted 10000 out of 2371873 rows for DataFrame V4_Iowa_Liquor_Data_2019.csv.
Inserted 20000 out of 2371873 rows for DataFrame V4_Iowa_Liquor_Data_2019.csv.
Inserted 30000 out of 2371873 rows for DataFrame V4_Iowa_Liquor_Data_2019.csv.
Inserted 40000 out of 2371873 rows for DataFrame V4_Iowa_Liquor_Data_2019.csv.
Inserted 50000 out of 2371873 rows for DataFrame V4_Iowa_Liquor_Data_2019.csv.
Inserted 60000 out of 2371873 rows for DataFrame V4_Iowa_Liquor_Data_2019.csv.
Inserted 70000 out of 2371873 rows for DataFrame V4_Iowa_Liquor_Data_2019.csv.
Inserted 80000 out of 2371873 rows for DataFrame V4_Iowa_Liquor_Data_2019.csv.
Inserted 90000 out of 2371873 rows for DataFrame V4_Iowa_Liquor_Data_2019.csv.
Inserted 100000 out of 2371873 rows for DataFrame V4_Iowa_Liquor_Data_2019.csv.
Inserted 110000 out of 2371873 rows for DataFrame V4_Iowa_Liquor_Data_2019.csv.
Inserted 120000 out of 2371873 rows for DataFrame V4_Iowa_Liquor_Data_2019.csv.
Inserted 130000 out of 2371873 rows for DataFrame