# Installs

In [1]:
pip install pandas

Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install SQLAlchemy

Note: you may need to restart the kernel to use updated packages.


# Imports

In [1]:
import pandas as pd
import numpy as np
import os
from configparser import ConfigParser
from sqlalchemy import create_engine, text

# Data Load and Preparation

In [2]:
# Set variable parameters
columns_target = ['5-digit fips code', 'release year', 'premature death raw value']
columns_rename = ['location_id', 'year', 'premature_death_raw']
columns_datatype = 'float'
county_prefix = '0001'

# Database parameters
database_table_name = 'premature_death_0000'
database_columns = ['id'] + columns_rename
database_datatypes = 'SERIAL', 'TEXT', 'VARCHAR(4)', 'REAL'
database_constraints = ['PRIMARY KEY', 'NOT NULL', 'NOT NULL', 'NOT NULL']
foreign_key_table = 'location'
foreign_key_home = 'location_id'
foreign_key_away = 'location_id'

In [3]:
# Create a list of file paths
file_paths = [
    r"../data/CHR/analytic_data2010.csv",
    r"../data/CHR/analytic_data2011.csv",
    r"../data/CHR/analytic_data2012.csv",
    r"../data/CHR/analytic_data2013.csv",
    r"../data/CHR/analytic_data2014.csv",
    r"../data/CHR/analytic_data2015.csv",
    r"../data/CHR/analytic_data2016.csv",
    r"../data/CHR/analytic_data2017.csv",
    r"../data/CHR/analytic_data2018_0.csv",
    r"../data/CHR/analytic_data2019.csv",
    r"../data/CHR/analytic_data2020_0.csv",
    r"../data/CHR/analytic_data2021.csv",
    r"../data/CHR/analytic_data2022.csv",
    r"../data/CHR/analytic_data2023_0.csv"
]

# Create a list of dataframes by reading each file
list_df_initial = [pd.read_csv(file_path, low_memory=False) for file_path in file_paths]
print("Dataframe Loaded")

Dataframe Loaded


In [4]:
# Drop the unneeded 2nd header row
# Reset the index after removing the unneeded 2nd header row
# Set all header text to lowercase


for index in range(len(list_df_initial)):
    list_df_initial[index] = list_df_initial[index].drop(index=0)
    list_df_initial[index].reset_index(drop=True, inplace=True)
    list_df_initial[index].columns = list_df_initial[index].columns.str.lower()
    print(f"{list_df_initial[index]['release year'].iloc[1]}: 2nd Row Dropped")
    print(f"{list_df_initial[index]['release year'].iloc[1]}: Index Reset")
    print(f"{list_df_initial[index]['release year'].iloc[1]}: Headers Set to Lowercase")
    print("_______________________________________________________")

2010: 2nd Row Dropped
2010: Index Reset
2010: Headers Set to Lowercase
_______________________________________________________
2011: 2nd Row Dropped
2011: Index Reset
2011: Headers Set to Lowercase
_______________________________________________________
2012: 2nd Row Dropped
2012: Index Reset
2012: Headers Set to Lowercase
_______________________________________________________
2013: 2nd Row Dropped
2013: Index Reset
2013: Headers Set to Lowercase
_______________________________________________________
2014: 2nd Row Dropped
2014: Index Reset
2014: Headers Set to Lowercase
_______________________________________________________
2015: 2nd Row Dropped
2015: Index Reset
2015: Headers Set to Lowercase
_______________________________________________________
2016: 2nd Row Dropped
2016: Index Reset
2016: Headers Set to Lowercase
_______________________________________________________
2017: 2nd Row Dropped
2017: Index Reset
2017: Headers Set to Lowercase
________________________________________

# Data Inspection

In [5]:
# Show dataframe sizes
for df in list_df_initial:
    print(f"{df['release year'].iloc[1]}: {df.shape}")

2010: (3193, 198)
2011: (3193, 379)
2012: (3193, 330)
2013: (3193, 466)
2014: (3193, 487)
2015: (3193, 462)
2016: (3193, 482)
2017: (3195, 492)
2018: (3194, 508)
2019: (3194, 534)
2020: (3194, 786)
2021: (3194, 690)
2022: (3194, 725)
2023: (3194, 720)


In [6]:
# Print column NaN count
for index in range(len(list_df_initial)):
    print(f"<{list_df_initial[index]['release year'].iloc[1]}>")
    for col in columns_target:
        try:
            print(f"NaN {col}: {list_df_initial[index][col].isna().sum()}")
        except:
            print(f"----------{col} NOT FOUND----------")
    print("__________________________________________")

<2010>
NaN 5-digit fips code: 0
NaN release year: 0
NaN premature death raw value: 99
__________________________________________
<2011>
NaN 5-digit fips code: 0
NaN release year: 0
NaN premature death raw value: 95
__________________________________________
<2012>
NaN 5-digit fips code: 0
NaN release year: 0
NaN premature death raw value: 71
__________________________________________
<2013>
NaN 5-digit fips code: 0
NaN release year: 0
NaN premature death raw value: 69
__________________________________________
<2014>
NaN 5-digit fips code: 0
NaN release year: 0
NaN premature death raw value: 69
__________________________________________
<2015>
NaN 5-digit fips code: 0
NaN release year: 0
NaN premature death raw value: 69
__________________________________________
<2016>
NaN 5-digit fips code: 0
NaN release year: 0
NaN premature death raw value: 66
__________________________________________
<2017>
NaN 5-digit fips code: 1
NaN release year: 7
NaN premature death raw value: 72
___________

# Transform Data by Table

In [7]:
# Create an empty list to store the reduced DataFrames
reduced_df_list = []

# Loop through each dataframe in list_df_initial and create a new list of dataframes with the selected columns
for index in range(len(list_df_initial)):
    reduced_df = list_df_initial[index][columns_target]
    reduced_df_list.append(reduced_df)
print("Reduced Dataframe Created")

Reduced Dataframe Created


In [8]:
%%capture --no-stdout
# Create a dictionary for renaming columns
rename_dict = dict(zip(columns_target, columns_rename))

# Rename the columns
for index in range(len(reduced_df_list)):
    # Rename columns in a dataframe using a dictionary
    reduced_df_list[index].rename(columns=rename_dict, inplace=True)    
print("Columns Renamed")

Columns Renamed


In [9]:
%%capture --no-stdout
# Drop rows where 'location_id' is NaN
for index in range(len(reduced_df_list)):
    reduced_df_list[index].dropna(subset=['location_id'], inplace=True)
print("NaN location_id Dropped")

NaN location_id Dropped


In [10]:
%%capture --no-stdout
# Iterate over each dataframe in the list to get the mode of the 'year' column, and fill NaN values with the mode.
for df in reduced_df_list:
    # Calculate the mode of the 'year' column
    mode_year = df['year'].mode()[0]
    
    # Fill NaN values in the 'year' column with the mode value
    df['year'].fillna(mode_year, inplace=True)

print("NaN values in the 'year' column filled with the mode value.")

NaN values in the 'year' column filled with the mode value.


In [11]:
%%capture --no-stdout
# Adds a country code refix to the area_id
for index in range(len(reduced_df_list)):
    reduced_df_list[index]['location_id'] = county_prefix + reduced_df_list[index]['location_id'] 
print("Prefix Added")

Prefix Added


In [12]:
#Check Prefix
for index, value in enumerate(reduced_df_list[0]["location_id"]):
    if index == 5:
        break
    else:
        print(value)

000100000
000101000
000101001
000101003
000101005


In [13]:
# Concatenate all DataFrames in the list
dfCombined = pd.concat(reduced_df_list, axis=0, join='inner')
print("Dataframes Combined")

Dataframes Combined


In [14]:
# Replace NaN values with 0
dfCombined.fillna(0, inplace=True)

# Print a success message
print("All NaN values have been replaced with 0.")

All NaN values have been replaced with 0.


In [15]:
# Sort and reset index
dfCombined = dfCombined.sort_values(by=['location_id', 'year'])
dfCombined.reset_index(drop=True, inplace=True)
print("Sorting and Index Reset Complete")

Sorting and Index Reset Complete


In [16]:
# Set column datatypes
dfCombined['location_id'] = dfCombined['location_id'].astype(str)
dfCombined['year'] = dfCombined['year'].astype(str)
dfCombined[columns_rename[2]] = dfCombined[columns_rename[2]].astype(columns_datatype)
print("Column Types Set")

Column Types Set


In [17]:
print(dfCombined.shape)

(44709, 3)


In [18]:
# Print column NaN count
for col in columns_rename:
    try:
        print(f"NaN {col}: {dfCombined[col].isna().sum()}")
    except:
        print(f"----------{col} NOT FOUND----------")
print("__________________________________________")

NaN location_id: 0
NaN year: 0
NaN premature_death_raw: 0
__________________________________________


# Save Dataframe as CSV

In [21]:
# Define the path where the CSV file will be saved
folder_path = '../test_folder'
file_path = os.path.join(folder_path, 'dataset_combined.csv')

# Check if the folder exists, if not, create it
if not os.path.exists(folder_path):
    os.makedirs(folder_path)

# Save the DataFrame as a CSV file in the specified location
dfCombined.to_csv(file_path, index=False)

# Print a success message
print(f"The DataFrame has been saved as a CSV file at {file_path}")

The DataFrame has been saved as a CSV file at ../test_folder\dataset_combined.csv


# Upload to Database

In [19]:
# Read database.ini
filename='../database.ini'
section='postgresql'
parser = ConfigParser()
parser.read(filename)
config = {}
if parser.has_section(section):
    params = parser.items(section)
    for param in params:
        config[param[0]] = param[1]
else:
    raise Exception(f'Section {section} not found in the {filename} file')

# Create an SQLAlchemy engine
db = create_engine(f"postgresql+psycopg2://{config['user']}:{config['password']}@{config['host']}:{config['port']}/{config['database']}")

In [20]:
# Format the table config
create_table_query = f"""
    CREATE TABLE IF NOT EXISTS {database_table_name} ("""    
for index in range(len(database_columns)):        
    create_table_query = create_table_query + f"\n        {database_columns[index]} {database_datatypes[index]} {database_constraints[index]}"
    if index == len(database_columns) - 1:
        if len(foreign_key_table) > 0:
            create_table_query = create_table_query + f",\n        FOREIGN KEY ({foreign_key_home}) REFERENCES {foreign_key_table} ({foreign_key_away})\n    );"
        else:
            create_table_query = create_table_query + f"\n    );"
    else:
        create_table_query = create_table_query + ','

print("___________________________")
print(create_table_query)
print("___________________________")

# Create the table
try:
    # Execute the query
    with db.connect() as connection:
        connection.execute(text(create_table_query))
        connection.commit()
        connection.close()
    print("Table created successfully!")

except Exception as e:
    print(f"Error creating table: {e}")

___________________________

    CREATE TABLE IF NOT EXISTS premature_death_0000 (
        id SERIAL PRIMARY KEY,
        location_id TEXT NOT NULL,
        year VARCHAR(4) NOT NULL,
        premature_death_raw REAL NOT NULL,
        FOREIGN KEY (location_id) REFERENCES location (location_id)
    );
___________________________
Table created successfully!


In [21]:
try:
    # Create the connection
    conn = db.connect()

    # Insert the DataFrame values into the table
    dfCombined.to_sql(database_table_name, con=conn, if_exists='append', index=False) 

    # Commits the changes and closes the connection
    conn.commit()
    conn.close()
    print("Database Upload Complete")
except Exception as e:
    print(f"Error: {e}")

Database Upload Complete


In [22]:
try:
    # Execute the query
    with db.connect() as connection:
        query = text(f"SELECT * FROM {database_table_name}")
        result = connection.execute(query)
        for row in result.fetchall():
            print(row)

except Exception as e:
    print(f"Error querying the database: {e}")

(1, '000100000', '2010', 7261.2)
(2, '000100000', '2011', 7198.3)
(3, '000100000', '2012', 7082.91)
(4, '000100000', '2013', 6811.19)
(5, '000100000', '2014', 6811.19)
(6, '000100000', '2015', 6621.61)
(7, '000100000', '2016', 6605.3)
(8, '000100000', '2017', 6601.2)
(9, '000100000', '2018', 6658.114)
(10, '000100000', '2019', 6900.6304)
(11, '000100000', '2020', 6940.1104)
(12, '000100000', '2021', 6906.641)
(13, '000100000', '2022', 7281.9355)
(14, '000100000', '2023', 7281.9355)
(15, '000101000', '2010', 10150.4)
(16, '000101000', '2011', 10189.2)
(17, '000101000', '2012', 10152.06)
(18, '000101000', '2013', 9608.9)
(19, '000101000', '2014', 9608.9)
(20, '000101000', '2015', 9507.9)
(21, '000101000', '2016', 9544.7)
(22, '000101000', '2017', 9573.2)
(23, '000101000', '2018', 9642.412)
(24, '000101000', '2019', 9917.232)
(25, '000101000', '2020', 9942.795)
(26, '000101000', '2021', 9819.888)
(27, '000101000', '2022', 10350.071)
(28, '000101000', '2023', 10350.071)
(29, '000101001', '