# Project Group # 2
> Group Members: Sadia Khan Durani, Safa Sajid, Mariam Virk

Proposed Idea: Analyzing the influence of Climate Change on Natural Disasters vs its Effects on Global Development.

Research Question: **How are countries affected by natural disasters, and can these disasters be linked to climate change?**

> ## Begin Dataset Cleaning

In [1]:
# Required import statements
import pandas as pd
import numpy as np
import csv

In [2]:
# Countries this project focuses on. The United States is listed under several
# spellings so that the same list can be used to filter every dataset.
filtered_countries = [
    'Canada',
    'United States',
    'United States of America',
    'United States of America (the)',
    'Mexico',
]

## 1 : Climate Dataset

In [3]:
# Load in the original climate change dataset
climate = pd.read_csv("data/climate_change_data.csv")
climate.head()

Unnamed: 0,Date,Location,Country,Temperature,CO2 Emissions,Sea Level Rise,Precipitation,Humidity,Wind Speed
0,2000-01-01 00:00:00.000000000,New Williamtown,Latvia,10.688986,403.118903,0.717506,13.835237,23.631256,18.492026
1,2000-01-01 20:09:43.258325832,North Rachel,South Africa,13.81443,396.663499,1.205715,40.974084,43.982946,34.2493
2,2000-01-02 16:19:26.516651665,West Williamland,French Guiana,27.323718,451.553155,-0.160783,42.697931,96.6526,34.124261
3,2000-01-03 12:29:09.774977497,South David,Vietnam,12.309581,422.404983,-0.475931,5.193341,47.467938,8.554563
4,2000-01-04 08:38:53.033303330,New Scottburgh,Moldova,13.210885,410.472999,1.135757,78.69528,61.789672,8.001164


In [4]:
# Filter for only the biggest North American countries (Canada, USA, Mexico).
# Take an explicit copy so that later column assignments on `climate` modify an
# independent frame rather than a slice of the originally loaded data (writing
# to such a slice is what triggers pandas' SettingWithCopyWarning).
climate = climate[climate['Country'].isin(filtered_countries)].copy()
climate.head(1)

Unnamed: 0,Date,Location,Country,Temperature,CO2 Emissions,Sea Level Rise,Precipitation,Humidity,Wind Speed
56,2000-02-17 01:04:22.466246624,Garciatown,United States of America,13.504183,354.390467,-0.484336,10.686071,35.823373,31.527324


In [5]:
# Keep only the year from 'Date' (the first four characters of the timestamp
# string), since the data is looked at from a yearly perspective.
# `assign` returns a new frame instead of writing into the filtered slice
# produced by the previous step, which avoids pandas' SettingWithCopyWarning.
climate = climate.assign(Date=climate['Date'].str.slice(0, 4).astype(int))
climate.head(1)

Unnamed: 0,Date,Location,Country,Temperature,CO2 Emissions,Sea Level Rise,Precipitation,Humidity,Wind Speed
56,2000,Garciatown,United States of America,13.504183,354.390467,-0.484336,10.686071,35.823373,31.527324


In [6]:
# Restrict the rows to the 2000-2020 window (both endpoints included)
in_study_period = climate['Date'].between(2000, 2020)
climate = climate[in_study_period]
climate.shape

(103, 9)

In [7]:
# Remove the 'Location' column
climate = climate.drop(columns=["Location"])
climate.head(1)

Unnamed: 0,Date,Country,Temperature,CO2 Emissions,Sea Level Rise,Precipitation,Humidity,Wind Speed
56,2000,United States of America,13.504183,354.390467,-0.484336,10.686071,35.823373,31.527324


In [8]:
# The column now holds a year rather than a full date, so call it 'Year'
climate = climate.rename({"Date": "Year"}, axis="columns")
climate.head(1)

Unnamed: 0,Year,Country,Temperature,CO2 Emissions,Sea Level Rise,Precipitation,Humidity,Wind Speed
56,2000,United States of America,13.504183,354.390467,-0.484336,10.686071,35.823373,31.527324


In [9]:
# Print the unique values of 'Country' column
climate['Country'].unique()

array(['United States of America', 'Mexico', 'Canada'], dtype=object)

In [10]:
# Change 'United States of America' to 'United States'.
# Only the name that needs changing is listed: `replace` leaves every other
# value untouched, whereas `map` would turn any country that is missing from
# the dictionary into NaN.
country_dict = {'United States of America' : 'United States'}

# Replace the country names with their new equivalents
climate['Country'] = climate['Country'].replace(country_dict)
climate['Country'].unique()

array(['United States', 'Mexico', 'Canada'], dtype=object)

In [11]:
# (Year, Country) has to be unique to act as the primary key of Climate Metrics,
# so duplicate pairs are collapsed. This cell prepares the per-pair means of
# Temperature and CO2 Emissions.

# Mean temperature for each year-country pair
average_temp = (
    climate.groupby(['Year', 'Country'], as_index=False)['Temperature']
    .mean()
    .rename(columns={"Temperature": "Avg Temperature"})
)

# Mean CO2 emissions for each year-country pair
average_CO2 = (
    climate.groupby(['Year', 'Country'], as_index=False)['CO2 Emissions']
    .mean()
    .rename(columns={'CO2 Emissions': 'Avg CO2 Emissions'})
)

average_temp.head(1), average_CO2.head(1)

(   Year Country  Avg Temperature
 0  2000  Canada         8.704837,
    Year Country  Avg CO2 Emissions
 0  2000  Canada         430.252678)

In [12]:
# Collapse duplicate (Year, Country) pairs by keeping the per-column maximum
# (this covers Temperature and CO2 Emissions as well as the remaining columns)
climate = (
    climate.groupby(['Year', 'Country'], as_index=False)
    .max()
    .rename(columns={"Temperature": "Max Temperature",
                     "CO2 Emissions": "Max CO2 Emissions"})
)
climate.head(1)

Unnamed: 0,Year,Country,Max Temperature,Max CO2 Emissions,Sea Level Rise,Precipitation,Humidity,Wind Speed
0,2000,Canada,13.312956,440.546108,-0.159359,59.89413,73.853461,32.547627


In [13]:
# Attach the per-pair averages (average_temp, then average_CO2) to climate
for averages in (average_temp, average_CO2):
    climate = climate.merge(averages, on=['Year', 'Country'], how='right')

# Put the columns into the desired order
order = [
    'Country', 'Year',
    'Max Temperature', 'Avg Temperature',
    'Max CO2 Emissions', 'Avg CO2 Emissions',
    'Sea Level Rise', 'Precipitation', 'Humidity', 'Wind Speed',
]
climate = climate.loc[:, order]
climate.head(1)

Unnamed: 0,Country,Year,Max Temperature,Avg Temperature,Max CO2 Emissions,Avg CO2 Emissions,Sea Level Rise,Precipitation,Humidity,Wind Speed
0,Canada,2000,13.312956,8.704837,440.546108,430.252678,-0.159359,59.89413,73.853461,32.547627


In [14]:
print(f"There are {climate.shape[0]} rows remaining in the Climate Metrics dataset \n")

There are 52 rows remaining in the Climate Metrics dataset 



In [15]:
# Count how many rows share each ('Country', 'Year') combination
pair_counts = climate.groupby(['Country', 'Year']).size()

# Number of combinations that occur exactly once; this equals the row count
# when every 'Year' - 'Country' pair is unique
(pair_counts.values == 1).sum()

52

In [16]:
# No missing data in dataset
climate.isnull().sum()

Country              0
Year                 0
Max Temperature      0
Avg Temperature      0
Max CO2 Emissions    0
Avg CO2 Emissions    0
Sea Level Rise       0
Precipitation        0
Humidity             0
Wind Speed           0
dtype: int64

## 2 : Human Development/Population Dataset

In [17]:
# Load in the original human development/population dataset
hdi_population = pd.read_csv("data/hdi_population_data.csv")
hdi_population.head()

Unnamed: 0,Entity,Code,Year,Augmented Human Development Index (AHDI),814486-annotations,GDP per capita,417485-annotations,Population (historical estimates),Continent
0,Abkhazia,OWID_ABK,2015,,,,,,Asia
1,Afghanistan,AFG,1950,0.050043,,1156.0,,7480464.0,
2,Afghanistan,AFG,1955,0.053487,,1246.0,,7971933.0,
3,Afghanistan,AFG,1960,0.058654,,1326.0,,8622473.0,
4,Afghanistan,AFG,1965,0.068907,,1290.0,,9565154.0,


In [18]:
# Keep only the rows for the biggest North American countries (Canada, USA, Mexico)
is_target_country = hdi_population['Entity'].isin(filtered_countries)
hdi_population = hdi_population[is_target_country]
hdi_population.head(1)

Unnamed: 0,Entity,Code,Year,Augmented Human Development Index (AHDI),814486-annotations,GDP per capita,417485-annotations,Population (historical estimates),Continent
9032,Canada,CAN,1870,0.23021,,2702.0,,3761273.0,


In [19]:
# Restrict the rows to the 2000-2020 window (both endpoints included)
in_study_period = hdi_population['Year'].between(2000, 2020)
hdi_population = hdi_population[in_study_period]
hdi_population.head(1)

Unnamed: 0,Entity,Code,Year,Augmented Human Development Index (AHDI),814486-annotations,GDP per capita,417485-annotations,Population (historical estimates),Continent
9051,Canada,CAN,2000,0.725572,,36942.562,,30683316.0,


In [20]:
# Remove the columns that are not carried forward
unneeded_columns = ["Code", "814486-annotations", "417485-annotations", "Continent"]
hdi_population = hdi_population.drop(columns=unneeded_columns)
hdi_population.head(1)

Unnamed: 0,Entity,Year,Augmented Human Development Index (AHDI),GDP per capita,Population (historical estimates)
9051,Canada,2000,0.725572,36942.562,30683316.0


In [21]:
# Give the remaining columns shorter names
hdi_new_names = {
    "Entity": "Country",
    "Augmented Human Development Index (AHDI)": "AHDI",
    "GDP per capita": "GDP Per Capita",
    "Population (historical estimates)": "Population",
}
hdi_population = hdi_population.rename(hdi_new_names, axis="columns")
hdi_population.head(1)

Unnamed: 0,Country,Year,AHDI,GDP Per Capita,Population
9051,Canada,2000,0.725572,36942.562,30683316.0


In [22]:
print(f"There are {hdi_population.shape[0]} rows remaining in the Human Development/Population dataset \n")

There are 63 rows remaining in the Human Development/Population dataset 



In [23]:
# ('Year', 'Country') is the primary key of Socio-Economic Indicators, so every
# pair must appear exactly once; tally the rows per pair
pair_counts = hdi_population.groupby(['Year', 'Country']).size()

# Number of pairs that occur exactly once
(pair_counts.values == 1).sum()

63

In [25]:
# Some values are missing in this dataset. They are left as-is here: NaN
# entries are written out as SQL NULL when the INSERT statements are generated.
hdi_population.isna().sum()

Country            0
Year               0
AHDI              48
GDP Per Capita     6
Population         0
dtype: int64

## 3 : Disasters Dataset

In [26]:
# Load in the original disasters dataset
disasters = pd.read_csv("data/natural_disaster_data.csv")
disasters.head()

Unnamed: 0,Dis No,Year,Seq,Disaster Group,Disaster Subgroup,Disaster Type,Disaster Subtype,Disaster Subsubtype,Event Name,Entry Criteria,...,End Day,Total Deaths,No Injured,No Affected,No Homeless,Total Affected,Reconstruction Costs ('000 US$),Insured Damages ('000 US$),Total Damages ('000 US$),CPI
0,1900-9002-CPV,1900,9002,Natural,Climatological,Drought,Drought,,,,...,,11000.0,,,,,,,,3.261389
1,1900-9001-IND,1900,9001,Natural,Climatological,Drought,Drought,,,,...,,1250000.0,,,,,,,,3.261389
2,1902-0012-GTM,1902,12,Natural,Geophysical,Earthquake,Ground movement,,,Kill,...,18.0,2000.0,,,,,,,25000.0,3.391845
3,1902-0003-GTM,1902,3,Natural,Geophysical,Volcanic activity,Ash fall,,Santa Maria,Kill,...,8.0,1000.0,,,,,,,,3.391845
4,1902-0010-GTM,1902,10,Natural,Geophysical,Volcanic activity,Ash fall,,Santa Maria,Kill,...,24.0,6000.0,,,,,,,,3.391845


In [27]:
# Keep only the rows for the biggest North American countries (Canada, USA, Mexico)
is_target_country = disasters['Country'].isin(filtered_countries)
disasters = disasters[is_target_country]
disasters.head(1)

Unnamed: 0,Dis No,Year,Seq,Disaster Group,Disaster Subgroup,Disaster Type,Disaster Subtype,Disaster Subsubtype,Event Name,Entry Criteria,...,End Day,Total Deaths,No Injured,No Affected,No Homeless,Total Affected,Reconstruction Costs ('000 US$),Insured Damages ('000 US$),Total Damages ('000 US$),CPI
5,1903-0006-CAN,1903,6,Natural,Geophysical,Mass movement (dry),Rockfall,,,Kill,...,29.0,76.0,23.0,,,23.0,,,,3.5223


In [28]:
# Restrict the rows to the 2000-2020 window (both endpoints included)
in_study_period = disasters['Year'].between(2000, 2020)
disasters = disasters[in_study_period]
disasters.head(1)

Unnamed: 0,Dis No,Year,Seq,Disaster Group,Disaster Subgroup,Disaster Type,Disaster Subtype,Disaster Subsubtype,Event Name,Entry Criteria,...,End Day,Total Deaths,No Injured,No Affected,No Homeless,Total Affected,Reconstruction Costs ('000 US$),Insured Damages ('000 US$),Total Damages ('000 US$),CPI
6668,2000-0420-CAN,2000,420,Natural,Meteorological,Storm,Convective storm,Tornado,,Kill,...,14.0,11.0,140.0,,700.0,840.0,,10000.0,13000.0,67.355759


In [29]:
# Columns carried forward from the disasters dataset; everything else is dropped
disasters_columns = [
    "Dis No", "Year",
    "Disaster Subgroup", "Disaster Type", "Disaster Subtype",
    "Country", "Start Month",
    "Total Deaths", "No Injured", "No Affected", "No Homeless", "Total Affected",
    "Insured Damages ('000 US$)", "Total Damages ('000 US$)", "CPI",
]

disasters = disasters.loc[:, disasters_columns]
disasters.head(1)

Unnamed: 0,Dis No,Year,Disaster Subgroup,Disaster Type,Disaster Subtype,Country,Start Month,Total Deaths,No Injured,No Affected,No Homeless,Total Affected,Insured Damages ('000 US$),Total Damages ('000 US$),CPI
6668,2000-0420-CAN,2000,Meteorological,Storm,Convective storm,Canada,7.0,11.0,140.0,,700.0,840.0,10000.0,13000.0,67.355759


In [30]:
# Give the columns shorter names
disaster_new_names = {
    "Start Month": "Month",
    "No Injured": "# Injured",
    "No Affected": "# Affected",
    "No Homeless": "# Homeless",
    "Insured Damages ('000 US$)": "Insured Damages",
    "Total Damages ('000 US$)": "Total Damages",
}
disasters = disasters.rename(disaster_new_names, axis="columns")
disasters.head(1)

Unnamed: 0,Dis No,Year,Disaster Subgroup,Disaster Type,Disaster Subtype,Country,Month,Total Deaths,# Injured,# Affected,# Homeless,Total Affected,Insured Damages,Total Damages,CPI
6668,2000-0420-CAN,2000,Meteorological,Storm,Convective storm,Canada,7.0,11.0,140.0,,700.0,840.0,10000.0,13000.0,67.355759


In [31]:
# Arrange the columns in the same order as the schema
order = [
    'Dis No', 'Country', 'Year', 'Month',
    'Disaster Subgroup', 'Disaster Type', 'Disaster Subtype',
    'Total Affected', 'Total Deaths', '# Injured', '# Affected', '# Homeless',
    'Total Damages', 'Insured Damages', 'CPI',
]

disasters = disasters[order]
disasters.head(1)

Unnamed: 0,Dis No,Country,Year,Month,Disaster Subgroup,Disaster Type,Disaster Subtype,Total Affected,Total Deaths,# Injured,# Affected,# Homeless,Total Damages,Insured Damages,CPI
6668,2000-0420-CAN,Canada,2000,7.0,Meteorological,Storm,Convective storm,840.0,11.0,140.0,,700.0,13000.0,10000.0,67.355759


In [32]:
# Print the unique values of the 'Country' column
disasters['Country'].unique()

array(['Canada', 'Mexico', 'United States of America (the)'], dtype=object)

In [33]:
# Change 'United States of America (the)' to 'United States'.
# Only the name that needs changing is listed: `replace` leaves every other
# value untouched, whereas `map` would turn any country that is missing from
# the dictionary into NaN (and Disaster_Country is declared NOT NULL in the
# Natural_Disaster table).
country_dict = {'United States of America (the)' : 'United States'}

# Replace the country names with their new equivalents
disasters['Country'] = disasters['Country'].replace(country_dict)
disasters['Country'].unique()

array(['Canada', 'Mexico', 'United States'], dtype=object)

In [34]:
# Print the unique values of the 'Month' column
disasters['Month'].unique()

array([ 7.,  2.,  6.,  9.,  8., 12.,  5., 10.,  1., 11.,  3.,  4.])

In [35]:
# Convert the Month column from its numerical encoding to month names.
# The month numbers are stored as floats (1.0 ... 12.0), so the lookup keys
# are floats as well.
month_names = ["Jan", "Feb", "Mar", "Apr", "May", "Jun",
               "Jul", "Aug", "Sept", "Oct", "Nov", "Dec"]
month_dict = {float(number): name for number, name in enumerate(month_names, start=1)}

# Swap each month number for its name
disasters['Month'] = disasters['Month'].map(month_dict)
disasters['Month'].unique()

array(['Jul', 'Feb', 'Jun', 'Sept', 'Aug', 'Dec', 'May', 'Oct', 'Jan',
       'Nov', 'Mar', 'Apr'], dtype=object)

In [36]:
# Some values are missing in this dataset. They are left as-is here: NaN
# entries are written out as SQL NULL when the INSERT statements are generated.
disasters.isna().sum()

Dis No                 0
Country                0
Year                   0
Month                  0
Disaster Subgroup      0
Disaster Type          0
Disaster Subtype      67
Total Affected       242
Total Deaths         187
# Injured            532
# Affected           366
# Homeless           606
Total Damages        259
Insured Damages      430
CPI                   33
dtype: int64

In [37]:
# Keep only the rows that have a value in all of 'Total Damages',
# 'Insured Damages' and 'CPI'
disasters_subset = disasters.dropna(subset=['Total Damages', 'Insured Damages', 'CPI'])
disasters_subset.shape

(246, 15)

In [39]:
# Narrow the subset down to the disaster number plus the three damage-related columns
economic_columns = ['Dis No', 'Total Damages', 'Insured Damages', 'CPI']
disasters_subset = disasters_subset.loc[:, economic_columns]
disasters_subset.head(1)

Unnamed: 0,Dis No,Total Damages,Insured Damages,CPI
6668,2000-0420-CAN,13000.0,10000.0,67.355759


In [40]:
# No missing data in subset dataset
disasters_subset.isnull().sum()

Dis No             0
Total Damages      0
Insured Damages    0
CPI                0
dtype: int64

> ## Done Dataset Cleaning

<br><br><br>

## SQL - CREATE TABLE STATEMENTS +  GENERATING INSERT STATEMENTS

* We will now use our cleaned up data frames to generate insert statements.

<br>

In [41]:
climate.head(1)

Unnamed: 0,Country,Year,Max Temperature,Avg Temperature,Max CO2 Emissions,Avg CO2 Emissions,Sea Level Rise,Precipitation,Humidity,Wind Speed
0,Canada,2000,13.312956,8.704837,440.546108,430.252678,-0.159359,59.89413,73.853461,32.547627


In [42]:
hdi_population.head(1)

Unnamed: 0,Country,Year,AHDI,GDP Per Capita,Population
9051,Canada,2000,0.725572,36942.562,30683316.0


In [43]:
disasters.head(1)

Unnamed: 0,Dis No,Country,Year,Month,Disaster Subgroup,Disaster Type,Disaster Subtype,Total Affected,Total Deaths,# Injured,# Affected,# Homeless,Total Damages,Insured Damages,CPI
6668,2000-0420-CAN,Canada,2000,Jul,Meteorological,Storm,Convective storm,840.0,11.0,140.0,,700.0,13000.0,10000.0,67.355759


In [44]:
disasters_subset.head(1)

Unnamed: 0,Dis No,Total Damages,Insured Damages,CPI
6668,2000-0420-CAN,13000.0,10000.0,67.355759


<br>

## **Relational Schema**
> ### Primary keys are <u>underlined</u>, foreign keys are **bolded**, candidate keys are _italicized_

<br>

Climate_Metrics( _<u>Country</u> , <u>Year</u>_ , Max_Temperature, Avg_Temperature, Max_CO2_Emissions, Avg_CO2_Emissions, Sea_Level_Rise, Humidity, Wind_Speed, Precipitation)

<br>

Socio_Economic_Indicators( _<u>Socio_Country</u> , <u>Socio_Year</u>_ , AHDI, GDP_Per_Capita, Population)

<br>

Natural_Disaster( _<u>Disaster_No</u>_ , **Disaster_Country, Disaster_Year,** Disaster_Month, Disaster_Subgroup, Disaster_Type, Disaster_Subtype)
> - Disaster_Country and Disaster_Year will be foreign keys to Climate_Metrics(<u>Country</u> , <u>Year</u> )

<br>

Damages( _**<u>Disaster_No</u>**_, **Disaster_Country, Disaster_Year,** Total_Affected, Total_Deaths, No_Injured, No_Affected, No_Homeless)
> - Disaster_No is the primary key and will be a foreign key to Natural_Disaster(<u>Disaster_No</u>)
> - Disaster_Country and Disaster_Year will be foreign keys to Socio_Economic_Indicators( <u>Socio_Country</u> , <u>Socio_Year</u> )

<br>

Economic_Damages(_**<u>Disaster_No</u>**_, _ _Total_Damages_ _ , Insured_Damages,  CPI) 
> - Disaster_No is the primary key and foreign key to Damages
> - Total_Damages is the partial key of Economic_Damages
> - FOREIGN KEY (Disaster_No) REFERENCES Damages 
    - ON DELETE CASCADE
    - ON UPDATE CASCADE  
        * NOTE: we did not include ON UPDATE CASCADE in our SQL statements because it is not supported by Oracle
        
<br>  

NOTE: the column names in our SQL DDL file differ from the data frame column names because of database syntax requirements.
        
<br><br>

In [46]:
# CREATE TABLE statements for the five tables of the relational schema.
# The string is written verbatim to sql_statements.sql, and the generated
# INSERT statements are appended to that file afterwards.
# Tables are declared before the tables that reference them through foreign keys.
# NOTE(review): Natural_Disaster references Climate_Metrics on (country, year),
# but the cleaned climate frame had 52 (country, year) rows in the recorded run
# versus 63 in the socio-economic frame. Disaster rows for a pair that is absent
# from Climate_Metrics would presumably be rejected by that foreign key — confirm
# against the database load.
sql_statement = """
CREATE TABLE Climate_Metrics( 
    Country_Name CHAR(20) , 
    Year_Recorded INT , 
    Max_Temperature FLOAT, 
    Avg_Temperature FLOAT, 
    Max_CO2_Emissions FLOAT, 
    Avg_CO2_Emissions FLOAT, 
    Sea_Level_Rise FLOAT, 
    Humidity FLOAT, 
    Wind_Speed FLOAT, 
    Precipitation FLOAT,
    PRIMARY KEY (Country_Name, Year_Recorded)
);


CREATE TABLE Socio_Economic_Indicators(
    Socio_Country CHAR(20) , 
    Socio_Year INT , 
    AHDI DECIMAL(10, 7), 
    GDP_Per_Capita FLOAT, 
    Population FLOAT,
    PRIMARY KEY (Socio_Country, Socio_Year)
);


CREATE TABLE Natural_Disaster( 
    Disaster_No VARCHAR(50),
    Disaster_Country CHAR(20) NOT NULL , 
    Disaster_Year INT NOT NULL,
    Disaster_Month CHAR(20) , 
    Disaster_Subgroup CHAR(20) , 
    Disaster_Type VARCHAR(50) , 
    Disaster_Subtype VARCHAR(50) ,
    PRIMARY KEY (Disaster_No) ,
    FOREIGN KEY (Disaster_Country, Disaster_Year) REFERENCES Climate_Metrics(Country_Name, Year_Recorded)
);


CREATE TABLE Damages( 
    Disaster_No VARCHAR(50) ,
    Disaster_Country CHAR(20) , 
    Disaster_Year INT,
    Total_Affected FLOAT, 
    Total_Deaths FLOAT, 
    No_Injured FLOAT, 
    No_Affected FLOAT, 
    No_Homeless FLOAT,
    PRIMARY KEY (Disaster_No) ,
    FOREIGN KEY (Disaster_Country, Disaster_Year) REFERENCES Socio_Economic_Indicators(Socio_Country, Socio_Year),
    FOREIGN KEY (Disaster_No) REFERENCES Natural_Disaster(Disaster_No)
        ON DELETE CASCADE
);


CREATE TABLE Economic_Damages(
    Disaster_No VARCHAR(50),
    Total_Damages FLOAT,
    Insured_Damages FLOAT,
    CPI FLOAT, 
    PRIMARY KEY (Disaster_No, Total_Damages) ,
    FOREIGN KEY (Disaster_No) REFERENCES Damages(Disaster_No)
        ON DELETE CASCADE
);

"""

### Function to write to file

In [47]:
# Start the SQL script with the CREATE TABLE statements
# ("w" mode replaces the contents of any existing file of the same name)
with open("sql_statements.sql", mode="w") as sql_file:
    sql_file.write(sql_statement)

In [48]:
# Function to generate INSERT statements

def generate_insert_statements_from_df(df, table_name, df_columns, table_columns):
    """Build one SQL INSERT statement per row of a DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the rows to export.
    table_name : str
        Name of the target table; placed into the statement verbatim.
    df_columns : list of str
        Columns of ``df`` to export, in the order their values should appear.
    table_columns : list of str
        Target column names, matched to ``df_columns`` by position.

    Returns
    -------
    list of str
        One ``INSERT INTO ... VALUES (...);`` statement per row, in row order.
        Missing values become ``NULL``, strings are single-quoted with embedded
        single quotes doubled, and every other value is rendered with ``str``.

    Raises
    ------
    ValueError
        If ``df_columns`` and ``table_columns`` differ in length.
    KeyError
        If a name in ``df_columns`` is not a column of ``df``.
    """
    if len(df_columns) != len(table_columns):
        raise ValueError("The length of df columns and table columns does not match.")

    table_columns_sql = ', '.join(table_columns)

    # Iterate over plain tuples of only the requested columns. `iterrows` packs
    # each row into a Series with a single dtype, so in an all-numeric frame an
    # integer such as 2000 would be emitted as 2000.0; `itertuples` keeps each
    # column's own type.
    selected = df[list(df_columns)]

    insert_statements = []
    for row in selected.itertuples(index=False, name=None):
        values = []
        for value in row:
            if pd.isna(value):
                # Missing data is written as SQL NULL
                values.append("NULL")
            elif isinstance(value, str):
                # Escape single quotes for SQL by doubling them
                escaped_value = value.replace("'", "''")
                values.append(f"'{escaped_value}'")
            else:
                # Non-string, non-missing values are written as-is
                values.append(str(value))
        values_sql = ', '.join(values)
        insert_statements.append(
            f"INSERT INTO {table_name} ({table_columns_sql}) VALUES ({values_sql});"
        )

    return insert_statements


# Function to add INSERT statements to sql file

def add_insert_statements(insert_statements, file_name='sql_statements.sql'):
    """Append each statement, followed by a newline, to the end of ``file_name``."""
    with open(file_name, 'a') as sql_file:
        sql_file.writelines(statement + '\n' for statement in insert_statements)

### Calling the functions for each table:

In [49]:
# Climate_Metrics table: DataFrame column -> SQL column, in table column order
climate_column_map = {
    'Country': 'Country_Name',
    'Year': 'Year_Recorded',
    'Max Temperature': 'Max_Temperature',
    'Avg Temperature': 'Avg_Temperature',
    'Max CO2 Emissions': 'Max_CO2_Emissions',
    'Avg CO2 Emissions': 'Avg_CO2_Emissions',
    'Sea Level Rise': 'Sea_Level_Rise',
    'Humidity': 'Humidity',
    'Wind Speed': 'Wind_Speed',
    'Precipitation': 'Precipitation',
}
climate_df_cols = list(climate_column_map.keys())
climate_table_cols = list(climate_column_map.values())

# Generate the INSERT statements and append them to the SQL file
insert_stats = generate_insert_statements_from_df(climate, 'Climate_Metrics', climate_df_cols, climate_table_cols)
add_insert_statements(insert_stats)

In [50]:
# Socio_Economic_Indicators table: DataFrame column -> SQL column
indicator_column_map = {
    'Country': 'Socio_Country',
    'Year': 'Socio_Year',
    'AHDI': 'AHDI',
    'GDP Per Capita': 'GDP_Per_Capita',
    'Population': 'Population',
}
indicator_df_cols = list(indicator_column_map.keys())
indicator_table_cols = list(indicator_column_map.values())

# Generate the INSERT statements and append them to the SQL file
insert_stats = generate_insert_statements_from_df(hdi_population, 'Socio_Economic_Indicators', indicator_df_cols, indicator_table_cols)
add_insert_statements(insert_stats)

In [51]:
# Natural_Disaster table: DataFrame column -> SQL column
disasters_column_map = {
    'Dis No': 'Disaster_No',
    'Country': 'Disaster_Country',
    'Year': 'Disaster_Year',
    'Month': 'Disaster_Month',
    'Disaster Subgroup': 'Disaster_Subgroup',
    'Disaster Type': 'Disaster_Type',
    'Disaster Subtype': 'Disaster_Subtype',
}
disasters_df_cols = list(disasters_column_map.keys())
disasters_table_cols = list(disasters_column_map.values())

# Generate the INSERT statements and append them to the SQL file
insert_stats = generate_insert_statements_from_df(disasters, 'Natural_Disaster', disasters_df_cols, disasters_table_cols)
add_insert_statements(insert_stats)

In [52]:
# Damages table: DataFrame column -> SQL column (rows come from the disasters frame)
damages_column_map = {
    'Dis No': 'Disaster_No',
    'Country': 'Disaster_Country',
    'Year': 'Disaster_Year',
    'Total Affected': 'Total_Affected',
    'Total Deaths': 'Total_Deaths',
    '# Injured': 'No_Injured',
    '# Affected': 'No_Affected',
    '# Homeless': 'No_Homeless',
}
damages_df_cols = list(damages_column_map.keys())
damages_table_cols = list(damages_column_map.values())

# Generate the INSERT statements and append them to the SQL file
insert_stats = generate_insert_statements_from_df(disasters, 'Damages', damages_df_cols, damages_table_cols)
add_insert_statements(insert_stats)

In [53]:
# Economic_Damages table: DataFrame column -> SQL column
# (rows come from disasters_subset, which has no missing values in these columns)
economic_damages_column_map = {
    'Dis No': 'Disaster_No',
    'Total Damages': 'Total_Damages',
    'Insured Damages': 'Insured_Damages',
    'CPI': 'CPI',
}
economic_damages_df_cols = list(economic_damages_column_map.keys())
economic_damages_table_cols = list(economic_damages_column_map.values())

# Generate the INSERT statements and append them to the SQL file
insert_stats = generate_insert_statements_from_df(disasters_subset, 'Economic_Damages', economic_damages_df_cols, economic_damages_table_cols)
add_insert_statements(insert_stats)

<br><br>

### CITATIONS:


Pandas. (2024). IO tools (text, CSV, HDF5, …). Retrieved March 9, 2024, from https://pandas.pydata.org/docs/user_guide/io.html

SQLServerCentral. (2023, May 5). Python 3 Script for Generating SQL INSERT Statements from CSV Data. SQLServerCentral. Retrieved March 8, 2024, from https://www.sqlservercentral.com/scripts/python-3-script-for-generating-sql-insert-statements-from-csv-data

<br><br>