In [None]:
# Import all the necessary libraries
import pandas as pd
from sqlalchemy import create_engine, inspect

from config import username, password

# EXTRACT

In [None]:
# EXTRACT step, part 1: read the World Happiness CSV into a DataFrame
## Path is relative to the notebook's working directory
happiness_file = 'Resources/world_happiness_2019.csv'
happiness_df = pd.read_csv(filepath_or_buffer=happiness_file)

# Preview the first three rows to sanity-check the load
happiness_df.head(n=3)

In [None]:
# EXTRACT step, part 2: read the "countries of the world" CSV into a DataFrame
countries_file = "Resources/countries of the world.csv"
countries_df = pd.read_csv(filepath_or_buffer=countries_file)

# Preview the first three rows to sanity-check the load
countries_df.head(n=3)

# TRANSFORM

World Happiness

In [None]:
## Happiness df ##
# Source column -> output column name. The keys double as the list of
# columns kept from the raw frame, in this order.
happiness_rename_map = {
    'Country or region': 'country',
    'Score': 'overall_score',
    'GDP per capita': 'gdp_score',
    'Social support': 'social_support',
    'Healthy life expectancy': 'healthy_life_expectancy',
    'Freedom to make life choices': 'freedom_choices',
    'Generosity': 'generosity',
    'Perceptions of corruption': 'corruption_perception',
}
happiness_df_col = list(happiness_rename_map)

# Build the cleaned frame in one pass without touching happiness_df:
#   1. keep only the mapped columns
#   2. rename them
#   3. drop rows with any missing value
#   4. keep the first row for each country
#   5. index by country
happiness_transformed_df = (
    happiness_df[happiness_df_col]
    .rename(columns=happiness_rename_map)
    .dropna(how='any')
    .drop_duplicates('country')
    .set_index('country')
)
happiness_transformed_df

World Countries

In [None]:
## Countries df ##
# Source column -> output column name. The keys double as the list of
# columns kept from the raw frame, in this order.
countries_rename_map = {
    'Country': 'country',
    'Region': 'region',
    'Pop. Density (per sq. mi.)': 'pop_density',
    'Infant mortality (per 1000 births)': 'infant_mortality',
    'GDP ($ per capita)': 'gdp',
    'Literacy (%)': 'literacy',
}
countries_df_col = list(countries_rename_map)

# Build the cleaned frame in one pass without touching countries_df:
#   1. keep only the mapped columns and rename them
#   2. swap ',' for '.' in the three string-typed numeric columns and cast to float
#   3. strip surrounding whitespace from the country names
#   4. drop rows with any missing value, keep the first row per country, index by country
countries_transformed_df = (
    countries_df[countries_df_col]
    .rename(columns=countries_rename_map)
    .assign(
        pop_density=lambda frame: frame['pop_density'].str.replace(',', '.').astype(float),
        infant_mortality=lambda frame: frame['infant_mortality'].str.replace(',', '.').astype(float),
        literacy=lambda frame: frame['literacy'].str.replace(',', '.').astype(float),
        country=lambda frame: frame['country'].str.strip(),
    )
    .dropna(how='any')
    .drop_duplicates('country')
    .set_index('country')
)
countries_transformed_df

In [None]:
# Create database connection
# Build the PostgreSQL URL from the credentials imported from the local
# `config` module at the top of the notebook, so that no user name or
# password is hardcoded in (and shared with) the notebook itself.
# NOTE(review): if the password can contain URL-reserved characters such as
# '@' or '/', it must be percent-encoded before being placed in the URL.
connection_string = f'{username}:{password}@localhost:5432/countries_happiness_db'
engine = create_engine(f'postgresql://{connection_string}')

In [None]:
# Confirm tables
# Engine.table_names() was deprecated in SQLAlchemy 1.4 and removed in 2.0;
# the Inspector API returns the same list of table names and also works on
# older releases.
inspect(engine).get_table_names()

# LOAD

Following the creation of the tables, we loaded the two dataframes into their own tables (`happiness` and `country`) in the database. 
We then merged the individual dataframes in order to get the final dataframe. 
We chose to combine these two datasets because we feel that the values in both sets could have an impact on the overall happiness score for each country. Further analysis can now be done to see which factors influence the happiness of a country. 

In [None]:
# Append the cleaned happiness rows to the 'happiness' table.
# index=True writes the DataFrame index out as a column as well.
happiness_transformed_df.to_sql(
    name='happiness',
    con=engine,
    index=True,
    if_exists='append',
)

In [None]:
# Append the cleaned country rows to the 'country' table.
# index=True writes the DataFrame index out as a column as well.
countries_transformed_df.to_sql(
    name='country',
    con=engine,
    index=True,
    if_exists='append',
)

### Checking to make sure the two tables merge

In [None]:
# Sanity check: join the two cleaned frames on 'country'.
# No `how` is given, so pandas uses its default inner join and only
# countries present in both frames appear in the result.
merge_df = happiness_transformed_df.merge(countries_transformed_df, on='country')
merge_df