In [324]:
from urllib.parse import quote

import numpy as np
import pandas as pd
import psycopg2
from sqlalchemy import create_engine

from config import db_password

# Render floats with two decimal places in displayed frames.
pd.set_option("display.float_format", "{:.2f}".format)

In [325]:
#Add the function that takes in climate change argument for ETL;
def extract_transform_load(climate_change):
    """Read the climate change workbook and return a cleaned DataFrame.

    The workbook is read with ``pd.read_excel``; the ``SCALE``, ``Decimals``
    and ``2011`` columns are dropped; only rows whose ``Series code`` is one
    of the indicators listed in ``series_codes`` are kept; and every remaining
    non-descriptor column is converted to ``float``, with the ``".."``
    placeholder turned into ``NaN``.

    Parameters
    ----------
    climate_change : str, path-like or file-like
        Location of the Excel workbook; passed unchanged to ``pd.read_excel``.

    Returns
    -------
    pandas.DataFrame
        The four descriptor columns (``Country code``, ``Country name``,
        ``Series code``, ``Series name``) followed by the value columns as
        ``float64``. The row index of the sheet is preserved.

    Raises
    ------
    KeyError
        If ``SCALE``, ``Decimals``, ``2011`` or ``Series code`` is missing
        from the sheet.
    ValueError
        If a value column contains text other than ``".."`` that cannot be
        parsed as a number.
    """
    # Columns that describe a row; everything else is treated as a value column.
    descriptor_columns = ["Country code", "Country name", "Series code", "Series name"]

    # Indicators (Series codes) retained for the analysis.
    series_codes = [
        "SP.POP.TOTL",
        "SP.POP.GROW",
        "NY.GDP.MKTP.CD",
        "NY.GNP.PCAP.CD",
        "EN.CLC.MMDT.C",
        "EN.CLC.HPPT.MM",
        "SP.URB.TOTL",
        "SP.URB.GROW",
        "AG.LND.IRIG.AG.ZS",
        "EN.ATM.CO2E.PC",
        "EN.ATM.CO2E.PP.GD.KD",
        "EN.ATM.CO2E.KT",
        "EN.ATM.METH.KT.CE",
        "EN.ATM.NOXE.KT.CE",
        "EN.ATM.GHGO.KT.CE",
        "EG.USE.PCAP.KG.OE",
        "EG.USE.COMM.GD.PP.KD",
        "EN.CLC.AERT",
        "EN.CLC.RNET",
    ]

    # Read the Excel sheet
    raw_df = pd.read_excel(climate_change)

    # Drop unnecessary columns (SCALE and Decimals have no use in our analysis
    # and 2011 has bad data). A new frame is returned; nothing is mutated in place.
    trimmed_df = raw_df.drop(columns=["SCALE", "Decimals", 2011])

    # Keep only the rows for the selected indicators
    selected_df = trimmed_df[trimmed_df["Series code"].isin(series_codes)].copy()

    # Pick the value columns by name rather than by position, so a sheet with
    # a different number of year columns is still converted in full.
    value_columns = [
        column for column in selected_df.columns if column not in descriptor_columns
    ]

    if value_columns:
        # ".." is the sheet's marker for a missing value. Only that marker is
        # mapped to NaN, so a zero stored as text is kept as 0.0 after the cast.
        selected_df[value_columns] = (
            selected_df[value_columns].replace("..", np.nan).astype(float)
        )

    # Return the cleaned dataframe
    return selected_df


In [326]:
# Relative path to the source workbook
climate_change = "../Resources/climate_change_download_0.xls"
climate_change_ETL = extract_transform_load(climate_change)

# Keep only rows that have at least 20 non-null cells, counted across all
# columns (descriptor columns included); sparser rows are dropped.
# NOTE(review): this is stricter than "drop rows where every year is empty" --
# confirm which rule is intended.
ETL_df = climate_change_ETL.dropna(thresh=20)
ETL_df

Unnamed: 0,Country code,Country name,Series code,Series name,1990,1991,1992,1993,1994,1995,...,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010
1168,AGO,Angola,EG.USE.COMM.GD.PP.KD,"Energy use per units of GDP (kg oil eq./$1,000...",172.98,179.27,193.63,266.32,260.83,237.62,...,206.28,188.82,195.88,191.54,151.65,138.93,121.61,114.49,116.92,
1169,ALB,Albania,EG.USE.COMM.GD.PP.KD,"Energy use per units of GDP (kg oil eq./$1,000...",206.75,205.47,160.75,143.72,141.20,117.05,...,113.18,122.14,120.07,110.05,118.57,106.51,99.68,89.98,72.34,
1170,ARE,United Arab Emirates,EG.USE.COMM.GD.PP.KD,"Energy use per units of GDP (kg oil eq./$1,000...",214.32,245.98,227.82,241.74,249.83,245.91,...,254.04,268.41,237.47,228.61,215.80,208.92,223.71,240.51,247.13,
1171,ARG,Argentina,EG.USE.COMM.GD.PP.KD,"Energy use per units of GDP (kg oil eq./$1,000...",189.25,173.12,162.69,153.55,155.26,161.30,...,159.33,171.24,167.73,175.35,159.82,160.57,148.85,144.83,139.64,
1172,ARM,Armenia,EG.USE.COMM.GD.PP.KD,"Energy use per units of GDP (kg oil eq./$1,000...",739.54,882.76,769.74,446.77,265.61,296.93,...,258.32,213.35,197.44,188.92,199.45,179.09,175.94,173.33,175.50,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13507,YEM,"Yemen, Rep.",SP.URB.TOTL,Urban population,2497175.68,2693642.26,2909756.36,3139637.32,3373929.85,3605264.94,...,4898943.35,5148619.31,5410331.21,5683412.00,5967457.83,6275723.04,6597265.19,6932788.71,7283068.41,7648699.45
13508,ZAF,South Africa,SP.URB.TOTL,Urban population,18304000.00,18864881.58,19446091.55,20048481.07,20672940.24,21320400.00,...,25769207.66,26345562.75,26904362.60,27448218.30,27988692.12,28533559.52,29079838.13,29636881.56,30193795.83,30844632.10
13509,ZAR,"Congo, Dem. Rep.",SP.URB.TOTL,Urban population,10120930.83,10569454.39,11060046.85,11568627.56,12061489.85,12515132.80,...,15429399.70,16125336.27,16867832.79,17640845.61,18431987.56,19333729.39,20261443.14,21216476.38,22201848.32,23219959.84
13510,ZMB,Zambia,SP.URB.TOTL,Urban population,3096860.88,3141668.30,3183257.23,3223514.67,3264940.18,3309118.18,...,3640719.03,3729882.68,3819640.74,3912870.73,4011827.75,4128986.90,4253139.48,4384858.57,4524564.08,4614728.01


In [327]:

# Inspect the column dtypes of the cleaned frame.
ETL_df.dtypes


Country code     object
Country name     object
Series code      object
Series name      object
1990            float64
1991            float64
1992            float64
1993            float64
1994            float64
1995            float64
1996            float64
1997            float64
1998            float64
1999            float64
2000            float64
2001            float64
2002            float64
2003            float64
2004            float64
2005            float64
2006            float64
2007            float64
2008            float64
2009            float64
2010            float64
dtype: object

In [329]:
# Build the connection URL for the PostgreSQL database. The password is
# percent-encoded so that characters such as "@", ":" or "/" in it cannot be
# mistaken for URL delimiters.
db_string = (
    f"postgresql://postgres:{quote(db_password, safe='')}"
    "@127.0.0.1:5432/climate_change"
)

# Create the database engine
engine = create_engine(db_string)

# Load the DataFrame into SQL, replacing the table if it already exists
ETL_df.to_sql(name='climateChangeData', con=engine, index=False, if_exists='replace')

# Save the cleaned data to CSV
ETL_df.to_csv('../Resources/climateChangeData.csv', index=False)