In [1]:
from pyspark.sql.session import SparkSession
import config as c
import pyspark.sql.functions as fn
from pyspark.sql import window

In [2]:
# Obtain (or reuse) a SparkSession bound to the local[*] master.
spark = (
    SparkSession.builder
    .master("local[*]")
    .getOrCreate()
)

In [3]:
# Load GlobalLandTemperaturesByCountry.csv from under c.download.
# The first row is used as the header and Spark infers the column types.
temperatureDF = (
    spark.read
    .option("inferSchema", "true")
    .option("header", "true")
    .csv(c.download + "/GlobalLandTemperaturesByCountry.csv")
)

In [4]:
# Load annual-co2-emissions-per-country.csv from under c.download.
# The first row is used as the header and Spark infers the column types.
carbonPerCountry = (
    spark.read
    .option("inferSchema", "true")
    .option("header", "true")
    .csv(c.download + "/annual-co2-emissions-per-country.csv")
)

In [5]:
# Give the emissions column a plain snake_case name, rename Entity to Country,
# and keep only the three columns selected below.
carbonPerCountry = (
    carbonPerCountry
    .withColumnRenamed("Annual CO₂ emissions (tonnes)", "annual_co2_emission_tonnes")
    .withColumnRenamed("Entity", "Country")
    .select("Country", "Year", "annual_co2_emission_tonnes")
)

In [6]:
# Load carbon-emission-intensity-of-economies.csv from under c.download.
# The first row is used as the header and Spark infers the column types.
carbonPerEconomy = (
    spark.read
    .option("inferSchema", "true")
    .option("header", "true")
    .csv(c.download + "/carbon-emission-intensity-of-economies.csv")
)

In [7]:
# Shorten the long per-GDP emissions header, rename Entity to Country,
# and keep only the three columns selected below.
carbonPerEconomy = (
    carbonPerEconomy
    .withColumnRenamed(
        "CO2 emissions (kg per 2011 PPP $ of GDP) (kg per 2011 PPP $ of GDP)",
        "annual_co2_emission_gdp",
    )
    .withColumnRenamed("Entity", "Country")
    .select("Country", "Year", "annual_co2_emission_gdp")
)

In [8]:
# Load greenhouse-gas-emissions-by-sector.csv from under c.download.
# The first row is used as the header and Spark infers the column types.
gasPerSector = (
    spark.read
    .option("inferSchema", "true")
    .option("header", "true")
    .csv(c.download + "/greenhouse-gas-emissions-by-sector.csv")
)

In [9]:
# Rename Entity to Country and map every "<sector> (tonnes)" header to a
# source_* column name, then discard the code column.
gasPerSector = (
    gasPerSector
    .withColumnRenamed("Entity", "Country")
    .withColumnRenamed("Other sources (tonnes)", "source_others")
    .withColumnRenamed("International bunkers (tonnes)", "source_bunkers")
    .withColumnRenamed("Waste (tonnes)", "source_waste")
    .withColumnRenamed("Industry (tonnes)", "source_industry")
    .withColumnRenamed("Residential & commercial (tonnes)", "source_res_com")
    .withColumnRenamed("Transport (tonnes)", "source_transport")
    .withColumnRenamed("Agriculture (tonnes)", "source_agriculture")
    .withColumnRenamed("Forestry (tonnes)", "source_forestry")
    .withColumnRenamed("Land use sources (tonnes)", "source_land")
    .withColumnRenamed("Energy (tonnes)", "source_energy")
    .drop("code")
)

In [10]:
# Load co-emissions-per-capita.csv from under c.download.
# The first row is used as the header and Spark infers the column types.
carbonPerCapita = (
    spark.read
    .option("inferSchema", "true")
    .option("header", "true")
    .csv(c.download + "/co-emissions-per-capita.csv")
)

In [11]:
# Rename Entity to Country, give the per-capita column a snake_case name,
# then discard the code column.
carbonPerCapita = (
    carbonPerCapita
    .withColumnRenamed("Entity", "Country")
    .withColumnRenamed(
        "Per capita CO₂ emissions (tonnes per capita)",
        "annual_co2_emission_capita",
    )
    .drop('code')
)

In [12]:
# Load annual-share-of-co2-emissions.csv from under c.download.
# The first row is used as the header and Spark infers the column types.
carbonAnnualShare = (
    spark.read
    .option("inferSchema", "true")
    .option("header", "true")
    .csv(c.download + "/annual-share-of-co2-emissions.csv")
)

In [13]:
# Rename Entity to Country, give the share column a snake_case name,
# then discard the code column.
carbonAnnualShare = (
    carbonAnnualShare
    .withColumnRenamed("Entity", "Country")
    .withColumnRenamed("Share of global CO₂ emissions (%)", "annual_co2_share")
    .drop('code')
)

In [14]:
# Write each DataFrame to its own folder under c.staging as comma-separated
# CSV with a header row; "overwrite" mode replaces any existing output there.
temperatureDF.write.csv(
    c.staging + "/temperature", mode="overwrite", header=True, sep=","
)
carbonPerCountry.write.csv(
    c.staging + "/carbonPerCountry", mode="overwrite", header=True, sep=","
)
carbonPerEconomy.write.csv(
    c.staging + "/carbonPerEconomy", mode="overwrite", header=True, sep=","
)
carbonPerCapita.write.csv(
    c.staging + "/carbonPerCapita", mode="overwrite", header=True, sep=","
)
carbonAnnualShare.write.csv(
    c.staging + "/carbonAnnualShare", mode="overwrite", header=True, sep=","
)
gasPerSector.write.csv(
    c.staging + "/gasPerSector", mode="overwrite", header=True, sep=","
)