# Data description - UN voting
This jupyter notebook describes the data containing historical records of voting behavior within the United Nations. The dataset was gathered from the TidyTuesday github page and consists of three different CSV files that can easily be linked together.

#### Loading the packages

In [1]:
import pandas as pd
import datetime
import cdsapi
import netCDF4 as nc
import os
from tqdm import tqdm

##### Loading the data

In [2]:
votes = pd.read_csv('Data/votes.csv')
roll_calls = pd.read_csv('Data/roll_calls.csv')
roll_call_issues = pd.read_csv('Data/roll_call_issues.csv')

## Votes
The votes dataset contains the overview of the votes organized per resolution and country. It contains a total of 869 937 observations. For every observation, the reference ID for the resolution is given combined with the country, country_code and the vote.

In [3]:
votes.sample(5, random_state=12345)

Unnamed: 0,rcid,country,country_code,vote
449842,3664,Burkina Faso,BF,yes
281290,2526,Canada,CA,no
362394,3079,Uganda,UG,yes
180463,1807,Myanmar (Burma),MM,abstain
804741,5748,Madagascar,MG,yes


## Roll calls
The roll calls dataset contains information about the resolutions that are proposed and voted on. It contains 6 202 resolutions with dates rangin from January 1946 until December 2019. There is also an indication of whether the vote is an important one, and the dataset contains a short and long description of the resolution.

In [4]:
roll_calls.sample(2, random_state=13245)

Unnamed: 0,rcid,session,importantvote,date,unres,amend,para,short,descr
3398,3403,44,0.0,1989-11-04,R/44/27L,,,"APARTHEID, SPORTS",Support for the work of the Commission against...
1462,1465,28,0.0,1973-12-03,R/28/3110,0.0,0.0,"NON SELF GOVERNING TERRITORIES, INFORMATION","RES, I.A., DEPLORING FAILURE OF SOME MEMBER ST..."


In [5]:
def climateRelated(text: str) -> bool:
    text = text.lower()
    return ('climate change' in text) | ('global warming' in text) | ('renewable energy' in text)

roll_calls.loc[:, 'climate'] = roll_calls.descr.apply(
    lambda x: climateRelated(str(x)))

roll_calls[roll_calls.climate]

Unnamed: 0,rcid,session,importantvote,date,unres,amend,para,short,descr,climate
5479,5577,72,0.0,2017-12-20,A/RES/72/224,,,"Ensuring access to affordable, reliable, susta...","A/72/251 19i - Ensuring access to affordable, ...",True
5714,5812,62,,2007-12-10,R/62/86,,1.0,,The President: The Assembly has before it a dr...,True


## Roll call issues
The roll call issues dataset contains again the resolutions that are voted on but this time it describes the issue in just a couple of words. It only contains 5 745 rows which means that not every resolution will have a match with this dataset. Also, the issues listed have little to do with climate change.

In [6]:
roll_call_issues.sample(5, random_state=12345)

Unnamed: 0,rcid,short_name,issue
5626,5236,ec,Economic development
3101,1274,hr,Human rights
2325,3144,di,Arms control and disarmament
4541,2790,co,Colonialism
2190,2558,di,Arms control and disarmament


In [7]:
roll_call_issues.issue.value_counts()

Arms control and disarmament            1092
Palestinian conflict                    1061
Human rights                            1015
Colonialism                              957
Nuclear weapons and nuclear material     855
Economic development                     765
Name: issue, dtype: int64

## Copernicus data - Wildfires
The copernicus data is a comprehensive data source that contains data obtained through satelites. In this case, the dataset contains information about fire burned areas from 2001 until 2019. 

In [8]:
def constructDataFrame(url: str) -> pd.DataFrame:
    # Load the dataset using the url
    ds = nc.Dataset(url)
    # Get the date of the observations in the dataset
    date = datetime.datetime(1970, 1, 1) + datetime.timedelta(days=ds['time'][:][0])
    
    # Get the values for burned area
    burned_area = ds['burned_area'][:][0].flatten()
    
    # Get the standard error
    standard_error = ds['standard_error'][:][0].flatten()
    
    # Get the fraction
    fraction = ds['fraction_of_observed_area'][:][0].flatten()
    
    # Get longitudes
    lon = []
    for i in list(ds['lon'][:]):
        lon = lon + [i]*len(ds['lat'][:])
    
    # Get latitudes
    lat = list(list(ds['lat'][:]))*len(ds['lon'][:])
    
    
    # Convert information to a pandas dataframe
    df = pd.DataFrame(lon)
    df.columns = ['longitude']
    df.loc[:, 'latitude'] = lat
    df.loc[:, 'burned_area'] = burned_area
    df.loc[:, 'fraction'] = fraction
    df.loc[:, 'standard_error'] = standard_error
    df.loc[:, 'Date'] = date.strftime("%Y-%m-%d")
    df = df[df.burned_area != 0]
    return df

dataframes = []
files = []
for root, dirs, filelist in os.walk("Data/copernicus"):
    for filename in filelist:
        files.append('Data/copernicus/' + filename)

for file in tqdm(files):
    dataframes.append(constructDataFrame(file))

df = pd.concat(dataframes)
df[df.burned_area > 0].sample(5, random_state=12345)

100%|██████████| 228/228 [14:20<00:00,  3.77s/it]


Unnamed: 0,longitude,latitude,burned_area,fraction,standard_error,Date
471585,-16.375,-86.375,6654418.0,0.999439,1550628.0,2018-12-01
405677,-39.125,10.625,44488008.0,0.996123,920177.0,2002-03-01
630138,38.875,55.375,38316572.0,0.996786,1596904.0,2011-07-31
411524,-37.125,-11.125,1180622.0,0.985928,744979.0,2002-03-01
631625,39.375,43.625,5420131.0,1.0,1246401.0,2005-04-30


In [9]:
df.to_parquet('Data/copernicus.parquet.gzip', compression='gzip', index=False)

## Copernicus data - Heat waves
This dataset contains yearly heat wave information for the European Continent between 1986 and 2019.

In [10]:
ds = nc.Dataset('Data/HWD_national_rcp45_mean_v1.0.nc')

In [11]:
def constructDataset(ds):
    years = []
    for i in tqdm(range(34)):
        df = pd.DataFrame(ds['HWD_merged'][i].flatten())
        df.columns = ['heat_wave_days']
        df.loc[:, 'year'] = 1986 + i
        lon = []
        for i in list(ds['lon'][:]):
            lon = lon + [i]*len(ds['lat'][:])
        df.loc[:, 'longitude'] = lon
        lat = list(list(ds['lat'][:]))*len(ds['lon'][:])
        df.loc[:, 'latitude'] = lat
        df = df[['longitude', 'latitude', 'heat_wave_days', 'year']]
        df = df.fillna(0)
        years.append(df)
    return pd.concat(years)

df = constructDataset(ds)     
df[df.heat_wave_days > 0].sample(5, random_state=12345)

100%|██████████| 34/34 [00:18<00:00,  1.84it/s]


Unnamed: 0,longitude,latitude,heat_wave_days,year
173626,15.9,52.7,0.356855,2002
146995,9.6,67.1,0.169788,1998
222164,27.3,61.5,1.631315,1986
197486,21.5,58.7,0.027662,2019
156832,12.0,30.8,2.440187,1991


In [12]:
df.to_parquet('Data/heat_waves_eu.parquet.gzip', compression='gzip', index=False)

## Copernicus - Temperature and humidity

In [13]:
def constructData():
    ds_hum = nc.Dataset('Data/temperature/hurs_Amon_IPSL-CM5A-MR_historical_r3i1p1_185001-200512.nc')
    ds_prec = nc.Dataset('Data/temperature/pr_Amon_IPSL-CM5A-MR_historical_r3i1p1_185001-200512.nc')
    ds_wind = nc.Dataset('Data/temperature/sfcWind_Amon_IPSL-CM5A-MR_historical_r3i1p1_185001-200512.nc')
    ds_tas = nc.Dataset('Data/temperature/tas_Amon_IPSL-CM5A-MR_historical_r3i1p1_185001-200512.nc')
    ds_tasmax = nc.Dataset('Data/temperature/tasmax_Amon_IPSL-CM5A-MR_historical_r3i1p1_185001-200512.nc')
    ds_tasmin = nc.Dataset('Data/temperature/tasmin_Amon_IPSL-CM5A-MR_historical_r3i1p1_185001-200512.nc')
    dataframes=[]
    times = len(ds_hum['time'][:])
    for t in tqdm(range(times)):
        date = datetime.datetime(1850, 1, 1) + datetime.timedelta(days=ds_hum['time'][:][t])
        humidity = ds_hum['hurs'][t].flatten()
        precipation = ds_prec['pr'][t].flatten()
        wind_speed = ds_wind['sfcWind'][t].flatten()
        temperature = ds_tas['tas'][t].flatten()
        temperature_max = ds_tasmax['tasmax'][t].flatten()
        temperature_min = ds_tasmin['tasmin'][t].flatten()
        
        
        df = pd.DataFrame(humidity)
        df.columns = ['humidity']
        df.loc[:, 'precipation'] = precipation
        df.loc[:, 'wind_speed'] = wind_speed
        df.loc[:, 'temperature'] = temperature
        df.loc[:, 'temperature_max'] = temperature_max
        df.loc[:, 'temperature_min'] = temperature_min
        
        lon = []
        for i in list(ds_hum['lon'][:]):
            lon = lon + [i]*len(ds_hum['lat'][:])
        df.loc[:, 'longitude'] = lon
        
        lat = list(list(ds_hum['lat'][:]))*len(ds_hum['lon'][:])
        df.loc[:, 'latitude'] = lat
        
        df.loc[:, 'date'] = date.strftime("%Y-%m-%d")
        dataframes.append(df)
    return pd.concat(dataframes)

df = constructData()
df.sample(5, random_state=12345)

100%|██████████| 1872/1872 [01:36<00:00, 19.39it/s]


Unnamed: 0,humidity,precipation,wind_speed,temperature,temperature_max,temperature_min,longitude,latitude,date
9799,76.219597,6.1e-05,1.13698,299.336853,304.787354,294.883698,170.0,5.070423,1931-02-25
12875,84.152618,4.6e-05,5.565664,299.140778,300.115601,296.061646,225.0,-83.661972,2002-11-09
6793,85.242203,1.6e-05,5.357446,291.699127,294.537781,287.773468,117.5,1.267606,1999-11-10
714,93.10083,5e-06,8.193845,230.139465,241.230072,222.180267,10.0,90.0,1940-04-24
4840,85.73465,4.1e-05,9.172496,280.131653,282.376434,277.944519,82.5,63.380283,1981-07-15


In [14]:
df.to_parquet('Data/temperature_etc.parquet.gzip', compression='gzip', index=False)