In [161]:
import numpy as np
import pandas as pd
import csv
import random

In [162]:
#reading in data from different sources
sea_ice = pd.read_csv('data/seaice_to_process.csv')
global_temp = pd.read_csv('data/GlobalTemperatures_to_process.csv')
co2 = pd.read_csv('data/daily_in_situ_co2_mlo_to_process.csv')
natural_dis = pd.read_csv('data/natural_disasters_to_process.csv')

In [163]:
#pre-processing sea_ice data
sea_ice = sea_ice.drop(columns=['hemisphere', '    Missing', ' Source Data'], axis=1)
sea_ice = sea_ice.dropna()
sea_ice.head()

Unnamed: 0,Year,Month,Day,Extent
0,1978,10,26,10.231
1,1978,10,28,10.42
2,1978,10,30,10.557
3,1978,11,1,10.67
4,1978,11,3,10.777


In [164]:
#pre-processing average monthly temperature data
global_temp = global_temp.drop(columns=['LandAverageTemperatureUncertainty'], axis=1)
global_temp = global_temp.dropna()
global_temp.head()

Unnamed: 0,dt,LandAverageTemperature
0,11/1/77,6.544
1,12/1/77,3.749
2,1/1/78,2.705
3,2/1/78,3.456
4,3/1/78,5.607


In [165]:
#pre-processing CO2 data
co2 = co2.drop(columns=[' NB', ' scale'], axis=1)
co2 = co2.rename(columns={"% Yr": "year", " Mn": "month", ' Dy': "day", '    CO2': "co2"})
co2 = co2.dropna()
co2.head()

Unnamed: 0,year,month,day,co2
0,1958,1,1,
1,1958,1,2,
2,1958,1,3,
3,1958,1,4,
4,1958,1,5,


In [166]:
#pre-processing natural disaster data
natural_dis = natural_dis.fillna(-1)
natural_dis.head()

Unnamed: 0,Disaster Type,Start Year,Start Month,Start Day,End Year,End Month,End Day
0,Earthquake,1977,8.0,19.0,1977,8.0,19.0
1,Drought,1977,5.0,-1.0,1977,-1.0,-1.0
2,Drought,1977,5.0,-1.0,1977,-1.0,-1.0
3,Earthquake,1977,11.0,23.0,1977,11.0,23.0
4,Flood,1977,1.0,-1.0,1977,1.0,-1.0


In [167]:
#creating a date (year and month) to average monthly temperature dictionary
date_to_temp = {}

for i, r in global_temp.iterrows():
    date_split = r['dt'].split('/')
    date_to_temp[date_split[0] + '-' + date_split[2]] = r['LandAverageTemperature']
    
print(len(date_to_temp))
date_to_temp

458


{'11-77': 6.544,
 '12-77': 3.7489999999999997,
 '1-78': 2.705,
 '2-78': 3.4560000000000004,
 '3-78': 5.607,
 '4-78': 8.791,
 '5-78': 11.414000000000001,
 '6-78': 13.22,
 '7-78': 14.364,
 '8-78': 13.297,
 '9-78': 12.03,
 '10-78': 9.339,
 '11-78': 6.35,
 '12-78': 3.74,
 '1-79': 2.679,
 '2-79': 2.841,
 '3-79': 5.474,
 '4-79': 8.455,
 '5-79': 11.199000000000002,
 '6-79': 13.487,
 '7-79': 14.114,
 '8-79': 13.833,
 '9-79': 12.247,
 '10-79': 9.586,
 '11-79': 6.2860000000000005,
 '12-79': 4.6,
 '1-80': 2.9560000000000004,
 '2-80': 3.6519999999999997,
 '3-80': 5.367000000000001,
 '4-80': 8.935,
 '5-80': 11.77,
 '6-80': 13.582,
 '7-80': 14.742,
 '8-80': 14.189,
 '9-80': 12.321,
 '10-80': 9.55,
 '11-80': 6.684,
 '12-80': 4.016,
 '1-81': 3.785,
 '2-81': 4.021,
 '3-81': 6.239,
 '4-81': 8.953,
 '5-81': 11.565,
 '6-81': 13.765,
 '7-81': 14.738,
 '8-81': 14.502,
 '9-81': 12.235999999999999,
 '10-81': 9.394,
 '11-81': 6.273,
 '12-81': 4.519,
 '1-82': 2.5580000000000003,
 '2-82': 3.281,
 '3-82': 4.873,


In [168]:
#creating a date (year, month, and day) to sea_ice extent dictionary
date_to_ice = {}

for i, r in sea_ice.iterrows():
    date = str(r[' Month'])[:-2] + '-' + str(r[' Day'])[:-2] + '-' + str(r['Year'])[:-2]
    date_to_ice[date] = r['     Extent']
    
print(len(date_to_ice))
date_to_ice

13177


{'10-26-1978': 17.624000000000002,
 '10-28-1978': 17.803,
 '10-30-1978': 17.67,
 '11-1-1978': 17.527,
 '11-3-1978': 17.486,
 '11-5-1978': 17.343,
 '11-7-1978': 17.157,
 '11-9-1978': 17.028,
 '11-11-1978': 16.815,
 '11-13-1978': 16.371,
 '11-15-1978': 16.176,
 '11-17-1978': 15.52,
 '11-19-1978': 15.267000000000001,
 '11-21-1978': 15.027000000000001,
 '11-23-1978': 14.681,
 '11-25-1978': 14.449000000000002,
 '11-27-1978': 13.98,
 '11-29-1978': 13.619000000000002,
 '12-1-1978': 13.33,
 '12-3-1978': 13.097000000000001,
 '12-5-1978': 12.829,
 '12-7-1978': 12.436,
 '12-9-1978': 11.963,
 '12-11-1978': 11.652000000000001,
 '12-13-1978': 11.148,
 '12-15-1978': 10.664000000000001,
 '12-17-1978': 10.238999999999999,
 '12-19-1978': 9.758,
 '12-21-1978': 9.362,
 '12-23-1978': 8.706,
 '12-25-1978': 8.362,
 '12-27-1978': 7.907,
 '12-29-1978': 7.587999999999999,
 '12-31-1978': 7.2829999999999995,
 '1-2-1979': 6.945,
 '1-4-1979': 6.837999999999999,
 '1-6-1979': 6.638,
 '1-8-1979': 6.27,
 '1-10-1979': 6

In [169]:
#creating a date (year, month, and day) to CO2 dictionary
date_to_co2 = {}

for i, r in co2.iterrows():
    if "NaN" not in r['co2'] and r['year'] > 1977:
        date = str(r['month']) + '-' + str(r['day']) + '-' + str(r['year'])
        date_to_co2[date] = float(r['co2'])
    
print(len(date_to_co2))
date_to_co2

12097


{'1-1-1978': 334.39,
 '1-2-1978': 335.58,
 '1-3-1978': 335.39,
 '1-4-1978': 335.22,
 '1-5-1978': 334.31,
 '1-6-1978': 334.2,
 '1-7-1978': 334.1,
 '1-8-1978': 333.65,
 '1-9-1978': 334.17,
 '1-10-1978': 334.09,
 '1-11-1978': 334.81,
 '1-12-1978': 334.56,
 '1-15-1978': 334.57,
 '1-16-1978': 334.6,
 '1-17-1978': 335.06,
 '1-18-1978': 335.98,
 '1-19-1978': 334.78,
 '1-20-1978': 334.85,
 '1-22-1978': 336.06,
 '1-23-1978': 334.48,
 '1-26-1978': 334.66,
 '1-27-1978': 336.5,
 '1-28-1978': 336.63,
 '1-29-1978': 334.49,
 '1-30-1978': 334.96,
 '1-31-1978': 335.78,
 '2-1-1978': 335.53,
 '2-2-1978': 335.44,
 '2-3-1978': 335.6,
 '2-4-1978': 335.39,
 '2-5-1978': 334.52,
 '2-6-1978': 334.35,
 '2-7-1978': 334.57,
 '2-8-1978': 334.65,
 '2-9-1978': 334.92,
 '2-11-1978': 335.33,
 '2-12-1978': 335.24,
 '2-14-1978': 335.08,
 '2-15-1978': 335.78,
 '2-16-1978': 335.81,
 '2-17-1978': 336.15,
 '2-19-1978': 334.96,
 '2-20-1978': 335.19,
 '2-21-1978': 334.97,
 '2-22-1978': 335.32,
 '2-23-1978': 336.01,
 '2-24-1978

In [170]:
#determining all possible dates that have required data
dates = []

co2_dates = date_to_co2.keys()
ice_dates = date_to_ice.keys()

for d in co2_dates:
    if d in ice_dates:
        dates.append(d)

print(len(dates))

9938


In [159]:
#picking 2,205 random days from possible 10,000 days that have required data
rands = random.sample(range(0, len(dates)), 2205)
dates_fnl = []

for r in rands:
    dates_fnl.append(dates[r])
    
len(dates_fnl)

2500

In [154]:
#creating a date (year, month, and day) to number of natural disasters dictionary for only 2,500 randomly choosen days
date_to_num_dis = []
counter = 0

for d in dates_fnl:
    num_dis = 0
    date_split = d.split('-')
    day = int(date_split[0])
    mn = int(date_split[1])
    yr = int(date_split[2])
    
    for i, r in natural_dis.iterrows():
        st_day = int(r['Start Day'])
        st_mn = int(r['Start Month'])
        st_yr = int(r['Start Year'])
        
        end_day = int(r['End Day'])
        end_mn = int(r['End Month'])
        end_yr = int(r['End Year'])
        
        if yr > st_yr and yr < end_yr:
            num_dis += 1
            
        elif yr == st_yr:
            if st_mn == -1:
                num_dis += 1
                
            elif mn > st_mn:
                num_dis += 1
                
            elif mn == st_mn:
                if st_day == -1:
                    num_dis += 1
                
                elif day >= st_day:
                    num_dis += 1
        
        
        elif yr == end_yr:
            if end_mn == -1:
                num_dis += 1
                
            elif mn < end_mn:
                num_dis += 1
                
            elif mn == end_mn:
                if end_day == -1:
                    num_dis += 1
                
                elif day <= end_day:
                    num_dis += 1
        
    date_to_num_dis.append({'date': d, 'num_disasters': num_dis})
    counter += 1
    if counter % 500 == 0:
        print(counter)

date_to_num_dis

KeyboardInterrupt: 

In [180]:
#creating final data frame with all data for each 2,500 days
df = pd.DataFrame({'date': [], 'avg_temp': [], 'co2': [], 'sea_ice_extent': [], 'num_natural_disasters': []})

for d in dates_fnl:
    date_split = d.split('-')
    
    if int(date_split[2]) < 2016:
        
    
        new_row = {'date': d, 'avg_temp': '', 'co2': '', 'sea_ice_extent': '', 'num_natural_disasters': ''}
        new_row['avg_temp'] = date_to_temp[date_split[0] + '-' + date_split[2][2:]]
        new_row['co2'] = date_to_co2[d]
        new_row['sea_ice_extent'] = date_to_ice[d]
        new_row['num_natural_disasters'] = date_to_num_dis[d]
    
        df = df.append(new_row, ignore_index = True)

In [181]:
len(df)

2205

In [182]:
df.head()

Unnamed: 0,date,avg_temp,co2,sea_ice_extent,num_natural_disasters
0,3-9-2014,6.378,398.87,4.285,391.0
1,8-5-1989,14.135,352.54,16.916,172.0
2,7-27-1985,13.973,345.83,16.181,268.0
3,6-21-1992,13.576,358.17,13.2,395.0
4,12-10-1999,4.565,368.06,12.357,554.0


In [184]:
df.to_csv('data/2205_randomly_choosen_days_with_data.csv')