# Scraping temperature anomaly data
This notebook discusses all the steps that are used to scrape the temperature anomaly data used in the application.

In [5]:
import pandas as pd
import numpy as np
import datetime

import xarray as xr
import io
import urllib.request
import gzip

#import geopandas as gpd
import os
import zipfile as zf
#import shapefile
#from shapely.geometry import shape

### Importing data

Importing file with world country borders as coordinates

In [3]:
zp = zf.ZipFile('data/TM_WORLD_BORDERS-0.3.zip')
files_to_read = [y for y in zp.namelist() for ending in ['dbf', 'prj', 'shp', 'shx'] if y.endswith(ending)]
dummy = zp.read(files_to_read[0])
dbf_file, prj_file, shp_file, shx_file = [zp.open(filename) for filename in files_to_read]
r = shapefile.Reader(shp = shp_file, shx = shx_file, dbf = dbf_file, encoding='windows-1252')
attributes, geometry = [], []
field_names = [field[0] for field in r.fields[1:]]
for row in r.shapeRecords():
    geometry.append(shape(row.shape.__geo_interface__))
    attributes.append(dict(zip(field_names, row.record)))
#Creating a GeoDataframe of the World Borders
gdf = gpd.GeoDataFrame(data = attributes, geometry = geometry)

Scraping temperature anomlalies from NASA dataset: https://data.giss.nasa.gov/gistemp/

In [4]:
url = "https://data.giss.nasa.gov/pub/gistemp/gistemp1200_GHCNv4_ERSSTv5.nc.gz"
req = urllib.request.Request(url)
with gzip.open(urllib.request.urlopen(req)) as resp:
    xr_df = xr.open_dataset(io.BytesIO(resp.read()))
dfnasa = xr_df.to_dataframe()
#Transforming into pandas dataframe
dfnasa = dfnasa.reset_index()

### Data Manipulation

Set year from which to obtain data and country

In [5]:
year = 1947
iso3= "USA"

In [6]:
def tempdata(year, iso3):
    #Subsetting the year range from the provided year
    tempyear = dfnasa[dfnasa['time'].dt.year >= year]
    #Creating a GeoDataFrame
    tempyear = gpd.GeoDataFrame(tempyear, geometry=gpd.points_from_xy(tempyear.lon, tempyear.lat))
    #Merging with the border data
    bord = gpd.sjoin(gdf, tempyear, how="inner")
    yearcountry = bord[bord["ISO3"]==iso3]
    #Sorting values by time
    yearcountry = yearcountry.sort_values(by=["time"])
    yearcountry = pd.DataFrame(yearcountry)
    yearcountry = yearcountry.drop(columns= ["FIPS", "UN", "AREA","POP2005", "index_right", "nv"])
    yearcountry["YearMonth"]=yearcountry["time"].dt.strftime("%Y-%m")
    del tempyear
    return yearcountry

Creating DataFrame for selected country and year with country name and ISO3 code added

In [7]:
df = tempdata(year,iso3)
df

MemoryError: Unable to allocate 220. MiB for an array with shape (28900800,) and data type object

Create DataFrame with only Max and Min temperatures

In [12]:
df2 = (df.assign(Data_Value=df['tempanomaly'].abs())
       .groupby(['time'])['tempanomaly'].agg([('Min' , 'min'), ('Max', 'max')])
       .add_prefix('time'))
df2 = df2.reset_index()

In [14]:
df2

Unnamed: 0,time,timeMin,timeMax,gdp
0,1947-01-15,-6.66,4.33,2033.061000
1,1947-02-15,-3.48,3.51,2031.253667
2,1947-03-15,-3.64,4.59,2029.446333
3,1947-04-15,-1.90,1.61,2027.639000
4,1947-05-15,-2.55,2.46,2026.243333
...,...,...,...,...
887,2020-12-15,-0.27,5.58,18989.854000
888,2021-01-15,0.48,7.10,19087.568000
889,2021-02-15,-4.14,1.27,
890,2021-03-15,-1.28,5.49,


Export to pickle

In [15]:
df2.to_pickle("Temperature_anomalies_"+str(year)+"_"+iso3+".pkl")