In [1]:
from glob import glob
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tqdm

from ghcn import load_daily

In [2]:
# Paths of every station data file (*.dly) in the data directory, in sorted order
files = glob('ghcnd_small/*.dly')
files.sort()

In [3]:
# Element codes kept for the analysis, and the columns of each station frame
# that are dropped because nothing downstream reads them.
WANTED_ELEMENTS = ["TMIN", "TMAX", "PRCP", "SNOW"]
UNUSED_COLUMNS = ["measurement", "quality", "source"]
# Sentinel stored in the "value" column when a measurement is missing.
MISSING_VALUE = -9999


def _load_station_elements(filename):
    """Load one station file and keep only the rows/columns used downstream.

    Parameters
    ----------
    filename : str
        Path to a station data file, passed unchanged to ``load_daily``.

    Returns
    -------
    pandas.DataFrame
        Rows whose "element" is one of ``WANTED_ELEMENTS``, without the
        columns listed in ``UNUSED_COLUMNS``.
    """
    station_df = pd.DataFrame.from_records(load_daily(filename))
    wanted_rows = station_df["element"].isin(WANTED_ELEMENTS)
    return station_df[wanted_rows].drop(columns=UNUSED_COLUMNS)


# Load data about stations (we only need the station ID and latitude)
# that are in the northern hemisphere (a.k.a. latitude > 0)
north_stations = pd.read_fwf("ghcnd-stations.txt", header=None, usecols=[0, 1])
north_stations.columns = ["station_id", "latitude"]
north_stations = north_stations[north_stations["latitude"] > 0].set_index("station_id")

# We only care about stations in the northern hemisphere (latitude > 0), so
# only load the files whose station ID (the file name without its extension)
# appears in the filtered station table. An explicit membership test replaces
# the previous bare `except:`, which also hid unrelated errors and Ctrl-C.
# The progress bar still advances once per file, including skipped ones.
station_frames = [
    _load_station_elements(filename)
    for filename in tqdm.tqdm(files)
    if Path(filename).stem in north_stations.index
]
print("Reading files... DONE")

if not station_frames:
    # pd.concat would raise a far less helpful "No objects to concatenate".
    raise ValueError(
        "No station files matched a northern-hemisphere station; "
        "check the data directory and ghcnd-stations.txt"
    )

# Combine all the dataframes (the per-station row index is kept as-is)
data_all = pd.concat(station_frames)
# Release the per-station frames so the data is not held in memory twice
del station_frames
print("Dataframe shape:", data_all.shape)

# Delete missing data
data_all = data_all[data_all["value"] != MISSING_VALUE]
# numeric_only=True skips the string "element" column; without it,
# pandas >= 2.0 raises a TypeError instead of silently dropping the column.
print(data_all.groupby(by=["station_id"]).mean(numeric_only=True))

100%|██████████| 1000/1000 [02:09<00:00,  7.74it/s]


Reading files... DONE
Dataframe shape: (19710389, 6)
                    year     month        day       value
station_id                                               
AGM00060490  1993.621581  6.514315  15.718941  134.879319
AJ000037639  1972.504635  6.499854  15.683189   35.793834
AM000037686  1950.000470  6.511341  15.733590   40.432262
ARM00087909  1998.385791  6.549853  15.732351   66.152036
ASN00001031  2007.519351  6.528224  15.646321   36.460456
...                  ...       ...        ...         ...
VE000002099  1981.674108  6.614486  15.744466   35.670528
VMM00048808  2001.886210  6.469871  15.746934  166.299606
VMW00041041  1967.000000  5.083333  16.642857   43.095238
WA005270170  1977.236348  6.539596  15.737489    4.776119
ZA000067753  1969.957765  6.532808  15.434487   22.473569

[985 rows x 4 columns]


In [4]:
# Display the combined frame; pandas elides the middle rows of a frame this large.
data_all

Unnamed: 0,station_id,year,month,element,day,value
0,AGM00060490,1957,1,TMAX,1,178
1,AGM00060490,1957,1,TMAX,2,150
2,AGM00060490,1957,1,TMAX,3,161
3,AGM00060490,1957,1,TMAX,4,172
4,AGM00060490,1957,1,TMAX,5,172
...,...,...,...,...,...,...
15215,ZA000067753,1990,11,PRCP,26,0
15216,ZA000067753,1990,11,PRCP,27,0
15217,ZA000067753,1990,11,PRCP,28,0
15218,ZA000067753,1990,11,PRCP,29,0


Things to analyze

* We can do a two-sample t-test between the yearly difference in temperature and the yearly difference in precipitation. That way we can see whether, when the difference in temperature increases, the difference in precipitation also increases or whether that happens only by chance. --> Do this for every climate indicator, not just precipitation
* Question: do we really need to use the differences between values, or can we just use the values themselves?
* Look into ozone depletion and CFCs getting banned in the 1970s?