In [15]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as stats

In [3]:
# Load data
df = pd.read_csv("filtered_data.csv")
print(df.shape)
df

(18322712, 7)


Unnamed: 0.1,Unnamed: 0,station_id,year,month,element,day,value
0,0,AGM00060490,1957,1,TMAX,1,178
1,1,AGM00060490,1957,1,TMAX,2,150
2,2,AGM00060490,1957,1,TMAX,3,161
3,3,AGM00060490,1957,1,TMAX,4,172
4,4,AGM00060490,1957,1,TMAX,5,172
...,...,...,...,...,...,...,...
18322707,15215,ZA000067753,1990,11,PRCP,26,0
18322708,15216,ZA000067753,1990,11,PRCP,27,0
18322709,15217,ZA000067753,1990,11,PRCP,28,0
18322710,15218,ZA000067753,1990,11,PRCP,29,0


In [4]:
# ===========================================================
# Compute each year's temperature
# ===========================================================

# For each station and for each day, compute the midpoint temperature by
# averaging the min and max temperatures
mid_temps = df.where(np.logical_or(df["element"] == "TMIN",
                            df["element"] == "TMAX")).groupby(by=["station_id", "year", "month", "day"]).mean().reset_index()

# For each station and for each year, compute the average temperature across that year
avg_yearly_temps_per_station = mid_temps.groupby(by=["station_id", "year"]).mean()["value"].reset_index()

# For each year, compute the average temperature across stations
avg_yearly_temps = mid_temps.groupby(by=["year"]).mean()["value"].reset_index()

# All temperatures are in tenths of degree Celsius, so divide by 10 to get
# actual Celsius temperatures
avg_yearly_temps["value"] /= 10

In [5]:
avg_yearly_temps["year"] = avg_yearly_temps["year"].astype(int)
avg_yearly_temps = avg_yearly_temps.set_index("year").squeeze()
print(avg_yearly_temps.shape)
avg_yearly_temps


(145,)


year
1877     8.820109
1878     7.758888
1879     3.143174
1880     3.977254
1881     4.746990
          ...    
2017    10.677159
2018    10.307024
2019    10.116904
2020    10.444746
2021     2.146452
Name: value, Length: 145, dtype: float64

In [6]:
# ===========================================================
# Compute each year's precipitation
# ===========================================================

# Extract the precipitation data
prcp = df.where(df['element'] == 'PRCP').dropna()

# Delete missing data, and unnecessary columns
prcp = prcp[prcp["value"] != -9999]

# For each station and for each year, compute the average temperature across that year
avg_yearly_prcp_per_station = prcp.groupby(by=["station_id", "year"]).mean()["value"].reset_index()

# For each year, compute the average temperature across stations
avg_yearly_prcp = avg_yearly_prcp_per_station.groupby(by=["year"]).mean()["value"].reset_index()


In [7]:
avg_yearly_prcp["year"] = avg_yearly_prcp["year"].astype(int)
avg_yearly_prcp = avg_yearly_prcp.set_index("year").squeeze()
print(avg_yearly_prcp.shape)
avg_yearly_prcp

(172,)


year
1850    11.000000
1851    22.676712
1852    25.000000
1853    19.131507
1854    17.610959
          ...    
2017    36.918720
2018    40.338285
2019    36.929213
2020    37.480173
2021    30.426926
Name: value, Length: 172, dtype: float64

In [10]:
# Join the temperature and precipitation data (on the index a.k.a. the year column)
temp_prcp = pd.concat((avg_yearly_temps, avg_yearly_prcp), axis=1, join="inner", ignore_index=False)
temp_prcp.columns = ["temp", "prcp"]
temp_prcp

Unnamed: 0_level_0,temp,prcp
year,Unnamed: 1_level_1,Unnamed: 2_level_1
1877,8.820109,15.316494
1878,7.758888,21.908112
1879,3.143174,18.989336
1880,3.977254,14.674011
1881,4.746990,16.090796
...,...,...
2017,10.677159,36.918720
2018,10.307024,40.338285
2019,10.116904,36.929213
2020,10.444746,37.480173


In [16]:
# ===========================================================
# Compute % differences in temperature and precipitation
# year-to-year
# ===========================================================
changes = temp_prcp.pct_change() * 100
changes.drop(labels=changes.index[0], inplace=True) # drop the first row because it's just NaN values
changes

Unnamed: 0_level_0,temp,prcp
year,Unnamed: 1_level_1,Unnamed: 2_level_1
1878,-12.031835,43.036077
1879,-59.489373,-13.322807
1880,26.536234,-22.724992
1881,19.353461,9.655060
1882,7.381843,7.652449
...,...,...
2017,-2.317306,-5.782388
2018,-3.466605,9.262415
2019,-1.844565,-8.451207
2020,3.240532,1.491936


In [19]:
# Run a two-sample t-test to test between whether there's a difference in means of the percent changes for temperature and precipitation
stats.ttest_ind(changes["temp"], changes["prcp"], equal_var=False, alternative="two-sided")

Ttest_indResult(statistic=-0.37302822906556166, pvalue=0.7094249075311084)

Things to analyze

* We can do a two-sample t-test between yearly difference in temperature and yearly difference in precipitation. That way we can see that if difference in temperature increases, whether or not difference in precipitation increases by chance. --> Do this for every climate indicator, not just precipitation
* Question: do we really need to use the differences between values, or can we just use the values themselves?
* Look into ozone depletion and CFCs getting banned in 1970s?