In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as stats
import tqdm
from statsmodels.stats.descriptivestats import sign_test

: 

In [None]:
# Load data
df = pd.read_csv("filtered_data.csv")
print(df.shape)
df

: 

In [None]:
# ===========================================================
# Compute each station's annual mean temperature
# ===========================================================

# For each station and for each day, compute the midpoint temperature by
# averaging the min and max temperatures
mid_temps = df.where(np.logical_or(df["element"] == "TMIN",
                            df["element"] == "TMAX")).groupby(by=["station_id", "year", "month", "day"]).mean().reset_index()

# For each station and for each year, compute the average temperature across that year
temps = mid_temps.groupby(by=["station_id", "year"]).mean()["value"].reset_index()

# All temperatures are in tenths of degree Celsius, so divide by 10 to get
# actual Celsius temperatures
temps["value"] /= 10

# Convert year to int type
temps["year"] = temps["year"].astype(int)

temps = temps.set_index(["station_id", "year"]).squeeze()
temps.rename("temp", inplace=True)
print(temps.shape)
temps

: 

In [6]:
# ===========================================================
# Compute each year's metrics (e.g. precipitation, snowfall)
# ===========================================================

def compute_yearly_metrics(metrics):
    metrics_data = []
    for metric in tqdm.tqdm(metrics):
        # Extract the data
        data = df.where(df['element'] == metric).dropna()

        # Delete missing data, and unnecessary columns
        data = data[data["value"] != -9999]

        # For each station and for each year, compute the average metric across that year
        result = data.groupby(by=["station_id", "year"]).mean()["value"].reset_index()

        # Convert year to int type
        result["year"] = result["year"].astype(int)

        result = result.set_index(["station_id"]).squeeze()
        result.rename(columns={"value": metric}, inplace=True)
        print(result.shape)
        
        metrics_data.append(result)

    return metrics_data

# Join the temperature and metric data (on the index a.k.a. the year column)
metrics_list = ["PRCP", "SNOW"]
data = [temps] + compute_yearly_metrics(metrics_list)
data = pd.concat(data, axis=1, join="inner", ignore_index=False)
data

  0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# Rearrange the structure of the data to have each station as a row, years as columns, and metrics as depth
# e.g.
#             1941   1942   1943
# station A
# station B
# station C

all_years = data["year"].unique()
all_adj_years = np.lib.stride_tricks.sliding_window_view(all_years, 2)
p_values = np.empty(all_adj_years.shape[0])
i = 0
for (year1, year2) in tqdm.tqdm(all_adj_years):
    # Get station data for both years
    data_year1 = data[data["year"] == year1]
    data_year2 = data[data["year"] == year2]
    print(data_year1, "\n ----")
    print(data_year2)

    # Get specific columns
    temp_year1 = data_year1["temp"]
    temp_year2 = data_year2["temp"]
    prcp_year1 = data_year1["PRCP"]
    prcp_year2 = data_year2["PRCP"]
    snow_year1 = data_year1["SNOW"]
    snow_year2 = data_year2["SNOW"]
    
    # Sign test of temperature across both years
    print(temp_year1.shape)
    print(temp_year2.shape)
    print("temp_year1 =", temp_year1)
    print("temp_year2 =", temp_year2)
    _, p_values[i] = sign_test(temp_year2 - temp_year1)

    i += 1
    break

  0%|          | 0/144 [00:00<?, ?it/s]

       station_id  year       temp        PRCP       SNOW
0     CA001012046  1941  12.177250   58.684066   0.139726
107   CA002400400  1941 -14.805616    5.821918   4.131507
358   CA006012198  1941  -4.366556   15.960265   7.132450
471   CA006046588  1941   2.437972   22.481132   7.363208
710   CA008104500  1941   5.002466   29.956164   6.361644
987   USC00025274  1941  19.538430   11.421918   0.000000
1055  USC00029464  1941  19.628219   13.471233   0.000000
1109  USC00032148  1941  18.267397   29.608219   0.013699
1252  USC00043182  1941  10.064426   18.920548   1.186047
1347  USC00044705  1941   4.919726   18.630372  35.743802
1389  USC00045449  1941   9.762740   67.151125  12.353591
1489  USC00048099  1941  16.866575   32.060345   0.419355
1549  USC00058100  1941   6.538904    9.467033   1.802740
1645  USC00104670  1941  10.690411    5.679452   0.000000
1732  USC00106295  1941  12.180501    9.098630   0.722628
1849  USC00135517  1941  11.480632   24.679452   2.024658
1964  USC00146




In [None]:
print(p_values)

[1.00000000e+000 6.89824619e-310 9.55856789e-316 9.55856789e-316
 9.55920573e-316 1.27734646e-152 0.00000000e+000 3.95252517e-323
 8.48798316e-314 1.48510186e+161 9.75025278e+199 1.02189773e-152
 1.50946622e+161 3.62483420e+228 7.10583942e+159 8.82883670e+199
 2.60988457e+180 3.39827832e-085 6.01347008e-154 6.01347002e-154
 4.56335229e-072 3.29824944e-033 2.87504620e+161 6.88960211e-140
 9.08367203e+223 2.87504620e+161 2.66885567e-260 9.27994677e+242
 2.86530675e+161 4.39348350e+140 4.35688714e-114 3.43744702e+175
 2.32065361e-152 1.17118988e+166 3.50125689e+151 6.65343611e+246
 1.01260843e+267 1.07033502e+200 4.35182066e+276 3.01478165e-110
 5.55603593e+180 2.46695025e+179 1.91078595e+214 1.50953405e+161
 4.24819624e+180 2.91505687e-014 2.08707803e-115 2.79966899e+275
 7.26580490e+223 3.28539008e+238 1.10446677e+161 1.28514666e+248
 1.69264133e-139 2.87504688e+161 3.98450089e+252 3.43531510e+228
 1.28514666e+248 3.98449982e+252 0.00000000e+000 0.00000000e+000
 0.00000000e+000 6.153787