In [28]:
import pandas as pd
import numpy as np

## Reading & inspecting the data

In [35]:
# Count the lines that precede the first line starting with "#" (the column
# header row), so the table reader knows how many preamble lines to skip.
skip = 0
with open("20231299.auto_hr.txt", encoding="utf-8") as fh:
    line = fh.readline()
    # readline() returns "" only at end of file, so this stops either on the
    # first "#" line or once the file is exhausted.
    while line and not line.startswith("#"):
        skip += 1
        line = fh.readline()
    if line:
        # Show the matched header line (it still carries its own newline).
        print(line)
print(skip)

# stno yyyymmddhh     PS01     TX01     RH01     WD01     WD02     WD07     WD08     PP01     DY22     SS01     TS01     TS02     TS03     TS04     TS06     TS07     GR01     WD09     TG01

77


In [36]:
# Parse the fixed-width table, dropping the first `skip` preamble lines so the
# "# stno ..." row becomes the header; column boundaries are inferred by pandas.
df = pd.read_fwf(filepath_or_buffer="20231299.auto_hr.txt", skiprows=skip)
# Preview the first three parsed rows.
df.head(n=3)

Unnamed: 0,# stno,yyyymmddhh,PS01,TX01,RH01,WD01,WD02,WD07,WD08,PP01,...,SS01,TS01,TS02,TS03,TS04,TS06,TS07,GR01,WD09,TG01
0,C0A520,2023120100,1018.1,18.3,93.0,3.5,55.0,8.8,79.0,0.5,...,,,,,,,,,2300.0,
1,C0A520,2023120101,1017.7,18.1,90.0,3.2,61.0,7.7,57.0,0.5,...,,,,,,,,,5300.0,
2,C0A520,2023120102,1017.6,18.2,94.0,4.3,55.0,8.2,71.0,1.0,...,,,,,,,,,3200.0,


In [37]:
# Keep only station id, timestamp and the PP01 column, as an independent copy,
# and strip the "# " prefix that the header row left on the station column.
df_rain = (
    df.loc[:, ["# stno", "yyyymmddhh", "PP01"]]
    .rename(columns={"# stno": "stno"})
    .copy()
)
df_rain.head()

Unnamed: 0,stno,yyyymmddhh,PP01
0,C0A520,2023120100,0.5
1,C0A520,2023120101,0.5
2,C0A520,2023120102,1.0
3,C0A520,2023120103,1.0
4,C0A520,2023120104,3.0


In [10]:
# (rows, columns) of the rainfall subset.
df_rain.shape

(469170, 3)

In [None]:
# Candidate sentinel codes to look for in PP01; tally exact matches for each,
# then the number of values that are already NaN.
irregular = [-999.1, -9.6, -999.6, -9.5, -99.5, -999.5, -9999.5, -9.7, -99.7, -999.7, -9999.7, -9.8]
pp01 = df_rain["PP01"]
for code in irregular:
    n_hits = (pp01 == code).sum()
    print(f"{code}: ", n_hits)
print("na: ", pp01.isna().sum())


-999.1:  4464
-9.6:  0
-999.6:  13715
-9.5:  0
-99.5:  0
-999.5:  189
-9999.5:  0
-9.7:  0
-99.7:  0
-999.7:  0
-9999.7:  0
-9.8:  0
na:  1176


In [23]:
# Column dtypes of the rainfall subset.
df_rain.dtypes

stno           object
yyyymmddhh      int64
PP01          float64
dtype: object

## Data processing

In [38]:
# Convert the integer yyyymmddhh stamps into real datetimes. The source is the
# untouched `df` column, so re-running this cell gives the same result.
stamp_text = df["yyyymmddhh"].astype(str)
df_rain["yyyymmddhh"] = pd.to_datetime(stamp_text, format="%Y%m%d%H")
df_rain["yyyymmddhh"].dtype

dtype('<M8[ns]')

In [46]:
# Sentinel codes in PP01 that are treated as missing values.
missing = [-999.1, -999.5, -999.6]
# Assign the result of replace() rather than calling it with inplace=True on
# df_rain["PP01_clean"]: an in-place method on a column selection is chained
# assignment, which pandas warns about and which leaves df_rain unchanged
# under Copy-on-Write.
df_rain["PP01_clean"] = df_rain["PP01"].replace(missing, np.nan)
# Sanity check: NaNs after cleaning must equal the sentinel hits plus the NaNs
# already present in the raw column (derived from the data, not hard-coded).
expected_na = df_rain["PP01"].isin(missing).sum() + df_rain["PP01"].isna().sum()
df_rain["PP01_clean"].isna().sum() == expected_na

True

### PP01_accumulated

In [51]:
# Flag rows whose raw PP01 is exactly the -999.6 sentinel.
# NOTE(review): presumably the "rainfall accumulated into a later reading"
# code — confirm against the data file's header notes.
df_rain["PP01_accumulated"] = df_rain["PP01"].eq(-999.6)

### PP01_accumulated_window

In [55]:
# A row belongs to the accumulated window when it is itself flagged as
# accumulated, or when the previous row is the -999.6 sentinel and this row's
# PP01 is non-zero (NaN counts as non-zero, as it did with `!= 0`).
#
# Vectorized replacement for the row-by-row .loc loop: the result no longer
# depends on the index being 0..n-1 and avoids hundreds of thousands of scalar
# lookups. The previous row is taken within the same station, so a sentinel on
# one station's last row does not flag the first row of the next station.
prev_pp01 = df_rain.groupby("stno", sort=False)["PP01"].shift(1)
closes_window = prev_pp01.eq(-999.6) & df_rain["PP01"].ne(0)
df_rain["PP01_accumulated_window"] = df_rain["PP01_accumulated"] | closes_window

In [None]:
# Spot-check the row at position 3 across every column.
df_rain.iloc[3]

stno                                    C0A520
yyyymmddhh                 2023-12-01 03:00:00
PP01                                       1.0
PP01_clean                                 1.0
PP01_accumulated                         False
PP01_accumulated_window                  False
Name: 3, dtype: object

In [66]:
# Check the accumulated window: for the first three sentinel rows, print the
# two rows before and after each one next to the derived flag columns.
idx = df_rain.index[df_rain["PP01"].eq(-999.6)]
flag_cols = ["PP01", "PP01_accumulated", "PP01_accumulated_window"]
for center in idx[:3]:
    # .loc slices by label and includes both end points.
    print(df_rain.loc[center - 2 : center + 2, flag_cols])

     PP01  PP01_accumulated  PP01_accumulated_window
11    0.0             False                    False
12    0.0             False                    False
13 -999.6              True                     True
14    0.0             False                    False
15    0.0             False                    False
     PP01  PP01_accumulated  PP01_accumulated_window
33    0.0             False                    False
34    0.0             False                    False
35 -999.6              True                     True
36    0.0             False                    False
37    0.0             False                    False
      PP01  PP01_accumulated  PP01_accumulated_window
126    0.0             False                    False
127    0.0             False                    False
128 -999.6              True                     True
129 -999.6              True                     True
130    1.5             False                     True


In [67]:
# Write the processed frame to CSV, leaving out the row-index column.
df_rain.to_csv("rain_processed_2023.csv", index=False)