In [1]:
import os

import jalali
import pandas as pd

def clean_jalali_date(x):
    """Zero-pad a dash-separated date string to 'YYYY-MM-DD' width.

    Args:
        x: A string with exactly three '-'-separated fields
            (year, month, day), e.g. '1378-1-5'.

    Returns:
        The same date with the year left-padded to 4 characters and the
        month and day left-padded to 2 characters, e.g. '1378-01-05'.

    Raises:
        ValueError: If ``x`` does not split into exactly three fields.
    """
    year, month, day = x.split('-')
    padded_fields = [year.zfill(4), month.zfill(2), day.zfill(2)]
    return "-".join(padded_fields)
    

# Load Data

In [2]:
# The raw export is a semicolon-delimited text file.
data = pd.read_csv('raw_data.csv', sep=';')

# Replace the file's own header with short names, assigned by position.
# NOTE(review): this relies on the file having exactly these ten columns in
# this order - confirm against the raw export.
data.columns = [
    "StationName",
    "StationID",
    "Long",
    "Lat",
    "Elevation",
    "Date",
    "Tmax",
    "Tmin",
    "Tmean",
    "Precip",
]

# Canonical station names: surrounding whitespace removed, then upper-cased.
station_names = data["StationName"].str.strip()
data["StationName"] = station_names.str.upper()

# Extract Geo Information

In [3]:
# Build one metadata row per (StationName, StationID) pair.
# drop_duplicates keeps the first occurrence, so if a station's coordinates
# differ between rows only the first set seen is retained.
info = (
    data[["StationName", "StationID", "Long", "Lat", "Elevation"]]
    .drop_duplicates(["StationName", "StationID"])
    .sort_values(by=["StationName", "StationID"])
    .reset_index(drop=True)
)

# to_csv does not create missing directories, so make sure the target exists
# before writing (keeps the notebook runnable on a fresh checkout).
os.makedirs("StationsData", exist_ok=True)
info.to_csv("StationsData/GeoInfo.csv", index=False)

# Data Cleaning

In [4]:
# Keep only the per-day observation columns. The explicit .copy() detaches the
# selection from the frame it was taken from, so the column assignments below
# do not trigger pandas' SettingWithCopyWarning.
data = data[["StationName", "StationID", "Date", "Tmax", "Tmin", "Tmean", "Precip"]].copy()

# Normalise dates to ISO 'YYYY-MM-DD' strings.
# NOTE(review): the input date format is inferred by pandas - confirm the raw
# values are unambiguous (e.g. not day-first).
data["Date"] = pd.to_datetime(data["Date"]).dt.strftime('%Y-%m-%d')

# The same calendar dates repeat for every station, so convert each distinct
# Gregorian date to a zero-padded Jalali string once and map the result back,
# instead of running the conversion row by row.
jalali_by_date = {
    gregorian: clean_jalali_date(jalali.Gregorian(gregorian).persian_string())
    for gregorian in data["Date"].unique()
}
data["DateJalali"] = data["Date"].map(jalali_by_date)

# Order records by station name and then chronologically, with a fresh
# 0..n-1 index.
data = data.sort_values(by=["StationName", "Date"]).reset_index(drop=True)

data

Unnamed: 0,StationName,StationID,Date,Tmax,Tmin,Tmean,Precip,DateJalali
0,ABADAN,40831.0,2000-01-01,21.4,7.4,13.5,0.0,1378-10-11
1,ABADAN,40831.0,2000-01-02,22.0,7.2,13.3,0.0,1378-10-12
2,ABADAN,40831.0,2000-01-03,23.4,7.0,14.7,0.0,1378-10-13
3,ABADAN,40831.0,2000-01-04,22.0,10.6,16.8,0.0,1378-10-14
4,ABADAN,40831.0,2000-01-05,18.8,11.2,16.8,0.0,1378-10-15
...,...,...,...,...,...,...,...,...
4085815,ZOHAN,19161.0,2005-12-27,7.2,-3.2,2.0,,1384-10-06
4085816,ZOHAN,19161.0,2005-12-28,5.2,-4.0,0.6,,1384-10-07
4085817,ZOHAN,19161.0,2005-12-29,-1.0,-6.2,-3.6,,1384-10-08
4085818,ZOHAN,19161.0,2005-12-30,-2.0,-7.0,-4.5,,1384-10-09


In [17]:
# List every station whose name occurs on more than one row of `info`.
# keep=False flags all members of a duplicate group, not only the repeats.
shares_name = info.duplicated(subset=['StationName'], keep=False)
info.loc[shares_name]

Unnamed: 0,StationName,StationID,Long,Lat,Elevation
8,ABBAR,18297.0,48.966700,36.933100,728.0
9,ABBAR,99298.0,48.940800,36.933900,624.7
88,BAHABAD,19428.0,56.016700,31.866700,1403.0
89,BAHABAD,99522.0,56.046111,31.841111,1436.0
94,BAJESTAN,18966.0,58.183100,34.516700,1260.0
...,...,...,...,...,...
816,TAKESTAN,99367.0,49.678056,36.052500,1283.4
834,TEST,99999.0,51.309200,35.693100,1191.0
835,TEST,666666.0,47.778900,39.603100,72.6
847,VARZANEH,19336.0,52.616700,32.400000,1748.0


In [22]:
# Show every record filed under the station name "TEST".
is_test_station = data["StationName"].eq("TEST")
data.loc[is_test_station]

Unnamed: 0,StationName,StationID,Date,Tmax,Tmin,Tmean,Precip,DateJalali
3899626,TEST,99999.0,2021-11-28,,7.0,8.2,0.0,1400-09-07
3899627,TEST,99999.0,2021-12-01,19.8,10.7,13.3,0.0,1400-09-10
3899628,TEST,99999.0,2021-12-02,21.8,9.9,11.5,0.0,1400-09-11
3899629,TEST,99999.0,2021-12-03,14.9,4.0,7.5,0.0,1400-09-12
3899630,TEST,99999.0,2021-12-04,16.4,3.5,8.2,0.0,1400-09-13
...,...,...,...,...,...,...,...,...
3899799,TEST,666666.0,2022-06-21,34.5,24.1,30.1,0.0,1401-03-31
3899800,TEST,666666.0,2022-06-22,37.3,25.4,32.3,0.0,1401-04-01
3899801,TEST,666666.0,2022-06-23,35.6,25.7,31.1,0.0,1401-04-02
3899802,TEST,666666.0,2022-06-24,35.6,25.7,31.0,0.0,1401-04-03


In [None]:
# Write one CSV per station name into stations/.
#
# NOTE: StationName is not unique per StationID in this dataset (e.g. "TEST"
# appears with IDs 99999 and 666666), so stations sharing a name end up in the
# same file; their rows remain distinguishable through the StationID column.

# to_csv does not create missing directories, so make sure the target exists.
os.makedirs("stations", exist_ok=True)

# groupby splits the frame in a single pass instead of re-filtering all rows
# once per station. sort=False yields groups in first-appearance order, the
# same order unique() would give. Rows with a missing StationName are skipped
# rather than producing an empty "nan.csv".
for st, df in data.groupby("StationName", sort=False):
    # '*' is swapped for '_' - presumably because it is not a valid file-name
    # character on every file system.
    name = f"stations/{st}.csv".replace("*", "_")
    print(name)
    df.to_csv(name, index=False)