In [2]:
import pandas as pd
import csv
import os
import polars as pl # For reading the CSV efficiently

# -----------------------------------------------------------------
# 1) Read the county-to-climdivs mapping file into a lookup dict
#    The file has 3 columns (POSTAL_FIPS_ID, NCDC_FIPS_ID, CLIMDIV_ID)
#    We'll map NCDC_FIPS_ID -> (POSTAL_FIPS_ID, CLIMDIV_ID)
# -----------------------------------------------------------------



# Directory holding the raw NOAA climdiv files, relative to this notebook.
path_raw = "../../_noaa_climdiv_local/"

# Lookup: NCDC_FIPS_ID -> (POSTAL_FIPS_ID, CLIMDIV_ID), all kept as strings so
# leading zeros survive.
mapping = {}
with open(os.path.join(path_raw, "county-to-climdivs.txt"), "r") as f:
    for line_no, line in enumerate(f):
        parts = line.split()
        if len(parts) != 3:
            continue  # blank or malformed row
        # Only drop the first line when it really is a header. A first line
        # made of three numeric fields is data and must not be thrown away.
        if line_no == 0 and not all(p.isdigit() for p in parts):
            continue
        postal_fips, ncdc_fips, climdiv_id = parts
        mapping[ncdc_fips] = (postal_fips, climdiv_id)

# -----------------------------------------------------------------
#2) Define a helper function to parse each line in tmaxcy/pcpncy
# -----------------------------------------------------------------
def parse_clim_line(line):
    """
    Parse one whitespace-separated record line from tmaxcy or pcpncy.

    A usable line is an 11-digit ID (e.g. "01001271895") followed by at least
    12 monthly values. The ID is sliced as: first 5 digits = NCDC FIPS,
    next 2 digits = element code (27 for tmax, 01 for pcpn), last 4 = year.

    Parameters
    ----------
    line : str
        One raw line of the file.

    Returns
    -------
    dict or None
        Dict with keys raw_code, state, county, division, year and Jan..Dec
        (in that order). Every value is a string, monthly values included.
        Returns None when the line has fewer than 13 fields, when the ID is
        not exactly 11 digits, or when the NCDC FIPS is not in the
        module-level ``mapping``.
    """
    month_names = ("Jan", "Feb", "Mar", "Apr", "May", "Jun",
                   "Jul", "Aug", "Sep", "Oct", "Nov", "Dec")

    parts = line.split()
    if len(parts) < 13:
        return None  # not enough data

    code = parts[0]
    # A short or non-numeric ID would make the positional slices below pick
    # up the wrong characters (e.g. a 3-character "year"), so reject it.
    if len(code) != 11 or not code.isdigit():
        return None

    ncdc_fips = code[:5]
    if ncdc_fips not in mapping:
        return None

    postal_fips, climdiv_id = mapping[ncdc_fips]

    record = {
        "raw_code": code,
        # postal_fips e.g. "04001" => state "04", county "001"
        "state": postal_fips[:2],
        "county": postal_fips[2:],
        # climdiv_id e.g. "0202" => last two digits "02" are the division
        "division": climdiv_id[-2:],
        "year": code[7:],
    }
    # Only the first 12 values after the ID are months; extras are ignored.
    record.update(zip(month_names, parts[1:13]))
    return record

# -----------------------------------------------------------------
#3) Read & parse tmaxcy (temperature) lines
# -----------------------------------------------------------------
def load_clim_records(filename):
    """
    Read one climdiv county file from ``path_raw`` and parse it line by line.

    Parameters
    ----------
    filename : str
        File name inside ``path_raw``.

    Returns
    -------
    list of dict
        One dict per line that ``parse_clim_line`` could parse; lines for
        which it returns None are dropped.
    """
    with open(os.path.join(path_raw, filename), "r") as f:
        return [record for record in map(parse_clim_line, f) if record]


tmax_records = load_clim_records("climdiv-tmaxcy-v1.0.0-20250506")

df_tmax = pd.DataFrame(tmax_records)
print("TMAX DataFrame:\n", df_tmax.head())

# -----------------------------------------------------------------
# 4) Lastly, read & parse pcpncy (precipitation) lines
# -----------------------------------------------------------------
pcpn_records = load_clim_records("climdiv-pcpncy-v1.0.0-20250506")

df_pcpn = pd.DataFrame(pcpn_records)
print("PCPN DataFrame:\n", df_pcpn.head())

TMAX DataFrame:
       raw_code state county division  year    Jan    Feb    Mar    Apr    May  \
0  01001271895    01    001       03  1895  53.70  48.70  67.60  76.40  81.90   
1  01001271896    01    001       03  1896  54.20  60.80  65.30  81.60  88.50   
2  01001271897    01    001       03  1897  54.20  63.10  71.40  75.10  83.20   
3  01001271898    01    001       03  1898  60.60  59.10  71.00  72.00  89.50   
4  01001271899    01    001       03  1899  55.60  53.40  68.80  73.40  89.30   

     Jun    Jul    Aug    Sep    Oct    Nov    Dec  
0  89.20  91.10  90.40  90.90  76.00  66.60  58.00  
1  88.20  92.00  94.50  90.80  77.20  69.90  58.70  
2  95.60  93.30  89.90  88.90  81.30  68.10  58.80  
3  93.90  91.50  88.80  86.70  73.60  61.70  55.70  
4  93.70  92.20  92.60  87.50  78.40  68.10  56.60  
PCPN DataFrame:
       raw_code state county division  year   Jan   Feb    Mar   Apr   May  \
0  01001011895    01    001       03  1895  7.03  2.96   8.36  3.53  3.96   
1  0100

In [3]:
# Convert the temperature columns of df_tmax from Fahrenheit to Celsius.
months = ["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]

# Per the nClimDiv county readme, months with no data yet are written as
# -99.90 in temperature files. Valid readings never go that low, so treat the
# value as "missing" rather than converting it into a bogus -73 degC.
TMAX_MISSING_F = -99.90

# Rebuild the frame from the parsed records so this cell is idempotent:
# re-running it must not apply the F -> C formula to values that are already
# in Celsius.
df_tmax = pd.DataFrame(tmax_records)
tmax_fahrenheit = df_tmax[months].astype(float)
tmax_fahrenheit = tmax_fahrenheit.mask(tmax_fahrenheit == TMAX_MISSING_F)
df_tmax[months] = (tmax_fahrenheit - 32) * 5 / 9

df_tmax.head()

Unnamed: 0,raw_code,state,county,division,year,Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec
0,1001271895,1,1,3,1895,12.055556,9.277778,19.777778,24.666667,27.722222,31.777778,32.833333,32.444444,32.722222,24.444444,19.222222,14.444444
1,1001271896,1,1,3,1896,12.333333,16.0,18.5,27.555556,31.388889,31.222222,33.333333,34.722222,32.666667,25.111111,21.055556,14.833333
2,1001271897,1,1,3,1897,12.333333,17.277778,21.888889,23.944444,28.444444,35.333333,34.055556,32.166667,31.611111,27.388889,20.055556,14.888889
3,1001271898,1,1,3,1898,15.888889,15.055556,21.666667,22.222222,31.944444,34.388889,33.055556,31.555556,30.388889,23.111111,16.5,13.166667
4,1001271899,1,1,3,1899,13.111111,11.888889,20.444444,23.0,31.833333,34.277778,33.444444,33.666667,30.833333,25.777778,20.055556,13.666667


In [4]:
# Keep only the rows for postal FIPS state '17', county '003', with a year in
# the inclusive range 2013-2022. state/county are strings (leading zeros
# matter); year is stored as a string and is cast before the range test.
df_tmax_filtered = df_tmax.loc[
    df_tmax["state"].eq("17")
    & df_tmax["county"].eq("003")
    & df_tmax["year"].astype(int).between(2013, 2022)
]
df_tmax_filtered

Unnamed: 0,raw_code,state,county,division,year,Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec
73609,11003272013,17,3,8,2013,7.444444,8.333333,9.944444,19.333333,25.166667,29.944444,29.555556,29.666667,28.444444,20.888889,12.222222,6.611111
73610,11003272014,17,3,8,2014,3.777778,4.944444,11.833333,21.0,25.888889,29.5,29.388889,31.222222,27.0,21.777778,9.888889,7.611111
73611,11003272015,17,3,8,2015,5.833333,3.222222,12.388889,21.777778,25.5,29.944444,31.277778,30.055556,28.555556,22.055556,16.166667,13.444444
73612,11003272016,17,3,8,2016,5.722222,9.888889,17.388889,21.944444,24.444444,31.722222,31.777778,30.833333,29.444444,24.555556,17.166667,7.722222
73613,11003272017,17,3,8,2017,9.055556,14.388889,15.222222,22.611111,25.388889,30.0,32.0,29.333333,28.888889,22.611111,14.777778,7.833333
73614,11003272018,17,3,8,2018,3.444444,10.555556,13.333333,17.055556,29.444444,32.055556,32.111111,30.444444,28.777778,21.666667,10.333333,9.666667
73615,11003272019,17,3,8,2019,6.222222,10.222222,12.333333,20.777778,26.0,29.111111,31.444444,31.333333,31.722222,21.166667,10.888889,11.055556
73616,11003272020,17,3,8,2020,8.0,8.944444,16.5,19.166667,23.666667,30.388889,32.777778,29.444444,26.555556,20.611111,16.888889,9.0
73617,11003272021,17,3,8,2021,6.611111,2.888889,17.444444,20.166667,24.444444,30.5,31.055556,31.111111,28.555556,23.444444,13.833333,14.833333
73618,11003272022,17,3,8,2022,5.666667,8.166667,15.833333,18.888889,26.333333,31.722222,33.055556,31.111111,28.777778,22.388889,14.166667,7.777778
