In [1]:
import os
import glob 
import re

import earthpy as et
import geojson
import pandas as pd
import geopandas as gpd

In [2]:
pwd

'/Users/robynmarowitz/projects/tempo-site/data/textile-source'

In [3]:
os.chdir("../monthly")
# print(glob.glob('*'))

In [4]:
file_list = glob.glob("*.geojson")
file_list

['HAQ_TEMPO_NO2_CONUS_QA75_L3_Monthly_022024_15Z_V3.geojson',
 'HAQ_TEMPO_NO2_CONUS_QA75_L3_Monthly_062024_01Z_V3.geojson',
 'HAQ_TEMPO_NO2_CONUS_QA75_L3_Monthly_082023_11Z_V3.geojson',
 'HAQ_TEMPO_NO2_CONUS_QA75_L3_Monthly_082024_18Z_V3.geojson',
 'HAQ_TEMPO_NO2_CONUS_QA75_L3_Monthly_052024_18Z_V3.geojson',
 'HAQ_TEMPO_NO2_CONUS_QA75_L3_Monthly_072024_13Z_V3.geojson',
 'HAQ_TEMPO_NO2_CONUS_QA75_L3_Monthly_112023_19Z_V3.geojson',
 'HAQ_TEMPO_NO2_CONUS_QA75_L3_Monthly_012024_22Z_V3.geojson',
 'HAQ_TEMPO_NO2_CONUS_QA75_L3_Monthly_042024_16Z_V3.geojson',
 'HAQ_TEMPO_NO2_CONUS_QA75_L3_Monthly_082023_23Z_V3.geojson',
 'HAQ_TEMPO_NO2_CONUS_QA75_L3_Monthly_102023_17Z_V3.geojson',
 'HAQ_TEMPO_NO2_CONUS_QA75_L3_Monthly_072024_21Z_V3.geojson',
 'HAQ_TEMPO_NO2_CONUS_QA75_L3_Monthly_052024_11Z_V3.geojson',
 'HAQ_TEMPO_NO2_CONUS_QA75_L3_Monthly_082024_11Z_V3.geojson',
 'HAQ_TEMPO_NO2_CONUS_QA75_L3_Monthly_082023_18Z_V3.geojson',
 'HAQ_TEMPO_NO2_CONUS_QA75_L3_Monthly_032024_20Z_V3.geojson',
 'HAQ_TE

In [5]:
def extract_date_from_filename(filename):
    # Use regular expression to capture the date between 'Monthly_' and '_V3'
    match = re.search(r'Monthly_(\d{6})', filename)
    if match:
        return match.group(1)
    else:
        return None  # In case the date is not found

In [6]:
def extract_time_from_filename(filename):
    match = re.search(r'(\d{2}Z)', filename)
    if match:
        return match.group(1)
    else:
        return None  # In case the time is not found

In [None]:
gdfs = []
for file in file_list:
    # print(file)
    gdf = gpd.read_file(file)
    # Extract the date from the filename
    date_str = extract_date_from_filename(file)
    time_str = extract_time_from_filename(file)
    gdf['date'] = pd.to_datetime(date_str, format='%m%Y')
    gdf['time'] = time_str
    gdfs.append(gdf)

gdfs_cleaned = [gdf.dropna(axis=1, how='all') for gdf in gdfs]

In [None]:
monthly_gdf = pd.concat(gdfs_cleaned, ignore_index=True)

In [None]:
# I only want fips 8013 and and 8031
fips_to_keep = ["8013", "8031"] 
monthly_gdf["FIPS_new"] = monthly_gdf["FIPS_new"].astype(str)
filtered_gdf = monthly_gdf[monthly_gdf["FIPS_new"].isin(fips_to_keep)]

In [None]:
census_gdf = gpd.read_file('../preprocess/Colorado_Census_Tract_Boundaries.geojson')
census_gdf

In [None]:
df = pd.read_csv('../preprocess/state_and_county_fips_master.csv')
df

In [None]:
co_df = df[df['state']=='CO'] # Filter to only Colorado
# Create new column with County FIPS from tract fips
census_gdf['FIPS_new'] = census_gdf['FIPS'].str[:5].str.lstrip('0').astype(int)
census_gdf

In [None]:
filtered_gdf = filtered_gdf.to_crs(census_gdf.crs)
filtered_gdf

census_gdf['FIPS_new'] = census_gdf['FIPS_new'].astype('int64')
filtered_gdf['FIPS_new'] = filtered_gdf['FIPS_new'].astype('int64')

In [None]:
joined_gdf_1 = gpd.sjoin(filtered_gdf, census_gdf, how='inner', predicate='intersects')  # or use 'within', 'contains', etc.
joined_gdf_1

In [None]:
joined_gdf = census_gdf.merge(filtered_gdf, on='FIPS_new', how='inner')  # Change 'inner' to 'left', 'right', or 'outer' if needed

joined_gdf.head()