In [1]:
import pandas as pd
import us
import numpy as np
import geopandas as gpd

# Loading the Main Data

In [2]:

#Load Data
earthquake = pd.read_csv('data/Eartquakes-1990-2023.csv')
realtor = pd.read_csv('data/realtor-data.csv')

#Clean up empty spaces
earthquake['state'] = earthquake['state'].str.strip()

#Change USA to the correct state
earthquake['state'] = earthquake['state'].replace('USA', 'Georgia')

# US states 
states = us.states.STATES
List = []
for state in states:
    List.append(state.name)
    List.append(state.abbr)

#Filter for US states 
filtered_earthquake = earthquake[earthquake['state'].isin(List)]
filtered_realtor = realtor[realtor['state'].isin(List)]

#Change all abbreviation to full name (only for earthquake)
    # Dictionary mapping abbreviations to full names
us_states = {state.abbr: state.name for state in states}
filtered_earthquake.loc[:, 'state'] = filtered_earthquake['state'].apply(lambda x: us_states.get(x, x))

#Drop NaN from both dataframe
filtered_earthquake = filtered_earthquake.dropna()
filtered_realtor = filtered_realtor.dropna()

# filtered_earthquake
filtered_realtor

Unnamed: 0,brokered_by,status,price,bed,bath,acre_lot,street,city,state,zip_code,house_size,prev_sold_date
3409,21163.0,for_sale,525000.0,3.0,3.0,0.45,1813270.0,Agawam,Massachusetts,1001.0,2314.0,2014-06-25
3410,67455.0,for_sale,289900.0,3.0,2.0,0.36,1698080.0,Agawam,Massachusetts,1001.0,1276.0,2012-10-12
3416,97400.0,for_sale,384900.0,3.0,2.0,0.46,1244899.0,Agawam,Massachusetts,1001.0,1476.0,1986-11-20
3423,33714.0,for_sale,199999.0,3.0,2.0,1.76,1745924.0,Agawam,Massachusetts,1001.0,1968.0,2008-09-19
3430,22188.0,for_sale,419000.0,4.0,2.0,2.00,1417448.0,Pelham,Massachusetts,1002.0,1607.0,2005-07-25
...,...,...,...,...,...,...,...,...,...,...,...,...
2226377,23009.0,sold,359900.0,4.0,2.0,0.33,353094.0,Richland,Washington,99354.0,3600.0,2022-03-25
2226378,18208.0,sold,350000.0,3.0,2.0,0.10,1062149.0,Richland,Washington,99354.0,1616.0,2022-03-25
2226379,76856.0,sold,440000.0,6.0,3.0,0.50,405677.0,Richland,Washington,99354.0,3200.0,2022-03-24
2226380,53618.0,sold,179900.0,2.0,1.0,0.09,761379.0,Richland,Washington,99354.0,933.0,2022-03-24


# Ajoute les comtés au datasets des trenblements de terre

In [3]:
# Lire ton fichier des villes + comtés
counties = gpd.read_file("data/tl_2021_us_county/tl_2021_us_county.shp")
counties = counties[['GEOID', 'NAME', 'STATEFP', 'COUNTYFP', 'geometry']]
counties = counties.rename(columns={'NAME':'county', 'STATEFP':'state_fips', 'COUNTYFP':'county_fips'})

earthquakes_gdf = gpd.GeoDataFrame(
    filtered_earthquake,
    geometry=gpd.points_from_xy(filtered_earthquake['longitude'], filtered_earthquake['latitude']),
    crs='EPSG:4326'
)

df_with_counties = gpd.sjoin(
    earthquakes_gdf,
    counties[['county_fips','county','state_fips','geometry']],
    how='left',
    predicate='within'
)

filtered_earthquake = df_with_counties

Use `to_crs()` to reproject one of the input geometries to match the CRS of the other.

Left CRS: EPSG:4326
Right CRS: EPSG:4269

  df_with_counties = gpd.sjoin(


In [4]:
# -----------------------------
# 1️⃣ Lire le fichier ZIP → county
# -----------------------------
zip_county = pd.read_csv("data/zcta_county_rel_10.txt", dtype=str)

# Garder seulement les colonnes utiles
zip_county = zip_county[['ZCTA5','STATE','COUNTY','GEOID']]

# Renommer les colonnes pour la cohérence
zip_county = zip_county.rename(columns={
    'ZCTA5':'zip',
    'STATE':'state_fips',
    'COUNTY':'county_fips',        # code FIPS du comté
    'GEOID':'full_county_fips'     # FIPS complet state + county
})

# Nettoyer les ZIP : s'assurer que ce sont des chaînes de 5 chiffres
zip_county['zip'] = zip_county['zip'].apply(lambda x: str(int(float(x))).zfill(5))

# -----------------------------
# 2️⃣ Nettoyer les ZIP dans filtered_realtor
# -----------------------------
filtered_realtor = filtered_realtor.rename(columns={'zip_code':'zip'})  # si ta colonne s'appelle zip_code
filtered_realtor['zip'] = filtered_realtor['zip'].apply(lambda x: str(int(float(x))).zfill(5))

# -----------------------------
# 3️⃣ Supprimer les colonnes précédentes si elles existent
# -----------------------------
for col in ['county_fips','full_county_fips','state_fips']:
    if col in filtered_realtor.columns:
        filtered_realtor = filtered_realtor.drop(columns=[col])

# -----------------------------
# 4️⃣ Garder le comté majoritaire par ZIP (premier du groupe)
# -----------------------------
zip_major = zip_county.groupby('zip').first().reset_index()

# -----------------------------
# 5️⃣ Merge pour assigner county_fips, full_county_fips, state_fips
# -----------------------------
filtered_realtor = filtered_realtor.merge(
    zip_major[['zip','county_fips','full_county_fips','state_fips']],
    on='zip',
    how='left'
)

# -----------------------------
# 6️⃣ Vérifier
# -----------------------------
print(filtered_realtor[['zip','county_fips','full_county_fips','state_fips']].head())

     zip county_fips full_county_fips state_fips
0  01001         013            25013         25
1  01001         013            25013         25
2  01001         013            25013         25
3  01001         013            25013         25
4  01002         011            25011         25


## Ajoute un date correct (JJ-MM-AAAA) au dataset des trembelements de terre

In [5]:
# 1. Get year, month, day
filtered_earthquake[['year', 'month', 'day']] = (
    filtered_earthquake['date']
    .str.split('-', expand=True)
)

# 2. Fix day (remove time)
filtered_earthquake['day'] = (
    filtered_earthquake['day']
    .str.split(' ', expand=True)[0]
)

# 3. Convert to int (safe)
filtered_earthquake[['year', 'month', 'day']] = (
    filtered_earthquake[['year', 'month', 'day']]
    .apply(pd.to_numeric, errors='coerce')
)

# 4. Drop rows with NaN in date parts
filtered_earthquake = filtered_earthquake.dropna(subset=['year', 'month', 'day'])

# 5. Convert to int after NaN removal
filtered_earthquake[['year', 'month', 'day']] = (
    filtered_earthquake[['year', 'month', 'day']].astype(int)
)

# 6. Drop old date column
filtered_earthquake = filtered_earthquake.drop(columns=['date'])

In [6]:
# Convert prev_sold_date to datetime
filtered_realtor["prev_sold_date"] = pd.to_datetime(
    filtered_realtor["prev_sold_date"], errors="coerce"
)

# Extract year / month / day
filtered_realtor["sold_year"] = filtered_realtor["prev_sold_date"].dt.year
filtered_realtor["sold_month"] = filtered_realtor["prev_sold_date"].dt.month
filtered_realtor["sold_day"] = filtered_realtor["prev_sold_date"].dt.day

# Sauvegarder les données

In [7]:
# Save the cleaned versions without overwriting originals
filtered_earthquake.to_csv('data/earthquake_cleaned.csv', index=False)
filtered_realtor.to_csv('data/realtor_cleaned.csv', index=False)

print("✔ Files saved: earthquake_cleaned.csv, realtor_cleaned.csv")

filtered_realtor

✔ Files saved: earthquake_cleaned.csv, realtor_cleaned.csv


Unnamed: 0,brokered_by,status,price,bed,bath,acre_lot,street,city,state,zip,house_size,prev_sold_date,county_fips,full_county_fips,state_fips,sold_year,sold_month,sold_day
0,21163.0,for_sale,525000.0,3.0,3.0,0.45,1813270.0,Agawam,Massachusetts,01001,2314.0,2014-06-25,013,25013,25,2014,6,25
1,67455.0,for_sale,289900.0,3.0,2.0,0.36,1698080.0,Agawam,Massachusetts,01001,1276.0,2012-10-12,013,25013,25,2012,10,12
2,97400.0,for_sale,384900.0,3.0,2.0,0.46,1244899.0,Agawam,Massachusetts,01001,1476.0,1986-11-20,013,25013,25,1986,11,20
3,33714.0,for_sale,199999.0,3.0,2.0,1.76,1745924.0,Agawam,Massachusetts,01001,1968.0,2008-09-19,013,25013,25,2008,9,19
4,22188.0,for_sale,419000.0,4.0,2.0,2.00,1417448.0,Pelham,Massachusetts,01002,1607.0,2005-07-25,011,25011,25,2005,7,25
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1082132,23009.0,sold,359900.0,4.0,2.0,0.33,353094.0,Richland,Washington,99354,3600.0,2022-03-25,005,53005,53,2022,3,25
1082133,18208.0,sold,350000.0,3.0,2.0,0.10,1062149.0,Richland,Washington,99354,1616.0,2022-03-25,005,53005,53,2022,3,25
1082134,76856.0,sold,440000.0,6.0,3.0,0.50,405677.0,Richland,Washington,99354,3200.0,2022-03-24,005,53005,53,2022,3,24
1082135,53618.0,sold,179900.0,2.0,1.0,0.09,761379.0,Richland,Washington,99354,933.0,2022-03-24,005,53005,53,2022,3,24


# Début de l'aggrégation

In [9]:

### COUNTY + STATE + YEAR AGG ###
# --- Earthquake aggregation by state + county + year ---
agg_eq_state_county_year = (
    filtered_earthquake
    .groupby(['state', 'county_fips', 'year'])
    .agg(
        n_earthquakes=('magnitudo', 'count'),
        avg_magnitude=('magnitudo', 'mean'),
        max_magnitude=('magnitudo', 'max'),
        avg_depth=('depth', 'mean'),
    )
    .reset_index()
)

# --- Realtor aggregation by state + county + year ---
agg_re_state_county_year = (
    filtered_realtor
    .groupby(['state', 'county_fips', 'sold_year'])
    .agg(
        n_properties=('price', 'count'),
        avg_price=('price', 'mean'),
        median_price=('price', 'median'),
        avg_bedrooms=('bed', 'mean'),
        avg_bathrooms=('bath', 'mean')
    )
    .reset_index()
    .rename(columns={'sold_year': 'year'})
)

### STATE + YEAR AGG ###
# --- Earthquake aggregation by state + year ---
agg_eq_state_year = (
    filtered_earthquake
    .groupby(['state', 'year'])
    .agg(
        n_earthquakes=('magnitudo', 'count'),
        avg_magnitude=('magnitudo', 'mean'),
        max_magnitude=('magnitudo', 'max'),
        avg_depth=('depth', 'mean')
    )
    .reset_index()
)

# --- Realtor aggregation by state + year ---
agg_re_state_year = (
    filtered_realtor
    .groupby(['state', 'sold_year'])
    .agg(
        n_properties=('price', 'count'),
        avg_price=('price', 'mean'),
        median_price=('price', 'median'),
        avg_bedrooms=('bed', 'mean'),
        avg_bathrooms=('bath', 'mean')
    )
    .reset_index()
    .rename(columns={'sold_year': 'year'})
)

## Aggrégation

In [11]:
agg_county_year = agg_eq_state_county_year.merge(
    agg_re_state_county_year,
    on=['state', 'county_fips', 'year'],
    how='outer'
)

agg_state_year = agg_eq_state_year.merge(
    agg_re_state_year,
    on=['state', 'year'],
    how='outer'
)

## Géneration code FIPS pour les états

In [13]:
# ---------------------------
# FILTER YEARS >= 1990
# ---------------------------

agg_state_year = agg_state_year[agg_state_year["year"] >= 1990].reset_index(drop=True)
agg_county_year = agg_county_year[agg_county_year["year"] >= 1990].reset_index(drop=True)

# ---------------------------
# FIPS CODES (STATE LEVEL)
# ---------------------------

def get_fips(state_name):
    """Return 2-digit FIPS code or None if invalid."""
    st = us.states.lookup(state_name)
    return st.fips if st else None

agg_state_year["fips"] = agg_state_year["state"].apply(get_fips)
agg_county_year["fips"] = agg_county_year["state"].apply(get_fips)

# Convert FIPS to zero-padded string (Altair requires strings)
agg_state_year["fips"] = agg_state_year["fips"].astype(str).str.zfill(2)
agg_county_year["fips"] = agg_county_year["fips"].astype(str).str.zfill(2)

# Vérifier
print(agg_county_year[['state', 'county_fips']].head())

# ---------------------------
# CLEAN MISSING VALUES
# ---------------------------
agg_state_year = agg_state_year.replace(r'^\s*$', np.nan, regex=True)
agg_county_year = agg_county_year.replace(r'^\s*$', np.nan, regex=True)


agg_county_year

     state county_fips
0  Alabama         001
1  Alabama         001
2  Alabama         001
3  Alabama         001
4  Alabama         001


Unnamed: 0,state,county_fips,year,n_earthquakes,avg_magnitude,max_magnitude,avg_depth,n_properties,avg_price,median_price,avg_bedrooms,avg_bathrooms,fips
0,Alabama,001,1999,,,,,1.0,324900.0,324900.0,3.00,3.0,01
1,Alabama,001,2000,,,,,1.0,149900.0,149900.0,4.00,4.0,01
2,Alabama,001,2001,,,,,1.0,260000.0,260000.0,4.00,3.0,01
3,Alabama,001,2002,,,,,2.0,324500.0,324500.0,4.50,3.5,01
4,Alabama,001,2003,,,,,4.0,222500.0,212500.0,3.75,2.0,01
...,...,...,...,...,...,...,...,...,...,...,...,...,...
49959,Wyoming,081,2016,1.0,0.5700,0.57,3.8200,,,,,,56
49960,Wyoming,081,2018,4.0,0.7925,1.45,2.1875,,,,,,56
49961,Wyoming,107,1999,1.0,2.6000,2.60,0.0000,,,,,,56
49962,Wyoming,107,2009,1.0,3.2000,3.20,5.0000,,,,,,56


In [14]:
import pandas as pd
import numpy as np
from addfips import AddFIPS

# ---------------------------
# FILTER YEARS >= 1990
# ---------------------------
agg_state_year = agg_state_year[agg_state_year["year"] >= 1990].reset_index(drop=True)
agg_county_year = agg_county_year[agg_county_year["year"] >= 1990].reset_index(drop=True)

# ---------------------------
# INITIALISER addfips
# ---------------------------
af = AddFIPS()

# ---------------------------
# FIPS CODES (STATE LEVEL)
# ---------------------------
# On peut juste passer le nom de l'état pour obtenir le FIPS
agg_state_year['fips'] = agg_state_year['state'].apply(af.get_state_fips)
agg_state_year['fips'] = pd.to_numeric(agg_state_year['fips'], errors='coerce').astype('Int64')

# ---------------------------
# FIPS CODES (COUNTY LEVEL)
# ---------------------------
# Appliquer get_county_fips ligne par ligne
agg_county_year['county_fips'] = agg_county_year.apply(
    lambda row: af.get_county_fips(row['county'], row['state']),
    axis=1
)
agg_county_year['county_fips'] = pd.to_numeric(agg_county_year['county_fips'], errors='coerce').astype('Int64')

# ---------------------------
# CLEAN MISSING VALUES
# ---------------------------
agg_state_year = agg_state_year.replace(r'^\s*$', np.nan, regex=True)
agg_county_year = agg_county_year.replace(r'^\s*$', np.nan, regex=True)

# Vérification rapide
print(agg_state_year[['state', 'fips']].head())
print(agg_county_year[['state', 'county', 'county_fips']].head())

KeyError: 'county'

# Sauvegarde des deux aggrégations

In [15]:
agg_county_year.to_csv("data/agg_county_year.csv", index=False)
print("✔ Saved county + year aggregation")


agg_state_year.to_csv("data/agg_state_year.csv", index=False)
print("✔ Saved improved aggregation → data/agg_state_year.csv")

✔ Saved county + year aggregation
✔ Saved improved aggregation → data/agg_state_year.csv
