In [1]:
import pandas as pd
import us

In [2]:
#Load Data
earthquake = pd.read_csv('data/Eartquakes-1990-2023.csv')
realtor = pd.read_csv('data/realtor-data.csv')

#Clean up empty spaces
earthquake['state'] = earthquake['state'].str.strip()

#Change USA to the correct state
earthquake['state'] = earthquake['state'].replace('USA', 'Georgia')

# US states 
states = us.states.STATES
List = []
for state in states:
    List.append(state.name)
    List.append(state.abbr)

#Filter for US states 
filtered_earthquake = earthquake[earthquake['state'].isin(List)]
filtered_realtor = realtor[realtor['state'].isin(List)]

#Change all abbreviation to full name (only for earthquake)
    # Dictionary mapping abbreviations to full names
us_states = {state.abbr: state.name for state in states}
filtered_earthquake.loc[:, 'state'] = filtered_earthquake['state'].apply(lambda x: us_states.get(x, x))

#Drop NaN from both dataframe
filtered_earthquake = filtered_earthquake.dropna()
filtered_realtor = filtered_realtor.dropna()

filtered_earthquake
# filtered_realtor

Unnamed: 0,time,place,status,tsunami,significance,data_type,magnitudo,state,longitude,latitude,depth,date
0,631153353990,"12 km NNW of Meadow Lakes, Alaska",reviewed,0,96,earthquake,2.50,Alaska,-149.669200,61.730200,30.100,1990-01-01 00:22:33.990000+00:00
1,631153491210,"14 km S of Volcano, Hawaii",reviewed,0,31,earthquake,1.41,Hawaii,-155.212333,19.317667,6.585,1990-01-01 00:24:51.210000+00:00
2,631154083450,"7 km W of Cobb, California",reviewed,0,19,earthquake,1.11,California,-122.806167,38.821000,3.220,1990-01-01 00:34:43.450000+00:00
3,631155512130,"11 km E of Mammoth Lakes, California",reviewed,0,15,earthquake,0.98,California,-118.846333,37.664333,-0.584,1990-01-01 00:58:32.130000+00:00
4,631155824490,"16km N of Fillmore, CA",reviewed,0,134,earthquake,2.95,California,-118.934000,34.546000,16.122,1990-01-01 01:03:44.490000+00:00
...,...,...,...,...,...,...,...,...,...,...,...,...
3445744,1690626699102,"87 km NNW of Karluk, Alaska",automatic,0,15,earthquake,1.00,Alaska,-155.204500,58.241300,0.000,2023-07-29 10:31:39.102000+00:00
3445745,1690626815980,"0 km SW of Universal City, CA",automatic,0,16,earthquake,1.03,California,-118.356833,34.135500,15.710,2023-07-29 10:33:35.980000+00:00
3445747,1690626975715,"Kodiak Island region, Alaska",automatic,0,44,earthquake,1.70,Alaska,-153.729900,57.790100,24.400,2023-07-29 10:36:15.715000+00:00
3445749,1690628146040,"7 km W of Cobb, CA",automatic,0,16,earthquake,1.03,California,-122.800499,38.827499,1.720,2023-07-29 10:55:46.040000+00:00


In [3]:
import geopandas as gpd
import pandas as pd

# Chargement des comtés US (shapefile local)
counties = gpd.read_file("data/cb_2018_us_county_500k.geojson")

In [4]:
# Convertir en GeoDataFrame
gdf_eq = gpd.GeoDataFrame(
    filtered_earthquake,
    geometry=gpd.points_from_xy(filtered_earthquake.longitude, filtered_earthquake.latitude),
    crs=counties.crs
)

# Jointure spatiale pour récupérer le county
gdf_eq_with_county = gpd.sjoin(gdf_eq, counties, how="left", predicate="within")

# Ajouter la colonne county dans le DataFrame original
filtered_earthquake["county"] = gdf_eq_with_county["NAME"]

In [None]:
# 1. Convert timestamp (ms → seconds → datetime)

filtered_earthquake["datetime"]

# 2. Extract fields
filtered_earthquake["year"]  = filtered_earthquake["datetime"].dt.year
filtered_earthquake["month"] = filtered_earthquake["datetime"].dt.month
filtered_earthquake["day"]   = filtered_earthquake["datetime"].dt.day

# 3. Drop the old date column
if "date" in filtered_earthquake.columns:
    filtered_earthquake = filtered_earthquake.drop(columns=["date"])
    
filtered_earthquake

Unnamed: 0,time,place,status,tsunami,significance,data_type,magnitudo,state,longitude,latitude,depth,datetime,year,month,day
0,631153353990,"12 km NNW of Meadow Lakes, Alaska",reviewed,0,96,earthquake,2.50,Alaska,-149.669200,61.730200,30.100,1970-01-01 00:00:00.631153353,1970,1,1
1,631153491210,"14 km S of Volcano, Hawaii",reviewed,0,31,earthquake,1.41,Hawaii,-155.212333,19.317667,6.585,1970-01-01 00:00:00.631153491,1970,1,1
2,631154083450,"7 km W of Cobb, California",reviewed,0,19,earthquake,1.11,California,-122.806167,38.821000,3.220,1970-01-01 00:00:00.631154083,1970,1,1
3,631155512130,"11 km E of Mammoth Lakes, California",reviewed,0,15,earthquake,0.98,California,-118.846333,37.664333,-0.584,1970-01-01 00:00:00.631155512,1970,1,1
4,631155824490,"16km N of Fillmore, CA",reviewed,0,134,earthquake,2.95,California,-118.934000,34.546000,16.122,1970-01-01 00:00:00.631155824,1970,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3445744,1690626699102,"87 km NNW of Karluk, Alaska",automatic,0,15,earthquake,1.00,Alaska,-155.204500,58.241300,0.000,1970-01-01 00:00:01.690626699,1970,1,1
3445745,1690626815980,"0 km SW of Universal City, CA",automatic,0,16,earthquake,1.03,California,-118.356833,34.135500,15.710,1970-01-01 00:00:01.690626815,1970,1,1
3445747,1690626975715,"Kodiak Island region, Alaska",automatic,0,44,earthquake,1.70,Alaska,-153.729900,57.790100,24.400,1970-01-01 00:00:01.690626975,1970,1,1
3445749,1690628146040,"7 km W of Cobb, CA",automatic,0,16,earthquake,1.03,California,-122.800499,38.827499,1.720,1970-01-01 00:00:01.690628146,1970,1,1


In [23]:
import pandas as pd

# Charger ton CSV avec les villes et comtés
city_county = pd.read_csv("data/city_county.csv")

# Vérifie les colonnes pour être sûr
# print(city_county.head())

# Faire la jointure sur la colonne city
# Assure-toi que les noms de colonnes correspondent (ici "CITY" dans city_county et "city" dans filtered_realtor)
filtered_realtor = filtered_realtor.merge(
    city_county[['CITY','COUNTY']],  # colonnes à joindre
    left_on='city',                  # colonne dans filtered_realtor
    right_on='CITY',                 # colonne dans city_county
    how='left'
)

# Renommer la colonne COUNTy si nécessaire
filtered_realtor = filtered_realtor.rename(columns={'COUNTY':'county'})

# Supprimer la colonne CITY doublon créée par le merge
filtered_realtor = filtered_realtor.drop(columns=['CITY'])

In [24]:
# Convert prev_sold_date to datetime
filtered_realtor["prev_sold_date"] = pd.to_datetime(
    filtered_realtor["prev_sold_date"], errors="coerce"
)

# Extract year / month / day
filtered_realtor["sold_year"] = filtered_realtor["prev_sold_date"].dt.year
filtered_realtor["sold_month"] = filtered_realtor["prev_sold_date"].dt.month
filtered_realtor["sold_day"] = filtered_realtor["prev_sold_date"].dt.day

In [25]:
# Save the cleaned versions without overwriting originals
filtered_earthquake.to_csv('data/earthquake_cleaned.csv', index=False)
filtered_realtor.to_csv('data/realtor_cleaned.csv', index=False)

print("✔ Files saved: earthquake_cleaned.csv, realtor_cleaned.csv")

✔ Files saved: earthquake_cleaned.csv, realtor_cleaned.csv


In [26]:
print(filtered_earthquake, filtered_realtor)

                  time                                 place     status  \
0         631153353990     12 km NNW of Meadow Lakes, Alaska   reviewed   
1         631153491210            14 km S of Volcano, Hawaii   reviewed   
2         631154083450            7 km W of Cobb, California   reviewed   
3         631155512130  11 km E of Mammoth Lakes, California   reviewed   
4         631155824490                16km N of Fillmore, CA   reviewed   
...                ...                                   ...        ...   
3445744  1690626699102           87 km NNW of Karluk, Alaska  automatic   
3445745  1690626815980         0 km SW of Universal City, CA  automatic   
3445747  1690626975715          Kodiak Island region, Alaska  automatic   
3445749  1690628146040                    7 km W of Cobb, CA  automatic   
3445750  1690628937884             35 km W of Karluk, Alaska  automatic   

         tsunami  significance   data_type  magnitudo       state   longitude  \
0              0  

In [27]:
# --- Earthquake aggregation by county + year ---
agg_eq_county_year = (
    filtered_earthquake
    .groupby(['state', 'county', 'year'])
    .agg(
        n_earthquakes=('magnitudo', 'count'),
        avg_magnitude=('magnitudo', 'mean'),
        max_magnitude=('magnitudo', 'max'),
        avg_depth=('depth', 'mean')
    )
    .reset_index()
)

# --- Realtor aggregation by county + year ---
agg_re_county_year = (
    filtered_realtor
    .groupby(['state', 'county', 'sold_year'])
    .agg(
        n_properties=('price', 'count'),
        avg_price=('price', 'mean'),
        median_price=('price', 'median'),
        avg_bedrooms=('bed', 'mean'),
        avg_bathrooms=('bath', 'mean')
    )
    .reset_index()
    .rename(columns={'sold_year': 'year'})
)

# --- Combine both ---
agg_county_year = agg_eq_county_year.merge(
    agg_re_county_year,
    on=['county', 'year'],
    how='outer'
)

agg_county_year.to_csv("data/agg_county_year.csv", index=False)
print("✔ Saved county + year aggregation")
agg_county_year.head()

KeyError: 'county'

In [20]:
import pandas as pd

# ---------------------------
# 1. CLEAN + PREPARE DATA
# ---------------------------

eq = filtered_earthquake.copy()
re = filtered_realtor.copy()

# Extract year from earthquake timestamps
eq["date"] = pd.to_datetime(eq["date"], errors="coerce")
eq["year"] = eq["date"].dt.year

# Realtor dataset may not have a meaningful year → optional
# re["year"] = pd.to_datetime(re["prev_sold_date"], errors="coerce").dt.year


# ---------------------------
# 2. EARTHQUAKE AGGREGATION (STATE)
# ---------------------------

agg_eq_state = (
    eq.groupby("state")
      .agg(
          n_earthquakes=("magnitudo", "count"),
          avg_magnitude=("magnitudo", "mean"),
          max_magnitude=("magnitudo", "max"),
          avg_depth=("depth", "mean"),
          first_quake=("year", "min"),
          last_quake=("year", "max"),
      )
      .reset_index()
)

# Add an earthquake intensity score (useful for correlation later)
agg_eq_state["intensity_score"] = (
    agg_eq_state["avg_magnitude"] * 0.6 +
    agg_eq_state["max_magnitude"] * 0.4
)


# ---------------------------
# 3. REALTOR AGGREGATION (STATE)
# ---------------------------

agg_re_state = (
    re.groupby("state")
      .agg(
          n_properties=("price", "count"),
          avg_price=("price", "mean"),
          median_price=("price", "median"),
          avg_bedrooms=("bed", "mean"),
          avg_bathrooms=("bath", "mean"),
      )
      .reset_index()
)

# Additional useful real-estate analytics
agg_re_state["price_per_bedroom"] = agg_re_state["avg_price"] / agg_re_state["avg_bedrooms"]
agg_re_state["price_per_bathroom"] = agg_re_state["avg_price"] / agg_re_state["avg_bathrooms"]


# ---------------------------
# 4. MERGE BOTH AGGREGATIONS
# ---------------------------

agg_combined_state = agg_eq_state.merge(
    agg_re_state, on="state", how="outer"
)

# Earthquake/property density ratio
agg_combined_state["eq_per_100_properties"] = (
    agg_combined_state["n_earthquakes"] /
    (agg_combined_state["n_properties"] / 100).replace({0: None})
)

# Sort by intensity or by price
agg_combined_state = agg_combined_state.sort_values(
    by="intensity_score", ascending=False
)

print("✔ Improved Aggregation by state completed:")
print(agg_combined_state.head())


# ---------------------------
# 5. SAVE
# ---------------------------

agg_combined_state.to_csv("data/state_aggregation.csv", index=False)
print("✔ Saved improved aggregation → data/state_aggregation.csv")

KeyError: 'date'

In [None]:
# ---------------------------
# 1. CLEAN + PREPARE DATA
# ---------------------------

eq = filtered_earthquake.copy()
re = filtered_realtor.copy()

# Convert timestamps → datetime
eq["datetime"] = pd.to_datetime(eq["time"], unit="ms", errors="coerce")
eq["year"] = eq["datetime"].dt.year

# Realtor sold date → datetime
re["sold_date"] = pd.to_datetime(re["prev_sold_date"], errors="coerce")
re["sold_year"] = re["sold_date"].dt.year


# ======================================================
# ==========     EARTHQUAKE AGGREGATIONS     ===========
# ======================================================

# ---------- EQ by county + year ----------
agg_eq_county_year = (
    eq.groupby(["county", "year"])
      .agg(
          n_earthquakes=("magnitudo", "count"),
          avg_magnitude=("magnitudo", "mean"),
          max_magnitude=("magnitudo", "max"),
          avg_depth=("depth", "mean"),
      )
      .reset_index()
)

# ---------- EQ by state + year ----------
agg_eq_state_year = (
    eq.groupby(["state", "year"])
      .agg(
          n_earthquakes=("magnitudo", "count"),
          avg_magnitude=("magnitudo", "mean"),
          max_magnitude=("magnitudo", "max"),
          avg_depth=("depth", "mean"),
      )
      .reset_index()
)

# Add intensity score useful for ranking
for df in [agg_eq_county_year, agg_eq_state_year]:
    df["intensity_score"] = (
        df["avg_magnitude"] * 0.6 +
        df["max_magnitude"] * 0.4
    )


# ======================================================
# ==========      REALTOR AGGREGATIONS      ============
# ======================================================

# ---------- RE by county + year ----------
agg_re_county_year = (
    re.groupby(["county", "sold_year"])
      .agg(
          n_properties=("price", "count"),
          avg_price=("price", "mean"),
          median_price=("price", "median"),
          avg_bedrooms=("bed", "mean"),
          avg_bathrooms=("bath", "mean"),
      )
      .reset_index()
      .rename(columns={"sold_year": "year"})
)

# ---------- RE by state + year ----------
agg_re_state_year = (
    re.groupby(["state", "sold_year"])
      .agg(
          n_properties=("price", "count"),
          avg_price=("price", "mean"),
          median_price=("price", "median"),
          avg_bedrooms=("bed", "mean"),
          avg_bathrooms=("bath", "mean"),
      )
      .reset_index()
      .rename(columns={"sold_year": "year"})
)

for df in [agg_re_county_year, agg_re_state_year]:
    df["price_per_bedroom"] = df["avg_price"] / df["avg_bedrooms"]
    df["price_per_bathroom"] = df["avg_price"] / df["avg_bathrooms"]


# ======================================================
# ==========         MERGE AGGREGATIONS     ============
# ======================================================

# ---------- COUNTY + YEAR ----------
agg_county_year = agg_eq_county_year.merge(
    agg_re_county_year,
    on=["county", "year"],
    how="outer"
)

agg_county_year["eq_per_100_properties"] = (
    agg_county_year["n_earthquakes"] /
    (agg_county_year["n_properties"] / 100).replace({0: None})
)

agg_county_year = agg_county_year.sort_values(
    by="intensity_score", ascending=False
)

# ---------- STATE + YEAR ----------
agg_state_year = agg_eq_state_year.merge(
    agg_re_state_year,
    on=["state", "year"],
    how="outer"
)

agg_state_year["eq_per_100_properties"] = (
    agg_state_year["n_earthquakes"] /
    (agg_state_year["n_properties"] / 100).replace({0: None})
)

agg_state_year = agg_state_year.sort_values(
    by="intensity_score", ascending=False
)


# ---------------------------
# 5. SAVE RESULTS
# ---------------------------

agg_county_year.to_csv("data/agg_county_year.csv", index=False)
agg_state_year.to_csv("data/agg_state_year.csv", index=False)

print("✔ Saved: data/agg_county_year.csv")
print("✔ Saved: data/agg_state_year.csv")

print("✔ County+Year sample:")
print(agg_county_year.head())

print("✔ State+Year sample:")
print(agg_state_year.head())

✔ Saved: data/agg_county_year.csv
✔ Saved: data/agg_state_year.csv
✔ County+Year sample:
               county  year  n_earthquakes  avg_magnitude  max_magnitude  \
58664      Sweetwater  1995            1.0       5.180000           5.18   
704    Aleutians West  1991            3.0       4.966667           5.40   
61046      Tuscaloosa  1999            1.0       4.800000           4.80   
705    Aleutians West  1993            6.0       4.583333           5.10   
706    Aleutians West  1994            3.0       4.433333           5.00   

       avg_depth  intensity_score  n_properties  avg_price  median_price  \
58664  -1.400000             5.18           NaN        NaN           NaN   
704    58.900000             5.14           NaN        NaN           NaN   
61046   1.000000             4.80           1.0   285000.0      285000.0   
705    34.633333             4.79           NaN        NaN           NaN   
706    30.966667             4.66           NaN        NaN           NaN  