# Analyse compl√®te : S√©ismes & Prix Immobiliers aux √âtats-Unis  
Ce notebook regroupe :
- Deux cartes US (s√©ismes & prix immobiliers)
- Un scatter avec r√©gression
- Une analyse High/Low earthquake group√©e
- Une visualisation centr√©e (taille de points)

In [9]:
import pandas as pd
import altair as alt
import numpy as np
import us
from vega_datasets import data
import os

# Load dataset
df = pd.read_excel("data/Data_aggregated_v2.xlsx")
df = df.dropna(subset=['Total Earthquake', 'Median Price', 'Year'])

# Useful renaming
df['price'] = df['Median Price']
df['earthquake'] = df['Total Earthquake']

df.head()

Unnamed: 0,State,Year,Total Earthquake,Average Magnitude,Average Significance,Total Tsunami,Average House Price,Median Price,Average Lot,Average Size,Total Sales,price,earthquake
0,Massachusetts,2009,1,2.3,85.0,0,861476.2,545000.0,0.894483,1999.938202,181.0,545000.0,1
1,Massachusetts,2011,1,2.1,76.0,0,1231142.0,519900.0,10.3025,1666.218182,57.0,519900.0,1
2,Massachusetts,2000,1,3.3,168.0,0,1212126.0,619995.0,1.109106,2601.492647,143.0,619995.0,1
3,Massachusetts,2003,1,2.98,137.0,0,784597.4,569950.0,1.895687,2167.260638,192.0,569950.0,1
4,Massachusetts,2013,1,2.3,81.0,0,588468.5,460000.0,1.591905,1592.470588,35.0,460000.0,1


## G√©n√©ration des codes FIPS pour les cartes US

In [10]:
# FIPS codes
df["fips"] = df["State"].apply(lambda x: int(us.states.lookup(x).fips))

# Fill NaNs
df["Total Earthquake_filled"] = df["earthquake"].fillna(0)
df["Median_Price_filled"] = df["price"].fillna(0)

# TopoJSON
us_states = alt.topo_feature(data.us_10m.url, "states")

df.head()

Unnamed: 0,State,Year,Total Earthquake,Average Magnitude,Average Significance,Total Tsunami,Average House Price,Median Price,Average Lot,Average Size,Total Sales,price,earthquake,fips,Total Earthquake_filled,Median_Price_filled
0,Massachusetts,2009,1,2.3,85.0,0,861476.2,545000.0,0.894483,1999.938202,181.0,545000.0,1,25,1,545000.0
1,Massachusetts,2011,1,2.1,76.0,0,1231142.0,519900.0,10.3025,1666.218182,57.0,519900.0,1,25,1,519900.0
2,Massachusetts,2000,1,3.3,168.0,0,1212126.0,619995.0,1.109106,2601.492647,143.0,619995.0,1,25,1,619995.0
3,Massachusetts,2003,1,2.98,137.0,0,784597.4,569950.0,1.895687,2167.260638,192.0,569950.0,1,25,1,569950.0
4,Massachusetts,2013,1,2.3,81.0,0,588468.5,460000.0,1.591905,1592.470588,35.0,460000.0,1,25,1,460000.0


## S√©lection de l‚Äôann√©e

In [11]:
year = 2015
df_year = df[df["Year"] == year]
df_year.head()

Unnamed: 0,State,Year,Total Earthquake,Average Magnitude,Average Significance,Total Tsunami,Average House Price,Median Price,Average Lot,Average Size,Total Sales,price,earthquake,fips,Total Earthquake_filled,Median_Price_filled
60,Maryland,2015,1,1.91,78.0,0,502597.752182,399000.0,1.448458,2218.116105,573.0,399000.0,1,24,1,399000.0
86,Indiana,2015,1,2.33,88.0,0,311113.325967,249900.0,1.55514,2355.750733,363.0,249900.0,1,18,1,249900.0
106,Massachusetts,2015,2,1.65,48.5,0,624176.190476,500000.0,0.936154,1820.55,21.0,500000.0,2,25,2,500000.0
122,Rhode Island,2015,2,2.15,96.5,0,507355.028986,384900.0,0.554655,1782.590909,69.0,384900.0,2,44,2,384900.0
135,Pennsylvania,2015,2,1.935,73.0,0,367729.31297,272450.0,1.174404,1956.004274,1064.0,272450.0,2,42,2,272450.0


# Carte US : Intensit√© des S√©ismes et Prix M√©dian des Maisons

In [12]:
heatmap_eq = alt.Chart(us_states).mark_geoshape().encode(
    color=alt.Color(
        "Total Earthquake_filled:Q",
        scale=alt.Scale(range=["#ffe6e6", "#800000"]),
        title="Earthquake Count"
    ),
    tooltip=["State:N", "Total Earthquake_filled:Q"]
).transform_lookup(
    lookup="id",
    from_=alt.LookupData(df_year, "fips", ["Total Earthquake_filled", "State"])
).project("albersUsa").properties(
    title=f"Earthquakes ({year})", width=400, height=300
)

heatmap_price = alt.Chart(us_states).mark_geoshape().encode(
    color=alt.Color(
        "Median_Price_filled:Q",
        scale=alt.Scale(range=["#e6f2ff", "#0055aa"]),
        title="Median House Price"
    ),
    tooltip=["State:N", "Median_Price_filled:Q"]
).transform_lookup(
    lookup="id",
    from_=alt.LookupData(df_year, "fips", ["Median_Price_filled", "State"])
).project("albersUsa").properties(
    title=f"Median Price ({year})", width=400, height=300
)

(heatmap_eq | heatmap_price).resolve_scale(color="independent")

# Corr√©lation S√©ismes ‚Üî Prix

## üîé Analyse de la Relation entre l‚ÄôActivit√© Sismique et les Prix Immobiliers  
### Une visualisation combinant √©chelle logarithmique, intensit√© sismique et tendance g√©n√©rale

Cette visualisation explore comment le nombre de s√©ismes dans un √âtat am√©ricain est associ√© au prix m√©dian de l‚Äôimmobilier.  
Elle combine plusieurs √©l√©ments compl√©mentaires :

- **Scatter plot** : chaque point repr√©sente un √âtat pour l‚Äôann√©e s√©lectionn√©e.  
- **√âchelle logarithmique sur l‚Äôaxe des X** : permet de visualiser correctement des niveaux de sismicit√© tr√®s diff√©rents (de 1 √† plusieurs centaines).  
- **Couleur continue (Viridis)** : encode l‚Äôintensit√© des s√©ismes, renfor√ßant la lecture des variations.  
- **Regression Lin√©aire** : une tendance lin√©aire qui met en √©vidence la relation g√©n√©rale.  

L‚Äôobjectif final est d‚Äôoffrir une lecture claire, √©quilibr√©e et robuste de la relation potentielle entre l‚Äôactivit√© sismique et les prix domiciliaires, malgr√© la grande variabilit√© des √âtats.

In [13]:
# --- Prepare data ---
df_corr = df_year[["State", "earthquake", "price"]].dropna()
df_corr = df_corr[df_corr["earthquake"] > 0]

color_scale = alt.Scale(scheme="viridis")

# Scatter
scatter = (
    alt.Chart(df_corr)
    .mark_circle(size=90, opacity=0.75)
    .encode(
        x=alt.X(
            "earthquake:Q",
            title="Earthquakes (log scale)",
            scale=alt.Scale(type="log"),
            axis=alt.Axis(format="~s")
        ),
        y=alt.Y(
            "price:Q",
            title="Median House Price ($)",
            scale=alt.Scale(zero=False, padding=10)
        ),
        color=alt.Color("earthquake:Q", title="Earthquake Intensity", scale=color_scale),
        tooltip=[
            alt.Tooltip("State:N"),
            alt.Tooltip("earthquake:Q", title="Earthquakes"),
            alt.Tooltip("price:Q", title="Median Price ($)")
        ]
    )
)

# LOESS trendline
smooth = (
    alt.Chart(df_corr)
    .transform_regression("earthquake", "price")
    .mark_line(color="black", size=3)
    .encode(
        x=alt.X("earthquake:Q", scale=alt.Scale(type="log")),
        y="price:Q"
    )
)

# Combine
correlation = (
    (scatter + smooth)
    .properties(
        width=900,
        height=550,
        title="Relationship between Earthquake Frequency and House Prices (Log Scale)"
    )
    .configure_axis(
        gridOpacity=0.20,
        labelFontSize=12,
        titleFontSize=14
    )
    .configure_title(
        fontSize=22,
        anchor="start"
    )
)

correlation

# Classification des √âtats : High vs Low Earthquake
Seuls les √âtats dans les quantiles 20% et 80% sont conserv√©s.

In [14]:
state_totals = df.groupby('State')['earthquake'].sum().sort_values()

threshold_high = state_totals.quantile(0.8)
threshold_low = state_totals.quantile(0.2)

def classify_state(state):
    total = state_totals[state]
    if total >= threshold_high:
        return 'High Earthquake'
    elif total <= threshold_low:
        return 'Low Earthquake'
    else:
        return 'Medium'

df['quake_group'] = df['State'].apply(classify_state)
df_selected = df[df['quake_group'].isin(['High Earthquake', 'Low Earthquake'])]

df_selected.head()

Unnamed: 0,State,Year,Total Earthquake,Average Magnitude,Average Significance,Total Tsunami,Average House Price,Median Price,Average Lot,Average Size,Total Sales,price,earthquake,fips,Total Earthquake_filled,Median_Price_filled,quake_group
10,Oklahoma,2000,1,2.6,104.0,0,241992.946903,180000.0,3.007117,1840.193878,113.0,180000.0,1,40,1,180000.0,High Earthquake
11,Louisiana,2000,1,4.3,284.0,0,424893.0,319000.0,1.721071,2338.295082,65.0,319000.0,1,22,1,319000.0,Low Earthquake
15,Louisiana,2001,1,3.6,199.0,0,564450.0,347500.0,1.120678,2570.181818,68.0,347500.0,1,22,1,347500.0,Low Earthquake
21,Rhode Island,2017,1,1.6,39.0,0,466763.495575,379900.0,0.513163,1835.981818,113.0,379900.0,1,44,1,379900.0,Low Earthquake
22,Rhode Island,2020,1,1.9,58.0,0,512693.081967,364450.0,185.14451,2082.563025,122.0,364450.0,1,44,1,364450.0,Low Earthquake


# Visualisation centr√©e (taille des points normalis√©e)

In [15]:
quake_scaled = df_selected['earthquake'].copy()
quake_scaled = np.sqrt(quake_scaled)
quake_scaled = (quake_scaled - quake_scaled.min()) / (quake_scaled.max() - quake_scaled.min())
quake_scaled = quake_scaled * 200 + 20
df_selected['quake_size'] = quake_scaled

median_prices = (
    df_selected.groupby(['Year', 'quake_group'])
    .agg({'price': 'median'})
    .reset_index()
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_selected['quake_size'] = quake_scaled


# Evolution des prix selon High / Low Earthquake

In [16]:
import altair as alt
import numpy as np
import pandas as pd

# Calculer min et max par ann√©e
yearly_extremes = df_selected.groupby('Year').agg(
    price_min=('price','min'),
    price_max=('price','max')
).reset_index()

# D√©finir buffer pour ne pas √©craser le graphique
buffer = 100000
ymin = yearly_extremes['price_min'].min() - buffer
ymax = yearly_extremes['price_max'].max() + buffer

# Filtrer scatter pour ne pas d√©passer le min/max ¬± buffer
df_filtered = df_selected[(df_selected['price'] >= ymin) & (df_selected['price'] <= ymax)].copy()

# Chart
chart = alt.layer(
    # Scatter points
    alt.Chart(df_filtered).mark_circle().encode(
        x='Year:O',
        y='price:Q',
        size=alt.Size('quake_size:Q', legend=None),
        color=alt.value('#ff7f0e'),
        tooltip=['State:N', 'Year:O', 'earthquake:Q', 'price:Q']
    ),
    # Courbe prix max
    alt.Chart(yearly_extremes).mark_line(size=3, color='#d62728').encode(
        x='Year:O',
        y='price_max:Q',
        tooltip=['Year:O','price_max:Q']
    ),
    # Courbe prix min
    alt.Chart(yearly_extremes).mark_line(size=3, color='#1f77b4').encode(
        x='Year:O',
        y='price_min:Q',
        tooltip=['Year:O','price_min:Q']
    )
).properties(
    width=900,
    height=500,
    title='House Prices Over Time ‚Äì Min and Max by Year'
).encode(
    y=alt.Y('price:Q', scale=alt.Scale(domain=[ymin, ymax]))
).configure_axis(
    labelFontSize=12,
    titleFontSize=14,
    gridOpacity=0.2
).configure_title(
    fontSize=18,
    anchor='start'
)

chart

In [24]:
# Assumer que df_selected contient toutes les ann√©es
df_corr_all = df_selected[["State", "Year", "earthquake", "price"]].dropna()
df_corr_all = df_corr_all[df_corr_all["earthquake"] > 0]

color_scale = alt.Scale(scheme="viridis")

# Scatter pour toutes les ann√©es
scatter_all = (
    alt.Chart(df_corr_all)
    .mark_circle(size=90, opacity=0.6)
    .encode(
        x=alt.X(
            "earthquake:Q",
            title="Earthquakes (log scale)",
            scale=alt.Scale(type="log"),
            axis=alt.Axis(format="~s")
        ),
        y=alt.Y(
            "price:Q",
            title="Median House Price ($)",
            scale=alt.Scale(zero=False, padding=10)
        ),
        color=alt.Color("earthquake:Q", title="Earthquake Intensity", scale=color_scale),
        tooltip=[
            alt.Tooltip("State:N"),
            alt.Tooltip("Year:O"),
            alt.Tooltip("earthquake:Q", title="Earthquakes"),
            alt.Tooltip("price:Q", title="Median Price ($)")
        ]
    )
)

# LOESS trendline sur toutes les ann√©es
smooth_all = (
    alt.Chart(df_corr_all)
    .transform_regression("earthquake", "price")
    .mark_line(color="black", size=3)
    .encode(
        x=alt.X("earthquake:Q", scale=alt.Scale(type="log")),
        y="price:Q"
    )
)

# Combinaison
correlation_all = (
    (scatter_all + smooth_all)
    .properties(
        width=900,
        height=550,
        title="Relationship between Earthquake Frequency and House Prices (All Years, Log Scale)"
    )
    .configure_axis(
        gridOpacity=0.20,
        labelFontSize=12,
        titleFontSize=14
    )
    .configure_title(
        fontSize=22,
        anchor="start"
    )
)
correlation_all

In [None]:
# Base scatter
base = alt.Chart(df_selected).mark_circle(size=60, opacity=0.5).encode(
    x=alt.X("earthquake:Q", scale=alt.Scale(type="log"), title="Earthquakes (log scale)"),
    y=alt.Y("price:Q", title="Median House Price ($)"),
    tooltip=["State:N", "Year:O", "earthquake:Q", "price:Q"]
)

# R√©gression lin√©aire par ann√©e
smooth = alt.Chart(df_selected).transform_regression(
    "earthquake", "price", groupby=["Year"]  # par ann√©e
).mark_line(size=2, color="black").encode(
    x=alt.X("earthquake:Q", scale=alt.Scale(type="log")),
    y="price:Q"
)

# Scatter + ligne
layered = base + smooth

# Facet par ann√©e
chart = layered.facet(
    column=alt.Column("Year:O", title="Year")
).properties(
    title="House Prices vs Earthquakes ‚Äì Linear Trend per Year"
).configure_axis(
    labelFontSize=12,
    titleFontSize=14,
    gridOpacity=0.2
).configure_title(
    fontSize=18,
    anchor="start"
)

chart

# Focus sur un √©tat (Californie ou autre)