In [None]:
from google.colab import files

# Prompt for the dataset in the browser and keep the returned mapping of
# uploaded files (name -> raw bytes) for later cells.
uploaded = files.upload()

# Colab does not overwrite on a name clash: re-uploading ev_data.csv is stored
# as "ev_data (1).csv", so code that opens "ev_data.csv" by name would silently
# keep reading the PREVIOUS upload. Write the fresh bytes back under each
# uploaded name so the by-name path always holds the newest data.
# NOTE(review): assumes the dict keys are the original browser file names —
# confirm against the Colab version in use.
for uploaded_name, payload in uploaded.items():
    with open(uploaded_name, "wb") as fh:
        fh.write(payload)

Saving ev_data.csv to ev_data (1).csv


In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, roc_auc_score

# ---------------------------------------------------------------------------
# Model 4: logistic regression for "EV desert" vs. non-desert ZIPs.
#
# Steps: load the CSV, label deserts by the bottom decile of the EV metric,
# keep complete cases for the predictors, fit a standardized logistic
# regression on a stratified 80/20 split, and report test metrics plus
# standardized coefficients.
# ---------------------------------------------------------------------------

ev_data = pd.read_csv("ev_data.csv")
metric_col = "EV_perc"

# Only rows with an observed metric can be labelled.
m4 = ev_data.dropna(subset=[metric_col]).copy()

# A ZIP is a "desert" when its metric is at or below the 10th percentile of
# all rows that have the metric. The cut-off is computed BEFORE the
# feature-completeness filter below, so it describes the labelled universe,
# not only the rows that end up in the model.
desert_threshold = np.nanpercentile(m4[metric_col], 10)

m4["ev_desert"] = (m4[metric_col] <= desert_threshold).astype(int)

print("Metric used for deserts:", metric_col)
print(f"Desert threshold (10th percentile): {desert_threshold}")
print(f"Share of ZIPs that are deserts: {m4['ev_desert'].mean():.3f}")

feature_cols = [
    "Latino_perc",
    "White_perc",
    "Black_perc",
    "Asian_perc",
    "Median_Household_Income",
    "BachOrHigher_perc",
    "Zillow_Home_Value_Index",
]

X = m4[feature_cols].copy()
y = m4["ev_desert"].copy()

# Complete-case filter: drop rows with any missing predictor.
mask = X.notna().all(axis=1) & y.notna()
X = X[mask]
y = y[mask]

# Report the sample that is actually modeled. Previously "Final sample size"
# was printed from len(m4) before this filter, which overstated the number of
# rows the model sees and implied a desert share that the modeling sample
# does not necessarily have.
print(f"ZIPs with {metric_col} observed: {len(m4)}")
print(f"Dropped for missing features: {len(m4) - len(X)}")
print(f"Share of deserts in modeling sample: {y.mean():.3f}")
print("Final sample size:", len(X))

# Stratify so the minority (desert) class keeps the same share in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=141,
    stratify=y
)

# Scaling lives inside the pipeline so it is fit on the training split only.
model4 = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("logit", LogisticRegression(max_iter=1000)),
    ]
)

model4.fit(X_train, y_train)

y_pred = model4.predict(X_test)
y_prob = model4.predict_proba(X_test)[:, 1]  # probability of class 1 (desert)

print("\nClassification report (Model 4: EV desert vs non-desert):")
print(classification_report(y_test, y_pred))

roc = roc_auc_score(y_test, y_prob)
print(f"ROC AUC: {roc:.12f}")

# Coefficients are per one standard deviation of each feature because the
# model is fit on StandardScaler output; exp(coef) is the matching odds ratio.
# NOTE(review): the four *_perc race/ethnicity columns presumably sum to
# roughly 100, which would make their individual coefficients hard to
# interpret in isolation — confirm before drawing per-group conclusions.
logit = model4.named_steps["logit"]
coefs = logit.coef_[0]

coef_df = pd.DataFrame({
    "feature": feature_cols,
    "coef": coefs,
    "odds_ratio": np.exp(coefs),
}).sort_values("coef", ascending=False)

print("\nStandardized coefficients and odds ratios:")
print(coef_df.to_string(index=False))


Metric used for deserts: EV_perc
Desert threshold (10th percentile): 0.4059999999999999
Share of ZIPs that are deserts: 0.100
Final sample size: 1427

Classification report (Model 4: EV desert vs non-desert):
              precision    recall  f1-score   support

           0       0.96      0.98      0.97       251
           1       0.80      0.64      0.71        25

    accuracy                           0.95       276
   macro avg       0.88      0.81      0.84       276
weighted avg       0.95      0.95      0.95       276

ROC AUC: 0.970039840637

Standardized coefficients and odds ratios:
                feature      coef  odds_ratio
            Latino_perc  0.136688    1.146470
             White_perc  0.052973    1.054401
             Black_perc -0.591999    0.553220
Median_Household_Income -1.367140    0.254835
             Asian_perc -1.777626    0.169039
      BachOrHigher_perc -2.004434    0.134736
Zillow_Home_Value_Index -2.633824    0.071803
