In [2]:
# import libraries
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import chi2
from sklearn.preprocessing import KBinsDiscretizer
from scipy.stats import spearmanr
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the data
df = pd.read_csv("Social_Vulnerability_Index_2018_-_United_States__tract_20250119.csv")

# # Check the data overview
print("Column names:")
print(df.columns)

print("\nSummary of 'RPL_THEMES' before preprocessing:")
print(df["RPL_THEMES"].describe())

In [None]:
# Keep and copy data where 'RPL_THEMES' is between 0 and 1 
df1 = df[df["RPL_THEMES"].between(0, 1)].copy()

print("\nSummary of 'RPL_THEMES' (after filtering):")
print(df1["RPL_THEMES"].describe())

In [16]:
# Divide 'RPL_THEMES' into three classes (low, medium, high) using tertiles 
df1['RPL_THEMES_BIN'] = pd.qcut(df1['RPL_THEMES'], q=3, labels=[0, 1, 2])

# Check the distribution of the new classes
print("\nDistribution of 'RPL_THEMES_BIN':")
print(df1['RPL_THEMES_BIN'].value_counts())



'RPL_THEMES_BIN'の分布:
RPL_THEMES_BIN
0    24060
2    24057
1    24056
Name: count, dtype: int64


In [18]:
# Define target and feature variables 
target_variable = 'RPL_THEMES_BIN'
feature_variables = ['EP_POV', 'EP_UNEMP', 'EP_NOHSDP', 'EP_MINRTY', 'EP_AGE65']
X = df1[feature_variables]
y = df1[target_variable].astype(int)  # Convert categorical to integer

# Check the first few rows of feature variables
print("\nFirst few rows of features:")
print(X.head())

# Standardize the data
X = (X - X.mean()) / X.std()

print("\nFirst few rows of target variable:")
print(y.head())

# Check for missing values
print("\nNumber of missing values:")
print(X.isnull().sum())

# Fill missing values with the median  
X = X.fillna(X.median())

# Check missing values again 
print("\nNumber of missing values (after imputation):")
print(X.isnull().sum())


特徴量の先頭:
                EP_POV          EP_UNEMP          EP_NOHSDP  \
24   8.500000000000000 8.900000000000000  2.000000000000000   
107  7.800000000000000 5.600000000000000 11.699999999999999   
198  8.000000000000000 2.600000000000000  8.699999999999999   
211 11.400000000000000 4.700000000000000  1.300000000000000   
233 17.899999999999999 2.100000000000000  8.800000000000001   

             EP_MINRTY           EP_AGE65  
24  20.300000000000001 11.199999999999999  
107 33.899999999999999 22.100000000000001  
198  1.700000000000000 21.000000000000000  
211  3.500000000000000 20.600000000000001  
233  4.200000000000000 29.699999999999999  

目的変数の先頭:
24     1
107    2
198    1
211    0
233    1
Name: RPL_THEMES_BIN, dtype: int64

欠損値の数:
EP_POV       0
EP_UNEMP     0
EP_NOHSDP    0
EP_MINRTY    0
EP_AGE65     0
dtype: int64

欠損値の数（補完後）:
EP_POV       0
EP_UNEMP     0
EP_NOHSDP    0
EP_MINRTY    0
EP_AGE65     0
dtype: int64


In [37]:
# Set float display to 8 decimal places  
pd.options.display.float_format = '{:0.15f}'.format

# Apply KBinsDiscretizer
kbd = KBinsDiscretizer(n_bins=3, encode='ordinal', strategy='quantile')
X_binned = kbd.fit_transform(X)
X_binned = pd.DataFrame(X_binned, columns=feature_variables)

print("\nFirst few rows of data after binning into 3 bins:")
print(X_binned.head())


連続変数を3ビンに分割したデータの先頭:
             EP_POV          EP_UNEMP         EP_NOHSDP         EP_MINRTY  \
0 1.000000000000000 2.000000000000000 0.000000000000000 1.000000000000000   
1 0.000000000000000 1.000000000000000 1.000000000000000 1.000000000000000   
2 0.000000000000000 0.000000000000000 1.000000000000000 0.000000000000000   
3 1.000000000000000 1.000000000000000 0.000000000000000 0.000000000000000   
4 2.000000000000000 0.000000000000000 1.000000000000000 0.000000000000000   

           EP_AGE65  
0 0.000000000000000  
1 2.000000000000000  
2 2.000000000000000  
3 2.000000000000000  
4 2.000000000000000  


In [38]:
# Perform Chi-square test 
chi2_stat, p_values = chi2(X_binned, y)

# Create a DataFrame for statistics and p-values
chi2_results = pd.DataFrame({
    'Feature': feature_variables,
    'Chi2 Statistic': chi2_stat,
    'p-value': p_values
})

# Sort by Chi-square statistic and get the top 5 
chi2_top5 = chi2_results.sort_values(by='Chi2 Statistic', ascending=False)

print("\nTop 5 important features based on Chi-square test:") 
print(chi2_top5[['Feature', 'Chi2 Statistic', 'p-value']])


カイ二乗検定による特徴量重要度（トップ5）:
     Feature        Chi2 Statistic           p-value
2  EP_NOHSDP 26418.683296464521845 0.000000000000000
0     EP_POV 26048.013400624549831 0.000000000000000
3  EP_MINRTY 13713.022703648970491 0.000000000000000
1   EP_UNEMP 13316.504565824296151 0.000000000000000
4   EP_AGE65  2287.042858585005888 0.000000000000000


In [39]:
# Calculate feature importance using Spearman correlation
spearman_results = []
for feature in feature_variables:
    corr, p = spearmanr(X[feature], y)
    spearman_results.append({
        'Feature': feature,
        'Spearman Correlation': corr,  # Signed correlation 
        'p-value': round(p, 8)
    })

spearman_df = pd.DataFrame(spearman_results)

# Sort by absolute correlation and get the top 5
spearman_top5 = spearman_df.reindex(
    spearman_df['Spearman Correlation'].abs().sort_values(ascending=False).index
)

print("\nTop 5 important features based on Spearman correlation:")  
print(spearman_top5[['Feature', 'Spearman Correlation', 'p-value']])


スピアマン相関による特徴量重要度（トップ5）:
     Feature  Spearman Correlation           p-value
2  EP_NOHSDP     0.777560089369926 0.000000000000000
0     EP_POV     0.769867580747304 0.000000000000000
1   EP_UNEMP     0.561995184353346 0.000000000000000
3  EP_MINRTY     0.546526127253726 0.000000000000000
4   EP_AGE65    -0.201356601681077 0.000000000000000


In [40]:
# Calculate feature importance using Kendall's Tau 
from scipy.stats import kendalltau

kendall_results = []

for feature in feature_variables:

    corr, p = kendalltau(X[feature], y)
    kendall_results.append({
        'Feature': feature,
        'Kendall Tau': corr,
        'p-value': round(p, 8)
    })

kendall_df = pd.DataFrame(kendall_results)

# Sort by absolute value and get the top 5
kendall_top5 = kendall_df.reindex(
    kendall_df['Kendall Tau'].abs().sort_values(ascending=False).index
)

print("\nTop 5 important features based on Kendall's Tau:") 
print(kendall_top5[['Feature', 'Kendall Tau', 'p-value']])



Kendall's Tauによる特徴量重要度（トップ7）:
     Feature        Kendall Tau           p-value
2  EP_NOHSDP  0.638781154620952 0.000000000000000
0     EP_POV  0.632952621889376 0.000000000000000
1   EP_UNEMP  0.446652595041508 0.000000000000000
3  EP_MINRTY  0.426741083447876 0.000000000000000
4   EP_AGE65 -0.155548319323142 0.000000000000000


In [41]:
# Build Random Forest model 
rf = RandomForestClassifier(random_state=42)
rf.fit(X, y)

# Get feature importances 
rf_importances = pd.Series(rf.feature_importances_, index=feature_variables)
rf_top5 = rf_importances.sort_values(ascending=False)

print("\nTop 5 important features based on Random Forest:") 
print(rf_top5)


Random Forest による特徴量重要度（トップ9）:
EP_POV      0.298084056150380
EP_NOHSDP   0.295770274360944
EP_MINRTY   0.176786814801612
EP_UNEMP    0.128568353794457
EP_AGE65    0.100790500892607
dtype: float64


In [42]:
from xgboost import XGBClassifier

# Build and train XGBoost model
xgb = XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='mlogloss')
xgb.fit(X, y)

# Get feature importances  
xgb_importances = pd.Series(xgb.feature_importances_, index=feature_variables)
xgb_top5 = xgb_importances.sort_values(ascending=False)

print("\nTop 5 important features based on XGBoost:")  
print(xgb_top5)


Parameters: { "use_label_encoder" } are not used.




XGBoost による特徴量重要度（トップ9）:
EP_POV      0.463402390480042
EP_NOHSDP   0.352279692888260
EP_MINRTY   0.090007804334164
EP_UNEMP    0.059058733284473
EP_AGE65    0.035251379013062
dtype: float32


In [45]:
# Build SVM model
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

# Create pipeline
svc = make_pipeline(StandardScaler(), SVC(kernel='linear', random_state=42))
svc.fit(X, y)

# Get feature weights
svc_weights = pd.Series(svc.named_steps['svc'].coef_[0], index=feature_variables)
svc_top5 = svc_weights.abs().sort_values(ascending=False)

print("\nTop 5 important features based on SVM:")
print(svc_top5)


SVM による特徴量重要度（トップ9）:
EP_NOHSDP   2.156098304974876
EP_POV      1.815424765759417
EP_UNEMP    0.635572181303814
EP_MINRTY   0.597628749806063
EP_AGE65    0.180090928876780
dtype: float64


In [46]:
# Build Lasso regression model 
from sklearn.linear_model import LogisticRegression

# Create pipeline 
lasso = make_pipeline(StandardScaler(), LogisticRegression(penalty='l1', solver='liblinear', random_state=42))
lasso.fit(X, y)

# Get feature weights
lasso_weights = pd.Series(lasso.named_steps['logisticregression'].coef_[0], index=feature_variables)
lasso_top5 = lasso_weights.abs().sort_values(ascending=False)
print("\nTop 5 important features based on Lasso regression:")
print(lasso_top5)


Lasso回帰による特徴量重要度（トップ9）:
EP_NOHSDP   3.115316685555746
EP_POV      2.406547358206410
EP_UNEMP    0.919728992228639
EP_MINRTY   0.867641134813764
EP_AGE65    0.284622655895218
dtype: float64


In [47]:
# Build Naive Bayes model
from sklearn.naive_bayes import GaussianNB

# Create pipeline
nb = make_pipeline(StandardScaler(), GaussianNB())
nb.fit(X, y)

# Get feature weights
nb_weights = pd.Series(nb.named_steps['gaussiannb'].theta_[0], index=feature_variables)
nb_top5 = nb_weights.abs().sort_values(ascending=False)

print("\nTop 5 important features based on Naive Bayes:") 
print(nb_top5)


Naive Bayes による特徴量重要度（トップ9）:
EP_NOHSDP   0.753779592307598
EP_POV      0.745852869003636
EP_MINRTY   0.588756756022015
EP_UNEMP    0.550160000837189
EP_AGE65    0.173679567560513
dtype: float64


In [48]:
# Get top 5 features for each method in ranking order
top_features_rf = rf_importances.sort_values(ascending=False).index.tolist()
top_features_xgb = xgb_importances.sort_values(ascending=False).index.tolist()
top_features_svc = svc_weights.abs().sort_values(ascending=False).index.tolist()
top_features_lasso = lasso_weights.abs().sort_values(ascending=False).index.tolist()
top_features_nb = nb_weights.abs().sort_values(ascending=False).index.tolist()

# Create a rank list from 1 to 5  
rank = list(range(1, 6))

# Create a combined DataFrame  
df_rank = pd.DataFrame({
    'Rank': rank,
    'Random Forest': top_features_rf,
    'XGBoost': top_features_xgb,
    'Lasso': top_features_lasso,
    'SVM': top_features_svc,
    'Naive Bayes': top_features_nb
})

print("\nFeature ranking by each method:")  
print(df_rank)


各手法による特徴量ランキング:
   Rank Random Forest    XGBoost      Lasso        SVM Naive Bayes
0     1        EP_POV     EP_POV  EP_NOHSDP  EP_NOHSDP   EP_NOHSDP
1     2     EP_NOHSDP  EP_NOHSDP     EP_POV     EP_POV      EP_POV
2     3     EP_MINRTY  EP_MINRTY   EP_UNEMP   EP_UNEMP   EP_MINRTY
3     4      EP_UNEMP   EP_UNEMP  EP_MINRTY  EP_MINRTY    EP_UNEMP
4     5      EP_AGE65   EP_AGE65   EP_AGE65   EP_AGE65    EP_AGE65


In [49]:
# Get top 5 features for each method in ranking order
top_features_rf = rf_importances.sort_values(ascending=False).index.tolist()
top_features_chi2 = chi2_top5['Feature'].tolist()
top_features_spearman = spearman_top5['Feature'].tolist()

# Create a rank list from 1 to 5 
rank = [1, 2, 3, 4, 5]

# Get ranked features for each method 
features_rf_ranked = rf_importances.sort_values(ascending=False).index.tolist()
features_chi2_ranked = chi2_top5['Feature'].tolist()
features_spearman_ranked = spearman_top5['Feature'].tolist()

# Create a combined DataFrame 
rank_table = pd.DataFrame({
    'Rank': rank,
    'Random Forest Importance': features_rf_ranked,
    'Chi-Squared Statistic': features_chi2_ranked,
    'Spearman Correlation': features_spearman_ranked
})

# Set index to Rank
rank_table = rank_table.set_index('Rank')

print("\nCombined top 5 features from each method:")  
print(rank_table)


各手法のトップ5特徴量を統合した結果:
     Random Forest Importance Chi-Squared Statistic Spearman Correlation
Rank                                                                    
1                      EP_POV             EP_NOHSDP            EP_NOHSDP
2                   EP_NOHSDP                EP_POV               EP_POV
3                   EP_MINRTY             EP_MINRTY             EP_UNEMP
4                    EP_UNEMP              EP_UNEMP            EP_MINRTY
5                    EP_AGE65              EP_AGE65             EP_AGE65


項目
* EP_POV → 貧困率
* EP_NOHSDP → 高卒未満の割合
* EP_UNEMP → 失業率
* EP_MINRTY → 少数民族(白人、非ヒスパニック系を除くすべての人)
* EP_AGE65 → 65歳以上の割合

* RPL_THEMES → その地域の脆弱性を%で表す

In [50]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

# VIFの計算関数
def calculate_vif(X):
    vif_data = pd.DataFrame()
    vif_data["feature"] = X.columns
    vif_data["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
    return vif_data.sort_values(by='VIF', ascending=False)

# 最初のVIF計算
vif_data = calculate_vif(X)
print("\n初回のVIF:")
print(vif_data)


初回のVIF:
     feature               VIF
0     EP_POV 2.024807641178533
2  EP_NOHSDP 1.915279483833580
3  EP_MINRTY 1.840969557493327
1   EP_UNEMP 1.613642996203697
4   EP_AGE65 1.209273701564790


### 全ての説明変数がVIF < 5 なので、削除する必要はない。