In [1]:
# ==============================================
# 1. Import Libraries
# ==============================================

import pandas as pd
import numpy as np
import geopandas as gpd
import matplotlib.pyplot as plt
import seaborn as sns
import requests
from scipy.stats import linregress
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
import warnings
warnings.filterwarnings("ignore")

In [3]:
# ==============================================
# 2. Define State Abbreviation Mapping
# ==============================================

# Lookup table: lowercase two-letter code -> lowercase full name.
# Entries are listed one per line so additions and typos are easy to spot.
state_abbrev_to_name = {
    # --- States ---
    'al': 'alabama',
    'ak': 'alaska',
    'az': 'arizona',
    'ar': 'arkansas',
    'ca': 'california',
    'co': 'colorado',
    'ct': 'connecticut',
    'de': 'delaware',
    'fl': 'florida',
    'ga': 'georgia',
    'hi': 'hawaii',
    'id': 'idaho',
    'il': 'illinois',
    'in': 'indiana',
    'ia': 'iowa',
    'ks': 'kansas',
    'ky': 'kentucky',
    'la': 'louisiana',
    'me': 'maine',
    'md': 'maryland',
    'ma': 'massachusetts',
    'mi': 'michigan',
    'mn': 'minnesota',
    'ms': 'mississippi',
    'mo': 'missouri',
    'mt': 'montana',
    'ne': 'nebraska',
    'nv': 'nevada',
    'nh': 'new hampshire',
    'nj': 'new jersey',
    'nm': 'new mexico',
    'ny': 'new york',
    'nc': 'north carolina',
    'nd': 'north dakota',
    'oh': 'ohio',
    'ok': 'oklahoma',
    'or': 'oregon',
    'pa': 'pennsylvania',
    'ri': 'rhode island',
    'sc': 'south carolina',
    'sd': 'south dakota',
    'tn': 'tennessee',
    'tx': 'texas',
    'ut': 'utah',
    'vt': 'vermont',
    'va': 'virginia',
    'wa': 'washington',
    'wv': 'west virginia',
    'wi': 'wisconsin',
    'wy': 'wyoming',
    # --- Federal district, territories, and the national aggregate ---
    'dc': 'district of columbia',
    'as': 'american samoa',
    'gu': 'guam',
    'mp': 'northern mariana islands',
    'pr': 'puerto rico',
    'vi': 'virgin islands',
    'us': 'united states',
}

# Names removed from every dataset below: the territories, Alaska and Hawaii,
# and the 'united states' aggregate row.
exclude_states = [
    'puerto rico',
    'guam',
    'virgin islands',
    'american samoa',
    'northern mariana islands',
    'alaska',
    'hawaii',
    'united states',
]

# Lowercased copy used for the `.isin(...)` filters in the loading cells.
exclude_states_lower = list(map(str.lower, exclude_states))


In [5]:
# ==============================================
# 3. Load and Process Cargo Fraud Data
# ==============================================

# Load the cargo fraud data
cargo_fraud = pd.read_csv('cargo_fraud_only.csv')

# Build a numeric 'Year' column: use 'data_year' when the file has it,
# otherwise fall back to the year of 'date_recovered'.
if 'data_year' in cargo_fraud.columns:
    # Coerce rather than cast directly: a blank or malformed year becomes NaN
    # (and is dropped just below) instead of aborting the whole cell.
    cargo_fraud['Year'] = pd.to_numeric(cargo_fraud['data_year'], errors='coerce')
else:
    cargo_fraud['date_recovered'] = pd.to_datetime(cargo_fraud['date_recovered'], errors='coerce')
    cargo_fraud['Year'] = cargo_fraud['date_recovered'].dt.year

# Rows without a usable year must go before the integer cast, because NaN
# cannot be stored in a plain int column. .copy() detaches the result from
# the filtered parent so the column assignments below are unambiguous.
cargo_fraud = cargo_fraud.dropna(subset=['Year']).copy()
cargo_fraud['Year'] = cargo_fraud['Year'].astype(int)

# Build a lowercase full-name 'State' column matching the other datasets.
if 'state_name' in cargo_fraud.columns:
    cargo_fraud['State'] = cargo_fraud['state_name'].str.lower().str.strip()
elif 'state_abbr' in cargo_fraud.columns:
    # Strip as well as lowercase so a padded code such as 'FL ' still maps;
    # otherwise it would become NaN and the row would be silently dropped.
    cargo_fraud['State'] = (
        cargo_fraud['state_abbr'].str.lower().str.strip().map(state_abbrev_to_name)
    )
else:
    raise KeyError("No 'state_name' or 'state_abbr' column found in cargo_fraud DataFrame.")

# Exclude rows with missing 'State' or 'Year'
cargo_fraud = cargo_fraud.dropna(subset=['State', 'Year'])

# Exclude territories, Alaska, Hawaii and the national aggregate.
cargo_fraud = cargo_fraud[~cargo_fraud['State'].isin(exclude_states_lower)].copy()

# Display the processed cargo fraud data
print("\nProcessed Cargo Fraud Data:")
print(cargo_fraud[['State', 'Year']].head())


Processed Cargo Fraud Data:
     State  Year
0  florida  2012
1  florida  2012
2  florida  2012
3  florida  2012
4  florida  2012


In [7]:
# ==============================================
# 4. Load and Process HPI Data
# ==============================================

# Load HPI data
hpi_data_url = "https://raw.githubusercontent.com/ryantangmj/ryantangmj.github.io/main/hpi_by_state.csv"
hpi_data = pd.read_csv(hpi_data_url)

# Keep relevant columns. .copy() gives an independent frame so the column
# assignments below do not write into a slice of the raw download.
hpi_data = hpi_data[["State", "Year", "HPI"]].copy()

# Make 'Year' numeric BEFORE comparing it: the range filter below needs
# numbers, and unparseable entries become NaN, which the filter rejects.
hpi_data["Year"] = pd.to_numeric(hpi_data["Year"], errors="coerce")

# Filter years between 2012 and 2022 (inclusive)
hpi_data = hpi_data[(hpi_data["Year"] >= 2012) & (hpi_data["Year"] <= 2022)].reset_index(drop=True)

# Standardize 'State' names
hpi_data['State'] = hpi_data['State'].str.lower().str.strip()

# Map state abbreviations to full names if necessary. The check is on the
# longest value: only when every entry is at most two characters long is the
# column treated as abbreviations.
if hpi_data['State'].str.len().max() == 2:
    hpi_data['State'] = hpi_data['State'].map(state_abbrev_to_name)

# Exclude territories, Alaska, Hawaii and the national aggregate.
hpi_data = hpi_data[~hpi_data['State'].isin(exclude_states_lower)].copy()

# Convert 'Year' to integer (safe here: NaN years were removed by the filter)
hpi_data['Year'] = hpi_data['Year'].astype(int)

# Display the first few rows
print("\nProcessed HPI Data:")
print(hpi_data.head())



Processed HPI Data:
     State  Year     HPI
0  alabama  2012  341.58
1  alabama  2013  339.66
2  alabama  2014  344.12
3  alabama  2015  352.11
4  alabama  2016  361.39


In [9]:
# ==============================================
# 5. Load and Process Poverty Data
# ==============================================

# Load poverty data
poverty_data = pd.read_csv('poverty_data.csv')  # Replace with the actual file path

# Reshape poverty data to long format: one row per (State, original column).
id_vars = ['State']
value_vars = [col for col in poverty_data.columns if col != 'State']

poverty_long = pd.melt(poverty_data, id_vars=id_vars, value_vars=value_vars,
                       var_name='Variable', value_name='Value')

# Split each original column name into a 4-digit year and the metric name.
# Raw strings are used so that `\d` reaches the regex engine as written
# instead of being an invalid string escape.
poverty_long['Year'] = poverty_long['Variable'].str.extract(r'(\d{4})', expand=False)

# Columns whose name carries no year cannot be placed on the State/Year grid;
# drop them here so the integer cast below cannot fail on NaN.
poverty_long = poverty_long.dropna(subset=['Year']).copy()
poverty_long['Year'] = poverty_long['Year'].astype(int)
poverty_long['Variable_Name'] = poverty_long['Variable'].str.replace(r' \d{4}', '', regex=True).str.strip()

# Pivot the data to have one row per 'State' and 'Year'
poverty_pivot = poverty_long.pivot_table(index=['State', 'Year'], columns='Variable_Name', values='Value', aggfunc='first').reset_index()
poverty_pivot.columns.name = None

# Standardize 'State' names
poverty_pivot['State'] = poverty_pivot['State'].str.lower().str.strip()

# Convert numeric columns to float. Thousands separators and percent signs
# are removed first; anything still unparseable (including blanks and 'nan')
# becomes NaN. The explicit float cast keeps the dtype float even when every
# value in a column happens to be a whole number.
numeric_cols = ['Total population', 'Number in poverty', 'Percentage poverty']
for col in numeric_cols:
    cleaned_text = (
        poverty_pivot[col]
        .astype(str)
        .str.replace(',', '', regex=False)
        .str.replace('%', '', regex=False)
    )
    poverty_pivot[col] = pd.to_numeric(cleaned_text, errors='coerce').astype(float)

# Rename columns for clarity
poverty_pivot = poverty_pivot.rename(columns={
    'Total population': 'Total_Population',
    'Number in poverty': 'Number_in_Poverty',
    'Percentage poverty': 'Poverty_Rate'
})

# Exclude territories, Alaska, Hawaii and the national aggregate.
poverty_pivot = poverty_pivot[~poverty_pivot['State'].isin(exclude_states_lower)].copy()

# Display the first few rows
print("\nProcessed Poverty Data:")
print(poverty_pivot.head())


Processed Poverty Data:
     State  Year  Number_in_Poverty  Poverty_Rate  Total_Population
0  alabama  2012              777.0          16.2            4808.0
1  alabama  2013              891.0          18.5            4807.0
2  alabama  2014              848.0          17.8            4765.0
3  alabama  2015              784.0          16.3            4820.0
4  alabama  2016              782.0          16.2            4821.0


In [10]:
# ==============================================
# 6. Load and Process Homelessness Data
# ==============================================

# Load homelessness data
homelessness_data_url = "https://raw.githubusercontent.com/ryantangmj/ryantangmj.github.io/main/homeless_data.csv"
homelessness_data = pd.read_csv(homelessness_data_url)

# Map state abbreviations to full state names. Whitespace is stripped as well
# as lowercased so a padded code still finds its entry in the lookup table.
homelessness_data['State'] = (
    homelessness_data['State'].str.lower().str.strip().map(state_abbrev_to_name)
)

# Exclude rows with missing 'State' (codes that are not in the lookup table)
homelessness_data = homelessness_data.dropna(subset=['State'])

# Exclude territories, Alaska, Hawaii and the national aggregate.
homelessness_data = homelessness_data[~homelessness_data['State'].isin(exclude_states_lower)]

# Rename the 'Change in Total Homelessness' columns to a bare year: take the
# text after the last ', ' and keep the part before the first '-'.
# NOTE(review): which end of the period that year represents depends on the
# source file's header format — confirm against the CSV.
new_column_names = {
    col: col.split(', ')[-1].split('-')[0] for col in homelessness_data.columns if 'Change in Total Homelessness' in col
}
# Reassign instead of renaming in place: the frame above is the result of a
# boolean filter, and in-place edits on such a result are ambiguous in pandas.
homelessness_data = homelessness_data.rename(columns=new_column_names)

# Melt the DataFrame to long format
df_homelessness = pd.melt(homelessness_data, id_vars=['State'], var_name='Year', value_name='homeless_rate_change')

# Clean 'homeless_rate_change': trim whitespace, drop '%' and thousands
# separators, then parse. Blank or unparseable cells become NaN and are
# removed, so any amount of padding is treated as "missing", not only a
# single space.
rate_text = (
    df_homelessness['homeless_rate_change']
    .astype(str)
    .str.strip()
    .str.replace('%', '', regex=False)
    .str.replace(',', '', regex=False)
)
df_homelessness['homeless_rate_change'] = pd.to_numeric(rate_text, errors='coerce').astype(float)
df_homelessness = df_homelessness.dropna(subset=['homeless_rate_change']).copy()
df_homelessness['Year'] = df_homelessness['Year'].astype(int)

# Display the first few rows
print("\nProcessed Homelessness Data:")
print(df_homelessness.head())


Processed Homelessness Data:
        State  Year  homeless_rate_change
0     alabama  2022                 -11.9
1    arkansas  2022                   6.1
2     arizona  2022                   5.0
3  california  2022                   5.8
4    colorado  2022                  38.9


In [11]:
# ==============================================
# 7. Load and Process Education Data
# ==============================================

# Load education data
education_data = pd.read_csv('education.csv')  # Replace with the actual file path

# Display the first few rows to verify the structure
print("\nInitial Education Data:")
print(education_data.head())

# Select every column whose name contains a year from 2012 through 2022.
# NOTE(review): this is a substring match on the column name, so any column
# that merely mentions such a year is selected too, whether or not it is an
# education metric — confirm the resulting column list is what is wanted.
education_cols = [col for col in education_data.columns if any(str(year) in col for year in range(2012, 2023))]

# Keep 'State' and the identified education columns
education_data = education_data[['State'] + education_cols]

# Melt the data to long format
education_long = pd.melt(
    education_data,
    id_vars=['State'],
    value_vars=education_cols,
    var_name='Variable',
    value_name='Value'
)

# Extract 'Year' and 'Education_Variable' from the 'Variable' column.
# Raw strings are used so that `\d` reaches the regex engine as written
# instead of being an invalid string escape. Every selected column name
# contains a four-digit year (that is how it was selected), so the extract
# always matches and no NaN handling is needed before the integer cast.
education_long['Year'] = education_long['Variable'].str.extract(r'(\d{4})', expand=False).astype(int)
education_long['Education_Variable'] = education_long['Variable'].str.replace(r' \d{4}', '', regex=True).str.strip()

# Pivot the data to have one row per 'State' and 'Year'.
# NOTE(review): the printed head of the raw file shows several rows sharing
# one 'State' code (a state row followed by county rows). aggfunc='first'
# keeps a single value per State/Year cell; if it skips missing values, a
# cell that is empty on the state row would be filled from a later row of the
# same state. Confirm whether the input should be restricted to state-level
# rows before pivoting.
education_pivot = education_long.pivot_table(
    index=['State', 'Year'],
    columns='Education_Variable',
    values='Value',
    aggfunc='first'
).reset_index()

# Flatten the columns
education_pivot.columns.name = None

# Standardize 'State' names
education_pivot['State'] = education_pivot['State'].str.lower().str.strip()

# Map state abbreviations to full state names if necessary. The check is on
# the longest value: only when every entry is at most two characters long is
# the column treated as abbreviations.
if education_pivot['State'].str.len().max() == 2:
    education_pivot['State'] = education_pivot['State'].map(state_abbrev_to_name)

# Exclude territories, Alaska, Hawaii and the national aggregate. .copy()
# detaches the result from the filtered parent so the per-column assignments
# below are unambiguous.
education_pivot = education_pivot[~education_pivot['State'].isin(exclude_states_lower)].copy()

# Convert numeric columns to float
numeric_cols = [col for col in education_pivot.columns if col not in ['State', 'Year']]
for col in numeric_cols:
    # Convert to string and remove commas and percent signs (literal matches)
    education_pivot[col] = (
        education_pivot[col]
        .astype(str)
        .str.replace(',', '', regex=False)
        .str.replace('%', '', regex=False)
    )
    # Convert to numeric, coercing errors to NaN
    education_pivot[col] = pd.to_numeric(education_pivot[col], errors='coerce')

# Rename columns for clarity (Adjust based on actual column names)
# Example:
# education_pivot.rename(columns={
#     "Percentage with Bachelor's Degree": 'Bachelor_Degree_Rate'
# }, inplace=True)

# Exclude rows with missing 'State' or 'Year' after mapping (codes missing
# from the lookup table map to NaN and are removed here)
education_pivot = education_pivot.dropna(subset=['State', 'Year'])

# Display the first few rows of the processed education data
print("\nProcessed Education Data (2012-2022):")
print(education_pivot.head())



Initial Education Data:
   FIPS Code State       Area name  2003 Urban Influence Code  \
0          0    US   United States                        NaN   
1       1000    AL         Alabama                        NaN   
2       1001    AL  Autauga County                        2.0   
3       1003    AL  Baldwin County                        5.0   
4       1005    AL  Barbour County                        6.0   

   2013 Urban Influence Code  2013 Rural-urban Continuum Code  \
0                        NaN                              NaN   
1                        NaN                              NaN   
2                        2.0                              2.0   
3                        2.0                              3.0   
4                        6.0                              6.0   

   2023 Rural-urban Continuum Code Less than a high school diploma, 1970  \
0                              NaN                            52,373,312   
1                              NaN       