In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import geopandas as gpd
import collections


pd.set_option('display.max_columns', None) # Never truncate the column list when a DataFrame is displayed

## Reading in AHRQ SDOH data

In [None]:
# Preview every sheet of the workbook: first 1000 rows only, every value kept
# as a string, and no conversion of blanks to NaN.
path = 'data/sdoh_2020_tract_1_0.xlsx'
preview_options = {
    'sheet_name': None,  # None -> one DataFrame per sheet, keyed by sheet name
    'dtype': str,
    'na_filter': False,
    'nrows': 1000,
}
dfs = pd.read_excel(path, **preview_options)

for name in dfs:
    df = dfs[name]
    print(name)
    with pd.option_context('display.max_columns', None):
        display(df.head())

# The 'Layout' sheet is inspected below to find the non-numeric columns.
layout_df = dfs['Layout']

List the columns whose type in the Layout sheet is not numeric.

In [None]:
def scope(layout=None):
    """
    Print a ``'<column>': str,`` line for every non-numeric row of the layout.

    The lines are formatted so they can be pasted into a ``dtype`` mapping.

    Args:
        layout: DataFrame with a 'type' column whose value is 'num' for
            numeric variables. Defaults to the notebook-level ``layout_df``.
    """
    if layout is None:
        layout = layout_df

    is_nonnumeric = layout['type'] != 'num'

    # The index labels are only row positions when the sheet was read with the
    # default index, so printing them yields `0: str,` instead of a column
    # name. Prefer the variable-name column when the sheet has one.
    # NOTE(review): assumes the Layout sheet stores variable names in a
    # column called 'name' - confirm against the workbook.
    if 'name' in layout.columns:
        nonnumerics = layout.loc[is_nonnumeric, 'name']
    else:
        nonnumerics = layout.index[is_nonnumeric]

    for name in nonnumerics:
        print(f'        {name!r}: str,')

# Print the dtype-override lines for the non-numeric Layout rows.
scope()

Read the entire file this time, defaulting every column to float except for a few columns that are kept as strings. This takes around 3 minutes.

In [None]:
# Disabled code kept as a bare string: converts the workbook's "Data" sheet to
# a CSV file and reads it back. Because the string is the only expression in
# this cell, the notebook echoes it as the cell output.
# NOTE(review): the paths in here use '../data/ahrq/', while the active cells
# read from 'data/' - confirm which location is current before re-enabling.
'''
#importing pandas as pd 
import pandas as pd 

# Read and store content 
# of an excel file 
read_file = pd.read_excel ("../data/ahrq/sdoh_2020_tract_1_0.xlsx", sheet_name="Data") 

# Write the dataframe object 
# into csv file 
read_file.to_csv ("../data/ahrq/sdoh_2020_tract_1_0_data.csv", 
				index = None, 
				header=True) 
	
# read csv file and convert 
# into a dataframe object 
df = pd.DataFrame(pd.read_csv("../data/ahrq/sdoh_2020_tract_1_0_data.csv")) 

# show the dataframe 
df
'''

In [None]:
# Disabled code kept as a bare string: reads the 'Data' sheet straight from the
# Excel workbook with the same dtype overrides as the CSV read that follows.
'''
data_df = pd.read_excel(
    path,
    sheet_name = (
        'Data'
    ),
    dtype = collections.defaultdict(lambda: float) | {
        'TRACTFIPS': str,
        'COUNTYFIPS': str,
        'STATEFIPS': str,
        'STATE': str,
        'COUNTY': str,
        'REGION': str,
        'CEN_AIAN_NH_IND': str,
    },
    na_filter = True,
    na_values=['', ' '],
)
'''

# Load the full dataset from CSV. Every column is parsed as float unless it is
# listed in string_columns; empty and single-space cells become NaN.
path = 'data/sdoh_2020_tract_1_0_data.csv'

string_columns = [
    'TRACTFIPS',
    'COUNTYFIPS',
    'STATEFIPS',
    'STATE',
    'COUNTY',
    'REGION',
    'CEN_AIAN_NH_IND',
]
column_dtypes = collections.defaultdict(lambda: float)
column_dtypes.update(dict.fromkeys(string_columns, str))

data_df = pd.read_csv(path, dtype=column_dtypes, na_filter=True, na_values=['', ' '])
display(data_df)

In [None]:
# Display pandas' summary (index range, dtypes, memory usage) of the full dataset
data_df.info()

In [None]:
# Generate statistics on the data columns.
# Uses data_df: at this point the name `df` is still bound to the last sheet
# from the 1000-row, all-string preview loop, not to the full dataset.
data_df.describe()

## Merge with TN census tract shapefile

In [None]:
# Tract geometries used for the merge below; preview the first rows.
tract_shapefile_path = 'data/tl_2020_47_tract/tl_2020_47_tract.shp'
shapefile = gpd.read_file(tract_shapefile_path)
display(shapefile.head())

In [None]:
# Attach each tract's geometry to its data row (TRACTFIPS <-> GEOID). The left
# join keeps every data row; rows without a matching shape get a missing GEOID.
tract_shapes = shapefile[['GEOID', 'geometry']]
merged = pd.merge(
    data_df,
    tract_shapes,
    how='left',
    left_on='TRACTFIPS',
    right_on='GEOID',
)

# Put 'GEOID' and 'geometry' first, leaving the other columns in their order.
leading_columns = ['GEOID', 'geometry']
columns = leading_columns + [col for col in merged.columns if col not in leading_columns]
merged = merged[columns]

gdf = gpd.GeoDataFrame(merged, geometry='geometry')

# Keep only rows whose GEOID starts with '47'; a missing GEOID stringifies to
# 'nan' and is therefore excluded as well.
is_tennessee = gdf["GEOID"].astype(str).str.startswith('47')
tennessee_gdf = gdf[is_tennessee]
display(tennessee_gdf)

Filter down to the columns that I initially think will be useful for career prediction.

In [None]:
# Columns to keep, grouped by theme. The concatenation order below is the
# column order of the resulting frame.
identifier_columns = [
    'GEOID',
    'geometry',
    'STATE',
    'COUNTY',
    'COUNTYFIPS',
    'CEN_POPDENSITY_TRACT',
]
age_columns = [
    'ACS_MEDIAN_AGE',
    'ACS_PCT_AGE_18_29',
    'ACS_PCT_AGE_30_44',
    'ACS_PCT_AGE_45_64',
]
education_columns = [
    'ACS_PCT_LT_HS',
    'ACS_PCT_HS_GRADUATE',
    'ACS_PCT_COLLEGE_ASSOCIATE_DGR',
    'ACS_PCT_BACHELOR_DGR',
    'ACS_PCT_GRADUATE_DGR',
    'ACS_PCT_POSTHS_ED',
]
income_columns = [
    'ACS_MEDIAN_HH_INC',
    'ACS_MEDIAN_INC_F',
    'ACS_MEDIAN_INC_M',
]
employment_columns = [
    'ACS_PCT_EMPLOYED',
    'ACS_PCT_NOT_LABOR',
    'ACS_PCT_UNEMPLOY',
]
industry_columns = [
    'ACS_PCT_ADMIN',
    'ACS_PCT_ARMED_FORCES',
    'ACS_PCT_ART',
    'ACS_PCT_CONSTRUCT',
    'ACS_PCT_EDUC',
    'ACS_PCT_FINANCE',
    'ACS_PCT_GOVT',
    'ACS_PCT_INFORM',
    'ACS_PCT_MANUFACT',
    'ACS_PCT_NATURE',
    'ACS_PCT_PROFESS',
    'ACS_PCT_RETAIL',
    'ACS_PCT_TRANSPORT',
    'ACS_PCT_WHOLESALE',
]

selected_columns = (
    identifier_columns
    + age_columns
    + education_columns
    + income_columns
    + employment_columns
    + industry_columns
)
filtered_tennessee_gdf = tennessee_gdf[selected_columns]

display(filtered_tennessee_gdf)


In [None]:
# Check the dtype and non-null count of each selected column
filtered_tennessee_gdf.info()

In [None]:
# Summary statistics for the selected columns
filtered_tennessee_gdf.describe()

In [None]:
# Remember the pre-cleaning shape, then report how many values are missing in
# each column.
old_shape = filtered_tennessee_gdf.shape
print(old_shape)
missing_per_column = filtered_tennessee_gdf.isna().sum()
for column_name, missing_count in missing_per_column.items():
    print(f"{column_name}: {missing_count}")

Honestly, that is not a lot of data to lose. Let's just drop the rows that contain NaN values.

In [None]:
# Drop every row that has a missing value in any column and report the effect.
clean_filtered_tennessee_gdf = filtered_tennessee_gdf.dropna()
new_shape = clean_filtered_tennessee_gdf.shape
deleted_rows = old_shape[0] - new_shape[0]
for line in (
    f"Old shape: {old_shape}",
    f"New shape: {new_shape}",
    f"Deleted rows: {deleted_rows}",
):
    print(line)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, classification_report, f1_score, accuracy_score, mean_absolute_percentage_error

In [None]:
# Work on a copy so the cleaned GeoDataFrame itself is left untouched
df = clean_filtered_tennessee_gdf.copy()

# Drop geometry column for machine learning purposes
# NOTE(review): the model functions in this notebook read
# clean_filtered_tennessee_gdf directly; neither df nor df_ml is referenced
# again in the code visible here.
df_ml = df.drop(columns=['geometry'])


## Create career-specific features
# Map each career field (the label shown to the user) to the data column(s)
# that describe it. Only the first column of each list is read by the model
# code. The insertion order matters: the training target weights each field
# by its 1-based position in this dict, and the menu numbers follow it too.
career_field_features = {
    'armed forces' : ['ACS_PCT_ARMED_FORCES'],
    'arts (entertainment, recreation, accommodation, and food services)' : ['ACS_PCT_ART'],
    'construction' : ['ACS_PCT_CONSTRUCT'],
    'educational services (healthcare, social assistance)' : ['ACS_PCT_EDUC'],
    'finance (insurance, real estate, rental/leasing)' : ['ACS_PCT_FINANCE'],
    'government' : ['ACS_PCT_GOVT'],
    'information services' : ['ACS_PCT_INFORM'],
    'manufacturing' : ['ACS_PCT_MANUFACT'],
    'nature (agriculture, forestry, fishing, hunting, mining)' : ['ACS_PCT_NATURE'],
    'professional (scientific, management, administrative, and waste management)' : ['ACS_PCT_PROFESS'],
    'public administration' : ['ACS_PCT_ADMIN'],
    'retail' : ['ACS_PCT_RETAIL'],
    'transportation (warehousing, utilities)' : ['ACS_PCT_TRANSPORT'],
    'wholesale' : ['ACS_PCT_WHOLESALE']
}

# Map each education level (the key shown to the user) to the data column
# holding the matching share. Only the first column of each list is read.
education_levels = {
    'less_than_high_school': ['ACS_PCT_LT_HS'],
    'high_school': ['ACS_PCT_HS_GRADUATE'],
    'some_college/associates': ['ACS_PCT_COLLEGE_ASSOCIATE_DGR'],
    'bachelors': ['ACS_PCT_BACHELOR_DGR'],
    'masters/doctorate': ['ACS_PCT_GRADUATE_DGR'],
    'postsecondary': ['ACS_PCT_POSTHS_ED']
}


In [None]:
def train_career_suitability_model():
    """
    Train a random forest regressor on a composite "career suitability" score.

    Every row of ``clean_filtered_tennessee_gdf`` is one training sample. The
    feature matrix has the nine base columns listed in ``feature_names``
    followed by one column per entry of ``career_field_features`` (the first
    column named for each field, in dict order). The target is::

        0.5 * ACS_PCT_EMPLOYED + sum(weight[field] * share[field] * 0.5)

    where ``weight[field]`` is the field's 1-based position in
    ``career_field_features`` divided by the number of fields.

    NOTE(review): the career share columns appear both in the feature matrix
    and in the target, so the model is largely asked to reproduce a linear
    combination of its own inputs. Treat the printed scores with that in mind.

    Prints cross-validation and hold-out metrics as a side effect.

    Returns:
        tuple: (fitted RandomForestRegressor, list of the nine base feature
        names). The list deliberately excludes the career share columns:
        ``predict_with_model`` appends those itself, in the same dict order.

    Raises:
        KeyError: if 'ACS_MEDIAN_HH_INC' or 'ACS_PCT_EMPLOYED' is missing.
    """
    frame = clean_filtered_tennessee_gdf
    row_count = len(frame)

    # Single source of truth for the base features: it defines both the first
    # columns of X and the list returned to the caller.
    feature_names = [
        'ACS_MEDIAN_HH_INC',
        'CEN_POPDENSITY_TRACT',
        'ACS_MEDIAN_AGE',
        'ACS_PCT_LT_HS',
        'ACS_PCT_HS_GRADUATE',
        'ACS_PCT_COLLEGE_ASSOCIATE_DGR',
        'ACS_PCT_BACHELOR_DGR',
        'ACS_PCT_GRADUATE_DGR',
        'ACS_PCT_POSTHS_ED'
    ]

    def column_or_zeros(column_name):
        # An absent column contributes zeros instead of raising, as before.
        if column_name is not None and column_name in frame.columns:
            return frame[column_name].to_numpy(dtype=float)
        return np.zeros(row_count)

    # Household income is required (direct lookup -> KeyError when absent);
    # the remaining base features fall back to zeros.
    base_block = [frame[feature_names[0]].to_numpy(dtype=float)]
    base_block += [column_or_zeros(name) for name in feature_names[1:]]

    # One feature per career field: the first column listed for the field.
    career_block = [
        column_or_zeros(columns[0] if columns else None)
        for columns in career_field_features.values()
    ]

    X = np.column_stack(base_block + career_block)

    # Position-based weights (1/n, 2/n, ..., 1) so that the fields contribute
    # unequally to the composite target.
    field_count = len(career_field_features)
    career_weights = {
        career: position / field_count
        for position, career in enumerate(career_field_features.keys(), 1)
    }

    # Composite target: half the employment rate plus the weighted shares.
    y = 0.5 * frame['ACS_PCT_EMPLOYED'].to_numpy(dtype=float)
    for career, columns in career_field_features.items():
        if columns and columns[0] in frame.columns:
            y = y + career_weights[career] * frame[columns[0]].to_numpy(dtype=float) * 0.5

    test_size = 0.3
    x_train_reg, x_test_reg, y_train_reg, y_test_reg = train_test_split(X, y, test_size=test_size, random_state=42)

    model = RandomForestRegressor(
        n_estimators=100,
        max_depth=10,  # Limit tree depth to curb overfitting
        min_samples_split=5,
        random_state=42
    )

    # Cross-validate on the training split only; the hold-out split is scored
    # once, after the final fit.
    cv_scores = cross_val_score(model, x_train_reg, y_train_reg, cv=5, scoring='r2')
    print(f"Cross-validation R² scores: {cv_scores}")
    print(f"Mean R² score: {cv_scores.mean():.3f}")

    model.fit(x_train_reg, y_train_reg)
    y_pred_reg = model.predict(x_test_reg)
    mae = mean_absolute_error(y_test_reg, y_pred_reg)
    mse = mean_squared_error(y_test_reg, y_pred_reg)
    mape = mean_absolute_percentage_error(y_test_reg, y_pred_reg)
    r2 = r2_score(y_test_reg, y_pred_reg)

    print("mae: ",mae)
    print("mse: ",mse)
    print("mape: ",mape)
    print("r2: ",r2)

    return model, feature_names


def predict_with_model(model, feature_names, selected_career, selected_education_level):
    """
    Score every row of ``clean_filtered_tennessee_gdf`` for a career field and
    education level, and return the rows ranked from best to worst.

    The model prediction is the base score. It is then multiplied by
    ``0.7 + 0.3 * career_boost`` and ``0.8 + 0.2 * edu_match * weight``, where
    the boost/match is the row's value in the selected column divided by that
    column's mean, and finally rescaled to the 0-100 range.

    Args:
        model: Fitted estimator exposing ``predict``.
        feature_names: Base feature columns, in training order. The first
            column of every ``career_field_features`` entry is appended after
            them, matching how the training matrix is laid out.
        selected_career: Key of ``career_field_features``; any other value
            skips the career adjustment.
        selected_education_level: Key of ``education_levels``; any other
            value skips the education adjustment.

    Returns:
        DataFrame with exactly one row per input row, holding 'GEOID',
        'COUNTY', 'COUNTYFIPS', 'features', 'predicted_score' and the selected
        career / education columns, sorted by 'predicted_score' descending.
    """
    frame = clean_filtered_tennessee_gdf
    row_count = len(frame)

    def column_or_zeros(column_name):
        # A column the frame does not have contributes zeros, as before.
        if column_name is not None and column_name in frame.columns:
            return frame[column_name].to_numpy(dtype=float)
        return np.zeros(row_count)

    # Education weight applied to the education match further down.
    education_weight = {
        'less_than_high_school': 0.5,
        'high_school': 0.6,
        'some_college/associates': 0.8,
        'bachelors': 1.0,
        'masters/doctorate': 1.1,
        'postsecondary': 1.2
    }.get(selected_education_level, 1.0)

    # Build the feature matrix in the same column order as during training:
    # base features first, then one column per career field.
    feature_columns = [column_or_zeros(name) for name in feature_names]
    feature_columns += [
        column_or_zeros(columns[0] if columns else None)
        for columns in career_field_features.values()
    ]
    features_matrix = np.column_stack(feature_columns)
    base_preds = model.predict(features_matrix)

    # Rows are built positionally from the source frame, so the adjustment
    # columns below can be attached by position as well. This replaces the
    # former merges; the education merge was keyed on COUNTYFIPS, which is
    # shared by many rows and therefore multiplied the rows of each county.
    prediction_df = pd.DataFrame({
        'GEOID': frame['GEOID'].to_numpy(),
        'COUNTY': frame['COUNTY'].to_numpy(),
        'COUNTYFIPS': frame['COUNTYFIPS'].to_numpy(),
        'features': features_matrix.tolist(),
        'predicted_score': base_preds,
    })

    def relative_to_mean(column_name):
        # Attach the column and return value / mean, or None when the mean is
        # zero or undefined (dividing would turn every score into inf/NaN).
        prediction_df[column_name] = frame[column_name].to_numpy()
        column_mean = prediction_df[column_name].mean()
        if not np.isfinite(column_mean) or column_mean == 0:
            return None
        return prediction_df[column_name] / column_mean

    # Boost scores based on the selected career field
    career_columns = career_field_features.get(selected_career, [])
    if career_columns and career_columns[0] in frame.columns:
        career_boost = relative_to_mean(career_columns[0])
        if career_boost is not None:
            prediction_df['predicted_score'] = prediction_df['predicted_score'] * (0.7 + 0.3 * career_boost)

    # Apply the education-level adjustment
    edu_columns = education_levels.get(selected_education_level, [])
    if edu_columns and edu_columns[0] in frame.columns:
        edu_match = relative_to_mean(edu_columns[0])
        if edu_match is not None:
            prediction_df['predicted_score'] = prediction_df['predicted_score'] * (0.8 + 0.2 * edu_match * education_weight)

    # Normalize scores to the 0-100 range for interpretability
    min_score = prediction_df['predicted_score'].min()
    max_score = prediction_df['predicted_score'].max()
    if max_score == min_score:
        # Every row ties (for example a single row): 0/0 would give NaN, so
        # report all rows as equally best instead.
        prediction_df['predicted_score'] = 100.0
    else:
        prediction_df['predicted_score'] = 100 * (prediction_df['predicted_score'] - min_score) / (max_score - min_score)

    return prediction_df.sort_values(by='predicted_score', ascending=False)

In [None]:

def _prompt_for_choice(options, prompt):
    """Ask until the user enters a valid 1-based menu number; return that option."""
    if not options:
        raise ValueError("No options available to choose from")
    while True:
        raw_answer = input(prompt)
        try:
            number = int(raw_answer)
        except ValueError:
            print(f"Please enter a whole number between 1 and {len(options)}.")
            continue
        # Reject 0 and negatives explicitly: `options[number - 1]` would
        # otherwise wrap around and silently pick an item from the end.
        if 1 <= number <= len(options):
            return options[number - 1]
        print(f"Please enter a whole number between 1 and {len(options)}.")


def main():
    """
    Interactive workflow: train the model, ask for a career field and an
    education level, show the ten best-scoring rows and plot all scores.

    Reads the notebook-level ``career_field_features``, ``education_levels``
    and ``clean_filtered_tennessee_gdf``. Blocks on ``input()`` twice.

    NOTE(review): the messages and the plot title say "counties", but the rows
    being scored and plotted are keyed by GEOID, several of which can share
    one COUNTY value - confirm the intended wording.
    """
    # Train and use ML model
    print("\nTraining machine learning model...")
    model, feature_names = train_career_suitability_model()

    careers = list(career_field_features.keys())
    print("\nAvailable career fields:")
    for i, career in enumerate(careers, 1):
        print(f"{i}. {career.title()}")

    # Get career field input
    career_field = _prompt_for_choice(careers, "\nEnter the number for your desired career field: ")

    # Display available education levels
    levels = list(education_levels.keys())
    print("\nAvailable education levels:")
    for i, edu_level in enumerate(levels, 1):
        print(f"{i}. {edu_level.replace('_', ' ').title()}")

    # Get education level input
    education_level = _prompt_for_choice(levels, "\nEnter the number for your education level: ")

    # Get predictions
    predictions = predict_with_model(model, feature_names, career_field, education_level)

    # Display top 10 counties
    print("\nTop 10 recommended counties for your career (ML predictions):")
    top_counties = predictions.head(10)
    display(top_counties)

    # Create a copy of the complete Tennessee GeoDataFrame
    tn_plot_gdf = clean_filtered_tennessee_gdf.copy()

    print("clean_filtered_tennessee_gdf shape:", clean_filtered_tennessee_gdf.shape)
    print("tn_plot_gdf shape:", tn_plot_gdf.shape)
    print("predictions shape:", predictions.shape)
    print(predictions.head())

    # Merge the predictions into the geometry frame. The left join keeps every
    # geometry row; calling merge on the GeoDataFrame keeps the result
    # plottable.
    tn_plot_gdf = tn_plot_gdf.merge(
        predictions[['GEOID', 'predicted_score']],
        on=['GEOID'],
        how='left'
    )

    # Report how many rows ended up without a prediction
    print("NAN values still: ", tn_plot_gdf['predicted_score'].isna().sum())
    display(tn_plot_gdf)

    # Plot the predicted scores. The education level is formatted the same
    # way as in the menu above (underscores shown as spaces).
    fig, ax = plt.subplots(figsize=(12, 8))
    tn_plot_gdf.plot(column='predicted_score', cmap='viridis', legend=True, ax=ax)
    education_label = education_level.replace('_', ' ').title()
    ax.set_title(f'Career Suitability Scores for {career_field.title()} with {education_label} by County')
    ax.set_axis_off()  # Hide axis for clean visualization
    plt.show()

In [None]:
# Run the interactive workflow (trains the model, then prompts via input())
main()