In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import NearestNeighbors

In [10]:
combined_data = pd.read_csv("county.csv")
# Normalize numeric columns using MinMaxScaler
numeric_cols = ["Poverty_Percent","Bachelor_Or_Higher","Unemployment_Rate","Median_Income","Avg_Temp","Avg_Precipitation","Crime_Rate_Per_100000","Walkability","Population_Density"]
county = combined_data.copy()
clean_data_norm = combined_data.copy()
clean_data_norm.drop(columns=["Population_Estimate","Land_Area"], inplace=True)
clean_data_norm[numeric_cols] = MinMaxScaler().fit_transform(combined_data[numeric_cols])
print(clean_data_norm.head())

   Unnamed: 0         area_name state     FP  Poverty_Percent  \
0           0  ABBEVILLE COUNTY    SC  45001         0.259179   
1           1     ACADIA PARISH    LA  22001         0.468683   
2           2   ACCOMACK COUNTY    VA  51001         0.302376   
3           3        ADA COUNTY    ID  16001         0.101512   
4           4      ADAIR COUNTY    IA  19001         0.140389   

   Bachelor_Or_Higher  Unemployment_Rate  Median_Income  Avg_Temp  \
0            0.001359           0.200000       0.154025  0.676744   
1            0.002068           0.200000       0.101051  0.813953   
2            0.002167           0.188235       0.197536  0.553488   
3            0.062549           0.141176       0.423968  0.406977   
4            0.000429           0.123529       0.274285  0.355814   

   Avg_Precipitation  Crime_Rate_Per_100000  Walkability  Population_Density  
0           0.368911               0.285636     0.172999            0.001007  
1           0.612079               0

In [None]:
data_matrix = clean_data_norm[numeric_cols]

model_knn = NearestNeighbors(metric = "euclidean", algorithm = "brute")
model_knn.fit(data_matrix)

In [4]:
import warnings
warnings.filterwarnings('ignore')

np.random.seed(1)
query_no = np.random.choice(clean_data_norm.shape[0]) # random county index
print(f"We will find recommendations for the county {clean_data_norm.iloc[query_no]['area_name'].title()}, {clean_data_norm.iloc[query_no]['state']}.")
distances, indices = model_knn.kneighbors(data_matrix.iloc[query_no, :].values.reshape(1, -1), n_neighbors=6)

We will find recommendations for the county Grand Forks County, ND.


In [38]:
def get_user_preferences():    
# Map columns to user-friendly questions
    features = {
        "Poverty_Percent": ("I prefer living in an area with a lower poverty rate.", "poverty rate"),
        "Bachelor_Or_Higher": ("I prefer living in areas with highly educated populations", "higher education"),
        "Unemployment_Rate": ("I prefer living in regions with lower unemployment rates", "unemployment rate"),
        "Median_Income": ("I prefer living in generally more affluent areas", "median income"),
        "Avg_Temp": ("I prefer warmer climates", "temperature"),
        "Avg_Precipitation": ("I prefer seeing less rain and less snow", "rain/snow"),
        "Crime_Rate_Per_100000": ("I prefer living in an area with a lower crime rate", "crime rate"), 
        "Walkability": ("I prefer walking over other modes of transportation", "walkability"),
        "Population_Density": ("I prefer living in more densely populated regions", "urban lifestyle")
    }

    pref_values = []
    importance_values = []

    for col, (pref_question, feature) in features.items():
        while True:
            try:
                # Preference question
                pref = int(input(f"On a scale of 1-5, how much do you agree with the statement: {pref_question}"))
                if 1 <= pref <= 5:
                    # We have to invert to make sense with some of our questions.
                    if col in ["Poverty_Percent","Unemployment_Rate","Avg_Precipitation","Crime_Rate_Per_100000"]:
                        pref = 6 - pref
                    break
                else:
                    print("Please enter a number from 1 to 5.")
            except ValueError:
                print("Invalid input. Please enter an integer.")
        
        while True:
            try:
                # Preference question
                importance = int(input(f"On a scale of 1-5, how important is {feature} when choosing a place to live?"))
                if 1 <= importance <= 5:
                    break
                else:
                    print("Please enter a number from 1 to 5.")
            except ValueError:
                print("Invalid input. Please enter an integer.")
        
        pref_values.append(pref)
        importance_values.append(importance)

    # Normalize both to [0,1]
    pref_values = np.array(pref_values).reshape(-1, 1)
    importance_values = np.array(importance_values).reshape(-1, 1)

    user_embedding = (pref_values.flatten() - 1) / 4  # 1 maps to 0, 5 maps to 1
    weight_vector = (importance_values.flatten() - 1) / 4

    # Show output
    print("\nUser Target Embedding (1 = wanting more of):")
    for (col, _), val in zip(features.items(), user_embedding):
        print(f"{col:30}: {val:.3f}")

    print("\nWeight Vector (how much each feature matters):")
    for (col, _), val in zip(features.items(), weight_vector):
        print(f"{col:30}: {val:.3f}")

    return user_embedding, weight_vector

In [61]:

def get_user_based_recommendations(target_embedding, weight_vector,n_neighbors=6):
    # Scale features by sqrt of weights
    sqrt_weights = np.sqrt(weight_vector)
    weighted_data = data_matrix * sqrt_weights  # broadcasted element-wise multiplication

    # Fit new KNN model on the weighted data
    model_knn = NearestNeighbors(n_neighbors=n_neighbors, metric='euclidean')
    model_knn.fit(weighted_data.values)
    user_embedding_weighted = target_embedding * np.sqrt(weight_vector)
    distances, indices = model_knn.kneighbors(user_embedding_weighted.reshape(1, -1), n_neighbors=n_neighbors)
    no = []
    name = []
    state = []
    distance = []
    population = []
    poverty = []
    education = []
    unemployment = []
    crime_rate = []
    income = []
    walkability = []
    for i in range(0, len(distances.flatten())):
        if i == 0:
            print(f"Recommendations for {clean_data_norm.iloc[query_no]['area_name'].title()} residents:\n")
        else:
            no.append(i)
            name.append(county['area_name'][indices.flatten()[i]].title())
            state.append(county['state'][indices.flatten()[i]])
            distance.append(distances.flatten()[i])
            population.append(county['Population_Estimate'][indices.flatten()[i]])
            poverty.append(county['Poverty_Percent'][indices.flatten()[i]])
            education.append(county['Bachelor_Or_Higher'][indices.flatten()[i]])
            unemployment.append(county['Unemployment_Rate'][indices.flatten()[i]])
            crime_rate.append(county['Crime_Rate_Per_100000'][indices.flatten()[i]])
            income.append(county['Median_Income'][indices.flatten()[i]])
            walkability.append(county['Walkability'][indices.flatten()[i]])
    dic = {"No": no, "County Name": name, "State": state, "Distance": distance,
        "Population Estimate": population, "Poverty Percent": poverty,
        "Bachelor's Degree or Higher": education,
        "Unemployment Rate (%)": unemployment,
        "Crime Rate per 100,000": crime_rate,
        "Median Income": income, "Walkability Index": walkability}
    recommendation = pd.DataFrame(data=dic)
    recommendation.set_index("No", inplace=True)
    return recommendation.style.set_properties(**{"background-color": "white", "color": "black", "border": "1.5px solid black"})

In [47]:
def get_random_embeddings():
    # Simulate a user giving 1-5 responses
    random_prefs = np.random.randint(1, 6, size=9)
    random_importance = np.random.randint(1, 6, size=9)

    # Normalize to [0,1]
    user_embedding = (random_prefs - 1) / 4
    weight_vector = (random_importance - 1) / 4

    print("\nUser Target Embedding (1 = want more of):")
    for col, val in zip(numeric_cols, user_embedding):
        print(f"{col:30}: {val:.3f}")

    print("\nWeight Vector (how much each feature matters):")
    for col, val in zip(numeric_cols, weight_vector):
        print(f"{col:30}: {val:.3f}")

    return user_embedding, weight_vector

In [60]:
user_embedding, weight_vector = get_random_embeddings()
get_user_based_recommendations(user_embedding, weight_vector)


User Target Embedding (1 = want more of):
Poverty_Percent               : 0.000
Bachelor_Or_Higher            : 0.500
Unemployment_Rate             : 0.750
Median_Income                 : 0.250
Avg_Temp                      : 0.750
Avg_Precipitation             : 0.500
Crime_Rate_Per_100000         : 0.750
Walkability                   : 0.500
Population_Density            : 0.500

Weight Vector (how much each feature matters):
Poverty_Percent               : 0.000
Bachelor_Or_Higher            : 1.000
Unemployment_Rate             : 0.000
Median_Income                 : 0.250
Avg_Temp                      : 0.750
Avg_Precipitation             : 0.000
Crime_Rate_Per_100000         : 0.000
Walkability                   : 0.000
Population_Density            : 0.250
Recommendations for Grand Forks County residents:



Unnamed: 0_level_0,County Name,State,Distance,Population Estimate,Poverty Percent,Bachelor's Degree or Higher,Unemployment Rate (%),"Crime Rate per 100,000",Median Income,Walkability Index
No,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,Maricopa County,AZ,0.284485,4585871.0,11.1,1094926.0,3.4,410.21,83668.0,4.513149
2,Kings County,NY,0.287498,2561225.0,19.1,764053.0,5.5,630.25,73244.0,10.267822
3,Orange County,CA,0.291158,3135755.0,9.2,957048.0,3.6,217.99,106047.0,8.655218
4,San Diego County,CA,0.300694,3269973.0,10.1,955076.0,3.9,368.67,98365.0,6.128342
5,Dallas County,TX,0.341174,2606358.0,13.8,590082.0,3.8,442.97,70871.0,9.7024
