In [1]:
import sys
sys.path.insert(1, '../utils/')
import county_matcher as cm

In [2]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier

#### Specify # of Bins Used in Dropdowns Across All Questions:

In [3]:
# Number of bins used for every question's dropdown and for binning the county
# feature columns below (5 in the first iteration of this project).
num_bins = 5 #value == 5 for first iteration of this project

#### Import County Information:

In [4]:
# Load the pre-built county feature table (gzip-compressed CSV):
county_data_path = "../data/interim/county_matcher_data.csv.gz"
county_df = pd.read_csv(county_data_path)

# Preview the first rows:
county_df.head()

Unnamed: 0,FIPS,County,State,Total Population,Population Density (Per Sq. Mile),% Pop Density D,% Children Under 10,% Children Under 10 D,% Children 10 and Older,% Children 10 and Older D,...,Unity Churches,Unity of the Brethren,Universal Fellowship of Metropolitan Community Churches,Unnamed: 66,Vicariate for the Palestinian/Jordanian Orthodox Christian Communities,Vineyard USA,Wayne Trail Missionary Baptist Association,Wesleyan Church,Wisconsin Evangelical Lutheran Synod,Zoroastrian
0,12086,Miami-Dade County,Florida,2705528.0,1424.03,4.0,11.0,3.0,9.2,2.0,...,,,,,,,,,,
1,36081,Queens County,New York,2270976.0,20887.52,5.0,11.0,3.0,8.7,2.0,...,,,,0.08,,,,,,
2,34017,Hudson County,New Jersey,671923.0,14545.81,5.0,13.0,3.0,7.8,2.0,...,,,,0.1,,,,,,
3,2016,Aleutians West Census Area,Alaska,5708.0,1.3,1.0,10.0,2.0,7.6,2.0,...,,,,,,,,,,
4,6085,Santa Clara County,California,1924379.0,1490.51,4.0,12.0,3.0,10.0,2.0,...,,,0.02,0.07,,,,,0.08,


In [5]:
# Bin Counties Based on Values:
# Every column after the first three (FIPS, County, State) is cut into
# `num_bins` equal-width bins and replaced by its integer bin index (0-based).
county_binned_df = county_df.copy()
for col in county_binned_df.columns[3:]:
    try:
        county_binned_df[col] = pd.cut(county_binned_df[col], bins=num_bins, labels=False)
    except (TypeError, ValueError):
        # Best-effort: pd.cut raises TypeError for non-numeric columns and
        # ValueError for columns it cannot derive bin edges from (e.g. all-NaN
        # or containing inf). Those columns are left unchanged. Any other
        # error (including KeyboardInterrupt) is allowed to propagate.
        continue
# Downstream cells read `county_df`, so rebind it to the binned frame.
county_df = county_binned_df
# Show:
county_df.head()

Unnamed: 0,FIPS,County,State,Total Population,Population Density (Per Sq. Mile),% Pop Density D,% Children Under 10,% Children Under 10 D,% Children 10 and Older,% Children 10 and Older D,...,Unity Churches,Unity of the Brethren,Universal Fellowship of Metropolitan Community Churches,Unnamed: 66,Vicariate for the Palestinian/Jordanian Orthodox Christian Communities,Vineyard USA,Wayne Trail Missionary Baptist Association,Wesleyan Church,Wisconsin Evangelical Lutheran Synod,Zoroastrian
0,12086,Miami-Dade County,Florida,1,0,3,2,2,1,1,...,,,,,,,,,,
1,36081,Queens County,New York,1,1,4,2,2,1,1,...,,,,0.0,,,,,,
2,34017,Hudson County,New Jersey,0,1,4,2,2,1,1,...,,,,0.0,,,,,,
3,2016,Aleutians West Census Area,Alaska,0,0,0,1,1,1,1,...,,,,,,,,,,
4,6085,Santa Clara County,California,0,0,3,2,2,2,1,...,,,0.0,0.0,,,,,0.0,


#### Import User "Responses" and "Importances":

In [6]:
# Import data:
# User answers come from the "Upload" sheet of the quiz workbook; each question
# is then mapped to a county_df column name via question_feature_ref.csv.
quiz_path = "../data/external/user_responses/RDPM_Quiz_modified.xlsx"
user_df = pd.read_excel(quiz_path, sheet_name="Upload")
question_feature_ref_df = pd.read_csv("../data/external/question_feature_ref.csv")
# Left merge keeps every row of user_df, even when no feature reference exists.
user_ref_df = user_df.merge(question_feature_ref_df, on="Question", how="left")
# Largest value in the "Importance" column of the "Dropdowns" sheet.
user_ref_df["max_importance"] = pd.read_excel(quiz_path, sheet_name="Dropdowns")["Importance"].max()

# Fill NA values with 0:
user_ref_df[['Response', 'Importance']] = user_ref_df[['Response', 'Importance']].fillna(value=0)
user_ref_df["max_response"] = num_bins #Note this should change if more bins exist for a question

# Special case for religion question:
# For the "Rel-2)" row, the user's Response is used as the county column
# reference (presumably the name of a religion column in county_df -- verify
# against the quiz workbook), and the Response itself is set to `num_bins`.
# If there is no "Rel-2)" row, only questions that have a county column
# reference are kept. An explicit check replaces the former bare `except:`,
# so unrelated errors are no longer silently swallowed.
is_rel2 = user_ref_df["Question"] == "Rel-2)"
if is_rel2.any():
    user_ref_df.loc[is_rel2, "county_column_ref"] = user_ref_df.loc[is_rel2, "Response"].values[0]
    user_ref_df.loc[is_rel2, "Response"] = num_bins
else:
    user_ref_df = user_ref_df[user_ref_df["county_column_ref"].notna()]

# TODO: Assign 0's to NA values in relevant religion columns in county
# (not implemented in this cell).
user_ref_df.head()

Unnamed: 0,Question,Measurement,Response,Importance,ILOC,county_column_ref,max_importance,max_response
0,Acs-1),0,0,0.0,0,Population Density (Per Sq. Mile),10.0,5
1,Acs-2),At least,2,10.0,1,% Children Under 10,10.0,5
2,Acs-3),0,5,10.0,2,% Children 10 and Older,10.0,5
3,Acs-4),Equal to,4,10.0,3,% Couples that are Same-Sex,10.0,5
4,Acs-5),0,1,10.0,4,% Population Over 25 with at Least a Bachelor ...,10.0,5


In [7]:
# For columns where the user selects "At most" or "At least", give every county
# that satisfies the threshold the same value:
#   * "At least r": counties whose bin is >= r are set to the column's maximum.
#   * "At most r":  counties whose bin is <= r are set to the column's minimum.
# The masks are built from user_ref_df's own "Measurement" column so they stay
# aligned with the rows being selected, even if the merge or the filtering in
# the previous cell changed the row set relative to user_df.
at_least_mask = user_ref_df["Measurement"] == "At least"
at_least_cols = user_ref_df.loc[at_least_mask, "county_column_ref"].to_list()
at_least_vals = user_ref_df.loc[at_least_mask, "Response"].to_list()

for col_label, threshold in zip(at_least_cols, at_least_vals):
    county_df.loc[county_df[col_label] >= threshold, col_label] = county_df[col_label].max()

at_most_mask = user_ref_df["Measurement"] == "At most"
at_most_cols = user_ref_df.loc[at_most_mask, "county_column_ref"].to_list()
at_most_vals = user_ref_df.loc[at_most_mask, "Response"].to_list()

for col_label, threshold in zip(at_most_cols, at_most_vals):
    # Compare against the "At most" thresholds. The previous code indexed
    # at_least_vals here, which used the wrong threshold or raised IndexError.
    county_df.loc[county_df[col_label] <= threshold, col_label] = county_df[col_label].min()

In [8]:
# County columns referenced by the user's questions, in user_ref_df row order:
feature_cols = user_ref_df["county_column_ref"].tolist()

#### Find Closest Matching Counties:

In [9]:
# How many neighbouring counties to request from the matcher:
n_neighbors = 500

# Delegate the matching to the project helper, targeting the "FIPS" column.
matched_counties = cm.county_matcher(
    county_df,
    user_ref_df,
    feature_cols,
    target_col="FIPS",
    n_neighbors=n_neighbors,
)

In [10]:
# Display the matcher's result (FIPS, County, State per matched county).
matched_counties

Unnamed: 0,FIPS,County,State
11,36005,Bronx County,New York
34,36061,New York County,New York
427,26163,Wayne County,Michigan
1,36081,Queens County,New York
9,36047,Kings County,New York
...,...,...,...
2428,47137,Pickett County,Tennessee
1579,5145,White County,Arkansas
1250,22089,St. Charles Parish,Louisiana
1576,47107,McMinn County,Tennessee
