In [1]:
import sys
sys.path.insert(1, '../utils/')
import county_matcher as cm

In [2]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier

#### Specify # of Bins Used in Dropdowns Across All Questions:

In [3]:
num_bins = 5 #value == 5 for first iteration of this project

#### Import County Information:

In [4]:
# Get county data:
county_df = pd.read_csv("../data/interim/county_matcher_data.csv.gz") # test using only terrain data

# Show:
county_df.head()

Unnamed: 0,FIPS,County,State,Total Population,Population Density (Per Sq. Mile),% Pop Density D,% Children Under 10,% Children Under 10 D,% Children 10 and Older,% Children 10 and Older D,...,Unity Churches,Unity of the Brethren,Universal Fellowship of Metropolitan Community Churches,Unnamed: 66,Vicariate for the Palestinian/Jordanian Orthodox Christian Communities,Vineyard USA,Wayne Trail Missionary Baptist Association,Wesleyan Church,Wisconsin Evangelical Lutheran Synod,Zoroastrian
0,12086,Miami-Dade County,Florida,2705528.0,1424.03,4.0,11.0,3.0,9.2,2.0,...,,,,,,,,,,
1,36081,Queens County,New York,2270976.0,20887.52,5.0,11.0,3.0,8.7,2.0,...,,,,0.08,,,,,,
2,34017,Hudson County,New Jersey,671923.0,14545.81,5.0,13.0,3.0,7.8,2.0,...,,,,0.1,,,,,,
3,2016,Aleutians West Census Area,Alaska,5708.0,1.3,1.0,10.0,2.0,7.6,2.0,...,,,,,,,,,,
4,6085,Santa Clara County,California,1924379.0,1490.51,4.0,12.0,3.0,10.0,2.0,...,,,0.02,0.07,,,,,0.08,


In [5]:
# Bin Counties Based on Values:
county_binned_df = county_df.copy()
county_binned_labeled_df = county_df.copy()
for col in county_binned_df.columns[3:]:
    try:
        county_binned_df[col] = pd.cut(county_binned_df[col],num_bins,labels=False,precision=1) #another option here is pd.qcut
        county_binned_labeled_df[col] = pd.cut(county_df[col],num_bins,precision=1) #another option here is pd.qcut
    except:
        pass
county_df = county_binned_df

# Show:
county_df.head()

Unnamed: 0,FIPS,County,State,Total Population,Population Density (Per Sq. Mile),% Pop Density D,% Children Under 10,% Children Under 10 D,% Children 10 and Older,% Children 10 and Older D,...,Unity Churches,Unity of the Brethren,Universal Fellowship of Metropolitan Community Churches,Unnamed: 66,Vicariate for the Palestinian/Jordanian Orthodox Christian Communities,Vineyard USA,Wayne Trail Missionary Baptist Association,Wesleyan Church,Wisconsin Evangelical Lutheran Synod,Zoroastrian
0,12086,Miami-Dade County,Florida,1,0,3,2,2,1,1,...,,,,,,,,,,
1,36081,Queens County,New York,1,1,4,2,2,1,1,...,,,,0.0,,,,,,
2,34017,Hudson County,New Jersey,0,1,4,2,2,1,1,...,,,,0.0,,,,,,
3,2016,Aleutians West Census Area,Alaska,0,0,0,1,1,1,1,...,,,,,,,,,,
4,6085,Santa Clara County,California,0,0,3,2,2,2,1,...,,,0.0,0.0,,,,,0.0,


In [6]:
# get_labels:
bin_data_lst = []
for col in county_binned_labeled_df.iloc[:,3:]:
    interval_lst = list(county_binned_labeled_df[col].sort_values().unique())
    interval_lst_formatted = [str(x) for x in interval_lst]
    bin_data_lst.append(interval_lst_formatted)
bin_df = pd.DataFrame(bin_data_lst,index=county_binned_labeled_df.columns[3:]).T
bin_df = bin_df.iloc[:num_bins,:]
# Show bin edge values::
bin_df


Unnamed: 0,Total Population,Population Density (Per Sq. Mile),% Pop Density D,% Children Under 10,% Children Under 10 D,% Children 10 and Older,% Children 10 and Older D,% Male,% Male: Under 5 Years,% Male: 5 to 9 Years,...,Unity Churches,Unity of the Brethren,Universal Fellowship of Metropolitan Community Churches,Unnamed: 66,Vicariate for the Palestinian/Jordanian Orthodox Christian Communities,Vineyard USA,Wayne Trail Missionary Baptist Association,Wesleyan Church,Wisconsin Evangelical Lutheran Synod,Zoroastrian
0,"(-9923.6, 2008230.0]","(-71.9, 14381.5]","(1.0, 1.8]","(-0.02, 5.0]","(1.0, 1.8]","(-0.02, 4.9]","(1.0, 1.8]","(42.0, 47.8]","(-0.01, 2.5]","(-0.01, 2.0]",...,,,"(-0.0002, 0.03]","(-0.009, 3.7]",,"(0.07998, 0.08002]",,"(0.0096, 0.082]","(-0.02, 4.7]",
1,"(2008230.0, 4016343.0]","(14381.5, 28763.0]","(1.8, 2.6]","(5.0, 10.0]","(1.8, 2.6]","(4.9, 9.8]","(1.8, 2.6]","(47.8, 53.6]","(2.5, 4.9]","(2.0, 4.0]",...,,,"(0.03, 0.06]","(3.7, 7.4]",,,,"(0.082, 0.15]","(4.7, 9.3]",
2,"(4016343.0, 6024456.0]","(28763.0, 43144.5]","(2.6, 3.4]","(10.0, 15.0]","(2.6, 3.4]","(9.8, 14.7]","(2.6, 3.4]","(53.6, 59.3]","(4.9, 7.4]","(4.0, 6.1]",...,,,"(0.06, 0.09]","(7.4, 11.2]",,,,"(0.23, 0.3]","(9.3, 14.0]",
3,"(8032569.0, 10040682.0]","(57525.9, 71907.4]","(3.4, 4.2]","(15.0, 20.0]","(3.4, 4.2]","(14.7, 19.6]","(3.4, 4.2]","(59.3, 65.1]","(7.4, 9.8]","(6.1, 8.1]",...,,,"(0.09, 0.1]","(11.2, 14.9]",,,,"(0.3, 0.37]","(14.0, 18.6]",
4,,,"(4.2, 5.0]","(20.0, 25.0]","(4.2, 5.0]","(19.6, 24.5]","(4.2, 5.0]","(65.1, 70.9]","(9.8, 12.3]","(8.1, 10.1]",...,,,"(0.1, 0.2]","(14.9, 18.6]",,,,,"(18.6, 23.3]",


In [7]:
interval_lst = county_binned_labeled_df[county_binned_labeled_df.columns[3]].sort_values().unique()
interval_lst

[(-9923.6, 2008230.0], (2008230.0, 4016343.0], (4016343.0, 6024456.0], (8032569.0, 10040682.0]]
Categories (5, interval[float64, right]): [(-9923.6, 2008230.0] < (2008230.0, 4016343.0] < (4016343.0, 6024456.0] < (6024456.0, 8032569.0] < (8032569.0, 10040682.0]]

#### Import User "Responses" and "Importances":

In [8]:
# Import data:
user_df = pd.read_excel("../data/external/user_responses/RDPM_Quiz_modified.xlsx",sheet_name="Upload")
question_feature_ref_df = pd.read_csv("../data/external/question_feature_ref.csv")
user_ref_df = user_df.merge(question_feature_ref_df,on="Question",how="left")
user_ref_df["max_importance"] = pd.read_excel("../data/external/user_responses/RDPM_Quiz_modified.xlsx",sheet_name="Dropdowns")["Importance"].max()

# Fill NA values with 0:
user_ref_df[['Response', 'Importance']] = user_ref_df[['Response', 'Importance']].fillna(value=0)
user_ref_df[["max_response"]] = num_bins #Note this should change if more bins exist for a question

# Special case for religion question:
try:
    user_ref_df.loc[user_ref_df["Question"]=="Rel-2)","county_column_ref"] = user_ref_df.loc[user_ref_df["Question"]=="Rel-2)"]["Response"].values[0]
    user_ref_df.loc[user_ref_df["Question"]=="Rel-2)","Response"] = num_bins
except:
    user_ref_df = user_ref_df[user_ref_df["county_column_ref"].notna()]

# Assign 0's to NA values in relevant religion columns in county
user_ref_df.head()

Unnamed: 0,Question,Measurement,Response,Importance,ILOC,county_column_ref,max_importance,max_response
0,Acs-1),0,0,0.0,0,Population Density (Per Sq. Mile),10.0,5
1,Acs-2),At least,2,10.0,1,% Children Under 10,10.0,5
2,Acs-3),0,5,10.0,2,% Children 10 and Older,10.0,5
3,Acs-4),Equal to,4,10.0,3,% Couples that are Same-Sex,10.0,5
4,Acs-5),0,1,10.0,4,% Population Over 25 with at Least a Bachelor ...,10.0,5


In [9]:
# For columns where the user selects "at_most..." or "at_least...", assign similar values for any counties matching the criteria:
at_least_cols = user_ref_df[user_df["Measurement"]=="At least"]["county_column_ref"].to_list()
at_least_vals = user_ref_df[user_df["Measurement"]=="At least"]["Response"].to_list()

for col_num in range(len(at_least_cols)):
    col_label = at_least_cols[col_num]
    county_df.loc[county_df[col_label]>=at_least_vals[col_num],col_label] = county_df[col_label].max()
    
at_most_cols = user_ref_df[user_df["Measurement"]=="At most"]["county_column_ref"].to_list()
at_most_vals = user_ref_df[user_df["Measurement"]=="At most"]["Response"].to_list()

for col_num in range(len(at_most_cols)):
    col_label = at_most_cols[col_num]
    county_df.loc[county_df[col_label]<=at_least_vals[col_num],col_label] = county_df[col_label].min()

In [10]:
# Relevant columns:
feature_cols = user_ref_df["county_column_ref"].to_list()

#### Find Closest Matching Counties:

In [11]:
n_neighbors=50 # number of neighbors to match
matched_counties= cm.county_matcher(county_df,user_ref_df,feature_cols,target_col="FIPS",n_neighbors = n_neighbors,metric="euclidean")

In [12]:
matched_counties

Unnamed: 0,FIPS,County,State,distance_metric
0,36005,Bronx County,New York,9.426823
1,36061,New York County,New York,9.488941
2,36081,Queens County,New York,9.523786
3,26163,Wayne County,Michigan,9.523786
4,36047,Kings County,New York,9.576142
5,51775,Salem city,Virginia,9.586579
6,44001,Bristol County,Rhode Island,9.602213
7,12129,Wakulla County,Florida,9.603515
8,12029,Dixie County,Florida,9.62172
9,34039,Union County,New Jersey,9.62172
