In [1]:
import sys
sys.path.insert(1, '../utils/')
import county_matcher as cm

In [2]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier

#### Specify # of Bins Used in Dropdowns Across All Questions:

In [3]:
# Number of bins used throughout the notebook: it is passed to pd.cut / pd.qcut when
# the county columns are discretized and is also stored as each question's "max_response".
num_bins = 5 # value == 5 for first iteration of this project

#### Import County Information:

In [4]:
# County-level feature table (compressed CSV; pandas infers the compression from the extension).
county_data_path = "../data/interim/county_matcher_data.csv.gz"
county_df = pd.read_csv(county_data_path)

# Reference table that is later merged with the quiz answers on "Question".
question_ref_path = "../data/external/question_feature_ref.csv"
question_feature_ref_df = pd.read_csv(question_ref_path)

# Preview the first rows of the county table:
county_df.head()

Unnamed: 0,FIPS,County,State,Total Population,Population Density (Per Sq. Mile),% Pop Density D,% Children Under 10,% Children Under 10 D,% Children 10 and Older,% Children 10 and Older D,...,Unity Churches,Unity of the Brethren,Universal Fellowship of Metropolitan Community Churches,Unnamed: 66,Vicariate for the Palestinian/Jordanian Orthodox Christian Communities,Vineyard USA,Wayne Trail Missionary Baptist Association,Wesleyan Church,Wisconsin Evangelical Lutheran Synod,Zoroastrian
0,12086,Miami-Dade County,Florida,2705528.0,1424.03,4.0,11.0,3.0,9.2,2.0,...,,,,,,,,,,
1,36081,Queens County,New York,2270976.0,20887.52,5.0,11.0,3.0,8.7,2.0,...,,,,0.08,,,,,,
2,34017,Hudson County,New Jersey,671923.0,14545.81,5.0,13.0,3.0,7.8,2.0,...,,,,0.1,,,,,,
3,2016,Aleutians West Census Area,Alaska,5708.0,1.3,1.0,10.0,2.0,7.6,2.0,...,,,,,,,,,,
4,6085,Santa Clara County,California,1924379.0,1490.51,4.0,12.0,3.0,10.0,2.0,...,,,0.02,0.07,,,,,0.08,


In [5]:
# Bin counties based on values.
#
# Every column from position 3 onward is discretized into `num_bins` bins:
#   * `county_binned_df` receives the integer bin codes (0 .. num_bins - 1),
#   * `county_binned_labeled_df` receives the matching interval labels.
# Equal-width bins are preferred; if any equal-width bin would hold fewer than
# `min_bin_share` of the rows, quantile bins (pd.qcut) are used instead.
#
# NOTE: `county_df` is overwritten with the binned frame at the end of this cell,
# so re-running the cell bins already-binned data. Re-run the loading cell first.
county_binned_df = county_df.copy()
county_binned_labeled_df = county_df.copy()
min_bin_share = 0.005  # least-populated equal-width bin must hold >= 0.5% of the rows
unbinned_cols = {}     # column name -> reason the column was left with its raw values

for col in county_binned_df.columns[3:]:
    raw_values = county_df[col]
    try:
        # Candidate: equidistant bin edges.
        equal_width_codes = pd.cut(raw_values, num_bins, labels=False, precision=1)
        num_counties = len(equal_width_codes)  # total # of rows, missing values included
        pct_by_bin = [
            (equal_width_codes == bin_code).sum() / num_counties
            for bin_code in range(num_bins)
        ]

        if min(pct_by_bin) >= min_bin_share:
            codes = equal_width_codes
            labels = pd.cut(raw_values, num_bins, precision=1)
        else:
            # Distribution too skewed for equal widths: fall back to quantile binning.
            codes = pd.qcut(raw_values, num_bins, labels=False, precision=1)
            labels = pd.qcut(raw_values, num_bins, precision=1)
    except Exception as exc:
        # Best effort: a column that cannot be binned keeps its raw values, but the
        # failure is recorded instead of being silently discarded.
        unbinned_cols[col] = f"{type(exc).__name__}: {exc}"
        continue

    # Assign both frames only after both computations succeeded, so the codes and
    # the labels of a column can never come from different binning attempts.
    county_binned_df[col] = codes
    county_binned_labeled_df[col] = labels

if unbinned_cols:
    print(f"{len(unbinned_cols)} column(s) could not be binned and keep their raw values; see `unbinned_cols`.")

county_df = county_binned_df

# Show:
county_df.head()

Unnamed: 0,FIPS,County,State,Total Population,Population Density (Per Sq. Mile),% Pop Density D,% Children Under 10,% Children Under 10 D,% Children 10 and Older,% Children 10 and Older D,...,Unity Churches,Unity of the Brethren,Universal Fellowship of Metropolitan Community Churches,Unnamed: 66,Vicariate for the Palestinian/Jordanian Orthodox Christian Communities,Vineyard USA,Wayne Trail Missionary Baptist Association,Wesleyan Church,Wisconsin Evangelical Lutheran Synod,Zoroastrian
0,12086,Miami-Dade County,Florida,4,4,3,1,3.0,1,2.0,...,,,,,,,,,,
1,36081,Queens County,New York,4,4,4,1,3.0,0,2.0,...,,,,0.0,,,,,,
2,34017,Hudson County,New Jersey,4,4,4,3,3.0,0,2.0,...,,,,0.0,,,,,,
3,2016,Aleutians West Census Area,Alaska,0,0,0,0,2.0,0,2.0,...,,,,,,,,,,
4,6085,Santa Clara County,California,4,4,3,2,3.0,2,2.0,...,,,0.02,0.0,,,,,1.0,


In [6]:
# Build a table of bin labels: one column per binned county column, one row per bin.
labeled_cols = county_binned_labeled_df.columns[3:]

# For each column, the sorted distinct values (interval labels where binning
# succeeded) rendered as strings.
bin_data_lst = [
    [str(label) for label in county_binned_labeled_df[col].sort_values().unique()]
    for col in labeled_cols
]

# Rows are built per column and then transposed; only the first `num_bins` rows are kept.
bin_df = pd.DataFrame(bin_data_lst, index=labeled_cols).T.iloc[:num_bins, :]

# Show bin edge values:
bin_df


Unnamed: 0,Total Population,Population Density (Per Sq. Mile),% Pop Density D,% Children Under 10,% Children Under 10 D,% Children 10 and Older,% Children 10 and Older D,% Male,% Male: Under 5 Years,% Male: 5 to 9 Years,...,Unity Churches,Unity of the Brethren,Universal Fellowship of Metropolitan Community Churches,Unnamed: 66,Vicariate for the Palestinian/Jordanian Orthodox Christian Communities,Vineyard USA,Wayne Trail Missionary Baptist Association,Wesleyan Church,Wisconsin Evangelical Lutheran Synod,Zoroastrian
0,"(116.9, 8951.0]","(-0.060000000000000005, 12.3]","(1.0, 1.8]","(-0.1, 10.0]",1.0,"(-0.1, 9.0]",1.0,"(41.9, 48.6]","(-0.1, 2.4]","(-0.1, 2.5]",...,,,0.0,"(-0.09000000000000001, 0.2]",,0.079999998,,0.01,"(-0.1, 0.06]",
1,"(8951.0, 19060.0]","(12.3, 32.9]","(1.8, 2.6]","(10.0, 11.0]",2.0,"(9.0, 9.9]",2.0,"(48.6, 49.3]","(2.4, 2.8]","(2.5, 2.9]",...,,,0.01,"(0.2, 0.3]",,,,0.02,"(0.06, 0.2]",
2,"(19060.0, 36720.0]","(32.9, 66.9]","(2.6, 3.4]","(11.0, 12.0]",3.0,"(9.9, 10.6]",3.0,"(49.3, 49.9]","(2.8, 3.0]","(2.9, 3.2]",...,,,0.02,"(0.3, 0.6]",,,,0.029999999,"(0.2, 0.7]",
3,"(36720.0, 92676.6]","(66.9, 190.1]","(3.4, 4.2]","(12.0, 13.0]",4.0,"(10.6, 11.5]",4.0,"(49.9, 51.0]","(3.0, 3.4]","(3.2, 3.6]",...,,,0.029999999,"(0.6, 1.3]",,,,0.059999999,"(0.7, 3.2]",
4,"(92676.6, 10040682.0]","(190.1, 71907.4]","(4.2, 5.0]","(13.0, 25.0]",5.0,"(11.5, 24.5]",5.0,"(51.0, 70.9]","(3.4, 12.3]","(3.6, 10.1]",...,,,0.039999999,"(1.3, 18.6]",,,,0.079999998,"(3.2, 23.3]",


#### Import User "Responses" and "Importances":

In [7]:
# Import data:
# Both sheets are read from the workbook in one call; with a list of sheet names
# pd.read_excel returns a dict keyed by sheet name.
quiz_path = "../data/external/user_responses/RDPM_Quiz_modified.xlsx"
quiz_sheets = pd.read_excel(quiz_path, sheet_name=["Upload", "Dropdowns"])
user_df = quiz_sheets["Upload"]
question_feature_ref_df = pd.read_csv("../data/external/question_feature_ref.csv")

# Attach the reference columns (e.g. "county_column_ref") to every answered question.
user_ref_df = user_df.merge(question_feature_ref_df, on="Question", how="left")
user_ref_df["max_importance"] = quiz_sheets["Dropdowns"]["Importance"].max()

# Fill NA values with 0:
user_ref_df[['Response', 'Importance']] = user_ref_df[['Response', 'Importance']].fillna(value=0)
user_ref_df["max_response"] = num_bins  # Note this should change if more bins exist for a question

# Special case for religion question:
# the "Rel-2)" response is used as the county column to match on, and the response
# itself is replaced by num_bins. When the quiz has no "Rel-2)" row, questions without
# a county column reference are dropped instead.
is_religion_question = user_ref_df["Question"] == "Rel-2)"
if is_religion_question.any():
    chosen_religion_col = user_ref_df.loc[is_religion_question, "Response"].values[0]
    user_ref_df.loc[is_religion_question, "county_column_ref"] = chosen_religion_col
    user_ref_df.loc[is_religion_question, "Response"] = num_bins
else:
    user_ref_df = user_ref_df[user_ref_df["county_column_ref"].notna()]

# TODO: an earlier note here mentioned assigning 0 to NA values in the relevant
# religion columns of the county table; no code in this cell does that.
user_ref_df.head()

Unnamed: 0,Question,Measurement,Response,Importance,ILOC,county_column_ref,max_importance,max_response
0,Acs-1),0,0,0.0,0,Population Density (Per Sq. Mile),10.0,5
1,Acs-2),At least,2,10.0,1,% Children Under 10,10.0,5
2,Acs-3),0,5,10.0,2,% Children 10 and Older,10.0,5
3,Acs-4),Equal to,4,10.0,3,% Couples that are Same-Sex,10.0,5
4,Acs-5),0,1,10.0,4,% Population Over 25 with at Least a Bachelor ...,10.0,5


In [8]:
# For columns where the user selects "At least" / "At most", collapse every county
# that satisfies the criterion onto a single value of that column (the column max for
# "At least", the column min for "At most"), so all qualifying counties look alike.
#
# The row masks are built from user_ref_df itself so that they stay aligned with the
# rows being selected even if its index differs from user_df's.
# NOTE(review): "Response" is compared directly with the binned county values;
# confirm that both are expressed on the same scale.
at_least_rows = user_ref_df[user_ref_df["Measurement"] == "At least"]
at_least_cols = at_least_rows["county_column_ref"].to_list()
at_least_vals = at_least_rows["Response"].to_list()

for col_label, threshold in zip(at_least_cols, at_least_vals):
    county_df.loc[county_df[col_label] >= threshold, col_label] = county_df[col_label].max()

at_most_rows = user_ref_df[user_ref_df["Measurement"] == "At most"]
at_most_cols = at_most_rows["county_column_ref"].to_list()
at_most_vals = at_most_rows["Response"].to_list()

# Each "At most" column is compared against its own threshold from at_most_vals.
for col_label, threshold in zip(at_most_cols, at_most_vals):
    county_df.loc[county_df[col_label] <= threshold, col_label] = county_df[col_label].min()

In [9]:
# Relevant columns: the county column referenced by each row of user_ref_df,
# in row order. This list is passed to the matcher as the feature columns.
feature_cols = user_ref_df["county_column_ref"].to_list()

#### Find Closest Matching Counties:

In [10]:
# Number of neighbors requested from the matcher.
n_neighbors = 50

# Match on the selected feature columns; counties are identified by their "FIPS" code.
matched_counties = cm.county_matcher(
    county_df,
    user_ref_df,
    feature_cols,
    target_col="FIPS",
    n_neighbors=n_neighbors,
    metric="euclidean",
)

In [11]:
# Display the result returned by cm.county_matcher.
matched_counties

Unnamed: 0,FIPS,County,State,distance_metric
0,13077,Coweta County,Georgia,8.792895
1,28059,Jackson County,Mississippi,8.794316
2,45063,Lexington County,South Carolina,8.80568
3,48203,Harrison County,Texas,8.807099
4,12091,Okaloosa County,Florida,8.807099
5,12019,Clay County,Florida,8.814193
6,48423,Smith County,Texas,8.81632
7,13157,Jackson County,Georgia,8.819864
8,44003,Kent County,Rhode Island,8.822698
9,25005,Bristol County,Massachusetts,8.822698
