# Relocation Destination Personalization Machination Recommendation

In [1]:
import pandas as pd
import numpy as np
import random
from tqdm import tqdm
import altair as alt
import sys
# Put ../utils/ on the module search path so the project-local module imported
# on the next line can be found (presumably it lives in that directory — confirm).
sys.path.insert(1, '../utils/')
import rdpm_recommender_condensed as rdpm
import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including deprecation notices from pandas/altair.
warnings.filterwarnings('ignore')


In [2]:
# Load the questionnaire workbook: the "Upload" sheet becomes `quiz` and the
# "Dropdowns" sheet becomes `quiz_answers` (presumably the template and the
# allowed answer choices, respectively — both are handed to the random quiz generator).
quiz, quiz_answers = (
    pd.read_excel("../data/external/user_responses/RDPM Questionnaire.xlsx", sheet_name=sheet)
    for sheet in ("Upload", "Dropdowns")
)

In [3]:
# Reference datasets passed to the recommender.
github_url = "https://raw.githubusercontent.com/rabader/relocation-personalization/main/data/"

# Tables that are re-indexed by their FIPS column right after loading.
FIPS_d = pd.read_csv("../data/processed/FIPS_ref.csv").set_index('FIPS')
rel_d = pd.read_csv("../data/processed/Religion_dataset.csv").set_index('FIPS')
ter_d = pd.read_csv("../data/processed/Terrain_dataset.csv").set_index('FIPS')
pol_d = pd.read_csv("../data/processed/Politics.csv").set_index('FIPS')
acs_d = pd.read_csv("../data/interim/ACS_2020_imputed.csv").set_index('FIPS')
hth_d = pd.read_csv("../data/interim/health_imputed.csv").set_index('FIPS')
fbi_d = pd.read_csv("../data/interim/fbi_imputed.csv").set_index('FIPS')

# Tables kept with the default integer index (no FIPS index is set for these).
sd_d = pd.read_csv("../data/interim/School_2018_imputed.csv")
wth_d = pd.read_csv("../data/processed/Weather_etc_State_dataset.csv")
tax_d = pd.read_csv("../data/processed/Taxes.csv")

In [4]:
# Simulation: generate `n_loops` randomized quizzes, run the recommender on each,
# and stack the ranked results into one long table (`FIPS_d_out`).
n_loops = 1000
ranked_frames = []  # one ranked result table per quiz that was scored successfully
failed_runs = []    # (iteration, repr(error)) for every quiz that raised

for i in tqdm(range(n_loops)):
    try:
        randomized_quiz = rdpm.random_quiz_generator(quiz, quiz_answers)
        FIPS_d_out_temp = rdpm.rdpm_recommender_condensed(
            randomized_quiz, FIPS_d, rel_d, sd_d, ter_d, wth_d, acs_d, hth_d, fbi_d, pol_d, tax_d
        )
        # Turn the result's index into an explicit "rank" column
        # (assumes the recommender returns rows best-first with a 0-based index — confirm).
        FIPS_d_out_temp = FIPS_d_out_temp.rename_axis('rank').reset_index()
        FIPS_d_out_temp["rank"] = FIPS_d_out_temp["rank"] + 1  # top rank = 1 instead of 0
    except Exception as err:
        # Stay best-effort (a bad quiz must not abort a long run) but keep a record of it.
        # Catching Exception rather than using a bare except lets Ctrl-C stop the loop.
        failed_runs.append((i, repr(err)))
        continue
    ranked_frames.append(FIPS_d_out_temp)

# Concatenate once at the end instead of growing the frame on every iteration,
# which re-copies all accumulated rows each time.
FIPS_d_out = pd.concat(ranked_frames) if ranked_frames else pd.DataFrame()

if failed_runs:
    # Report skipped quizzes so the effective sample size is known downstream.
    print(
        f"{len(failed_runs)} of {n_loops} random quizzes failed and were skipped; "
        f"first error (iteration {failed_runs[0][0]}): {failed_runs[0][1]}"
    )

100%|██████████| 1000/1000 [56:34<00:00,  3.39s/it]


In [5]:
# Save the simulation results to Parquet so later cells can reload them
# without re-running the (slow) random-quiz loop.
# The file name advertises gzip, so request that codec explicitly; pandas would
# otherwise write snappy-compressed data under a ".gzip" name.
FIPS_d_out.to_parquet(
    "../data/interim/results_from_random_quizzes.parquet.gzip",
    compression="gzip",
    index=False,
)

In [6]:
# Entry point for re-runs: reload the saved simulation results from disk.
# Starting the notebook here skips the randomized quiz processing, which can take a long time.
FIPS_d_out = pd.read_parquet(
    path="../data/interim/results_from_random_quizzes.parquet.gzip",
)

In [7]:
def _rank_frequency(results, max_rank):
    """Tally how often each (County, State) appears at each rank up to `max_rank`.

    Rows with rank above `max_rank` are dropped, the remainder is grouped by
    County/State/rank, and non-null values are counted per remaining column.
    The tally of the "Scores" column is exposed as "Count". The result is
    ordered by rank ascending, then by that count descending.
    """
    within_cutoff = results[results["rank"] <= max_rank]
    tallies = within_cutoff.groupby(["County", "State", "rank"]).count()
    ordered = tallies.sort_values(["rank", "Scores"], ascending=[True, False])
    return ordered.reset_index().rename(columns={"Scores": "Count"})


top_50 = _rank_frequency(FIPS_d_out, 50)
top_20 = _rank_frequency(FIPS_d_out, 20)
top_10 = _rank_frequency(FIPS_d_out, 10)

In [15]:
# Lift Altair's default cap on the number of rows a chart may embed.
alt.data_transformers.disable_max_rows()

# Scatter of distinct states (x) against distinct counties (y) in the top-10 table,
# coloured by rank. Both axis domains are fixed by hand.
circles = (
    alt.Chart(top_10, title="Distinct Counties and States in Top 10, from 1000 Random Quizzes")
    .mark_circle()
    .encode(
        x=alt.X(
            'distinct(State)',
            title="# of Distinct States",
            bin=False,
            scale=alt.Scale(domain=[47, 50]),
            axis=alt.Axis(tickMinStep=1),
        ),
        y=alt.Y(
            'distinct(County)',
            title="# of Distinct Counties",
            scale=alt.Scale(domain=[500, 600]),
        ),
        color=alt.Color("rank:N", scale=alt.Scale(scheme="turbo"), title="Rank"),
    )
    .properties(width=800, height=300)
)

# Same encodings as the circles, drawn as a rank label nudged 7 px to the right of each point.
text = circles.mark_text(align='left', baseline='middle', dx=7).encode(text='rank')

# Layer the labels over the points; as the cell's last expression this renders the chart.
circles + text