## Step 1: Read in local data

In [189]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

# Episode table with (partially filled) origin/destination coordinates.
HHI_CSV_PATH = '/Users/jonzimmerman/Desktop/Data Projects/House Hunters International/data/data_w_lat_lon_v2.csv'

# Step 5 of this notebook writes this same path back out as UTF-8. Decoding a
# UTF-8 file as latin-1 never raises, it just silently garbles every non-ASCII
# character, so try UTF-8 first and only fall back to latin-1 when the bytes
# are not valid UTF-8 (i.e. the file is still in its older latin-1 form).
try:
    hhi_df = pd.read_csv(HHI_CSV_PATH, encoding='utf-8')
except UnicodeDecodeError:
    hhi_df = pd.read_csv(HHI_CSV_PATH, encoding='latin-1')
hhi_df.head(2)

Unnamed: 0,index,ep_summary,air_date,ep_nums,ep_title,episode,season,year,MoveFromCity,MoveFromCountry,...,Origin,Destination,GeoCategory,lat_orig,lon_orig,lat_dest,lon_dest,distance_km,Skip,InputDataKey
0,1.0,"After attending college in the United States, ...",20-Feb-06,S01E01,Planting New Costa Rican Roots,1.0,1.0,2006.0,,United States,...,,"Escazu, Costa Rica",Missing Origin City,,,,,,Can not get data,0
1,2.0,Real estate agent Michelle Owens adores her li...,17-Apr-06,S01E02,Belize Bound,2.0,1.0,2006.0,"Charleston, South Carolina",United States,...,"Charleston, South Carolina",,Missing Destination City,,,,,,Can not get data,0


## Step 2: Flag the rows whose coordinates still need to be imputed

In [190]:
#Create key
# InputDataKey = 1 for rows flagged as geocodable (GeoCategory "All", Skip
# "Can get data") whose four coordinate columns are all still empty; 0 otherwise.
hhi_df['InputDataKey'] = np.where(
    (hhi_df['GeoCategory']=="All") & (hhi_df['Skip']=="Can get data") &
    hhi_df[['lat_orig', 'lon_orig', 'lat_dest', 'lon_dest']].isnull().all(axis=1)
    ,1,0)

#Break out datasets
# Take explicit copies: hhi_df1 gets new column values assigned to it later in
# the notebook, and writing into a boolean-mask slice of hhi_df is chained
# assignment (SettingWithCopyWarning, currently silenced by filterwarnings).
hhi_df1 = hhi_df[hhi_df['InputDataKey']==1].copy()
hhi_df2 = hhi_df[hhi_df['InputDataKey']==0].copy()

#Print # rows per dataset
print('# of rows to be imputed: ',hhi_df1.shape[0])
print('# of rows left alone: ',hhi_df2.shape[0])

# of rows to be imputed:  9
# of rows left alone:  2392


## Step 3: Impute coordinates and distances

In [191]:
#Impute latitude and longitude coordinates
from geopy.geocoders import Photon
# Shared geocoder client; extract_coordinates() below sends every lookup through it.
# NOTE(review): timeout=None presumably disables the request timeout, so a
# stalled request could hang the cell -- confirm against the geopy docs.
geolocator = Photon(user_agent="measurements",timeout=None)

def extract_coordinates(location):
    """Geocode a place name with the module-level ``geolocator``.

    Parameters
    ----------
    location : str
        Free-text place name, e.g. ``"Granada, Spain"``.

    Returns
    -------
    tuple
        ``(latitude, longitude)`` of the geocoder's result, or ``(None, None)``
        when ``location`` is missing/blank or the geocoder returns no match.
    """
    # Missing or blank cells cannot be geocoded; skip the network round-trip
    # instead of sending a query like "nan" to the service.
    if location is None or pd.isna(location) or not str(location).strip():
        return None, None

    geocode_result = geolocator.geocode(location, language='en')
    if geocode_result is None:
        return None, None
    return geocode_result.latitude, geocode_result.longitude

# Geocode every origin, then every destination (one lookup per row each).
origin_coords = [extract_coordinates(place) for place in hhi_df1['Origin']]
dest_coords = [extract_coordinates(place) for place in hhi_df1['Destination']]

# Build explicit two-column float frames on hhi_df1's own index. Unlike
# `.apply(pd.Series)`, this still yields two columns when hhi_df1 has no rows
# (e.g. everything was already imputed on a previous run), and failed lookups
# (None, None) become NaN instead of leaving object-dtype columns.
hhi_df1[['lat_orig', 'lon_orig']] = pd.DataFrame(
    origin_coords, index=hhi_df1.index, columns=['lat_orig', 'lon_orig']
).astype('float64')
hhi_df1[['lat_dest', 'lon_dest']] = pd.DataFrame(
    dest_coords, index=hhi_df1.index, columns=['lat_dest', 'lon_dest']
).astype('float64')

#Impute distance between coordinates
import h3

# h3-py 3.x exposes this as point_dist; 4.x renamed it to great_circle_distance.
# Both take two (lat, lng) pairs and default to kilometres.
h3_distance = getattr(h3, 'great_circle_distance', None) or h3.point_dist

# Walk the four coordinate columns row by row. A row whose geocoding failed
# (any coordinate missing) gets NaN rather than being passed to h3, and an
# empty hhi_df1 simply produces an empty column.
distances_km = []
for lat_o, lon_o, lat_d, lon_d in hhi_df1[
    ['lat_orig', 'lon_orig', 'lat_dest', 'lon_dest']
].itertuples(index=False, name=None):
    if pd.isna([lat_o, lon_o, lat_d, lon_d]).any():
        distances_km.append(np.nan)
    else:
        distances_km.append(h3_distance((lat_o, lon_o), (lat_d, lon_d)))

hhi_df1['distance_km'] = pd.Series(distances_km, index=hhi_df1.index, dtype='float64')

hhi_df1.head()

Unnamed: 0,index,ep_summary,air_date,ep_nums,ep_title,episode,season,year,MoveFromCity,MoveFromCountry,...,Origin,Destination,GeoCategory,lat_orig,lon_orig,lat_dest,lon_dest,distance_km,Skip,InputDataKey
2270,2271.0,A Canadian travel writer and an Aussie tour gu...,3-Nov-22,S178E09,First-Timers Shack Up Down Under,9.0,178.0,2022.0,Christchurch,New Zealand,...,"Christchurch, New Zealand","Mapleton, Australia",All,-43.530955,172.636645,-26.62414,152.866157,2588.768691,Can get data,1
2276,2277.0,A budding romance leads a young man to leave h...,8-Nov-22,S179E02,"Rushing Toward Romance in Phnom Penh, Cambodia",2.0,179.0,2022.0,Rohrbach,Germany,...,"Rohrbach, Germany","Phnom Penh, Cambodia",All,49.583545,7.257772,11.568271,104.922443,9574.231679,Can get data,1
2278,2279.0,"After two babies and a health scare, a family ...",15-Nov-22,S179E04,"A Castle to Call Home in Fife, Scotland",4.0,179.0,2022.0,"San Francisco, California",United States,...,"San Francisco, California","Fife, Scotland",All,37.779026,-122.419906,56.333333,-3.0,8102.10148,Can get data,1
2279,2280.0,A couple looks to turn loss into inspiration a...,22-Nov-22,S179E05,Turning Tragedy into Inspiration in Mexico City,5.0,179.0,2022.0,"Houston, Texas",United States,...,"Houston, Texas","Mexico City, Mexico",All,29.758938,-95.367697,19.43263,-99.133178,1209.435995,Can get data,1
2280,2281.0,Recent college grads are moving from Oregon to...,4-Nov-22,S179E06,De Nada Granada,6.0,179.0,2022.0,"Portland, Oregon",United States,...,"Portland, Oregon","Granada, Spain",All,45.520247,-122.674194,37.173499,-3.599534,8984.97534,Can get data,1


## Step 4: Recombine dataframes

In [192]:
# Stack the freshly imputed rows back on top of the untouched rows, then
# restore the original episode order using the 'index' column.
full_df = (
    pd.concat([hhi_df1, hhi_df2])
    .sort_values(by='index')
)
print(full_df.shape)
full_df.head(2)

(2401, 23)


Unnamed: 0,index,ep_summary,air_date,ep_nums,ep_title,episode,season,year,MoveFromCity,MoveFromCountry,...,Origin,Destination,GeoCategory,lat_orig,lon_orig,lat_dest,lon_dest,distance_km,Skip,InputDataKey
0,1.0,"After attending college in the United States, ...",20-Feb-06,S01E01,Planting New Costa Rican Roots,1.0,1.0,2006.0,,United States,...,,"Escazu, Costa Rica",Missing Origin City,,,,,,Can not get data,0
1,2.0,Real estate agent Michelle Owens adores her li...,17-Apr-06,S01E02,Belize Bound,2.0,1.0,2006.0,"Charleston, South Carolina",United States,...,"Charleston, South Carolina",,Missing Destination City,,,,,,Can not get data,0


## Step 5: Output results

In [194]:
# Persist the recombined table. This writes to the same path the notebook
# loaded in Step 1, so the input file is replaced in place.
full_df.to_csv(
    '/Users/jonzimmerman/Desktop/Data Projects/House Hunters International/data/data_w_lat_lon_v2.csv',
    index=False,
    encoding='utf-8',
)
# Alternative target that keeps the v2 file untouched:
#full_df.to_csv('/Users/jonzimmerman/Desktop/Data Projects/House Hunters International/data/data_w_lat_lon_v3.csv', encoding='utf-8', index=False)