## Step 1: Read in local data

In [92]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

# Load the episode-level dataset (one row per episode, including any
# coordinates and distances that were already filled in on a previous run).
hhi_df = pd.read_csv(
    '/Users/jonzimmerman/Desktop/Data Projects/House Hunters International/data/data_w_lat_lon_v2.csv',
    encoding='latin-1',
)

# Preview the first two rows to confirm the load worked.
hhi_df.head(2)

Unnamed: 0,index,ep_summary,air_date,ep_nums,ep_title,episode,season,year,MoveFromCity,MoveFromCountry,...,Origin,Destination,GeoCategory,lat_orig,lon_orig,lat_dest,lon_dest,distance_km,Skip,InputDataKey
0,1.0,"After attending college in the United States, ...",20-Feb-06,S01E01,Planting New Costa Rican Roots,1.0,1.0,2006.0,,United States,...,,"Escazu, Costa Rica",Missing Origin City,,,,,,Can not get data,0
1,2.0,Real estate agent Michelle Owens adores her li...,17-Apr-06,S01E02,Belize Bound,2.0,1.0,2006.0,"Charleston, South Carolina",United States,...,"Charleston, South Carolina",,Missing Destination City,,,,,,Can not get data,0


## Step 2: Create a key for rows that still need coordinates and distance imputed

In [93]:
#Create key
# A row is flagged (InputDataKey = 1) only when:
#   * GeoCategory is "All",
#   * Skip is "Can get data", and
#   * all four coordinate columns are still empty.
# Every other row gets InputDataKey = 0 and is passed through untouched.
hhi_df['InputDataKey'] = np.where(
    (hhi_df['GeoCategory'] == "All") & (hhi_df['Skip'] == "Can get data") &
    (hhi_df['lat_orig'].isnull()) & (hhi_df['lon_orig'].isnull()) &
    (hhi_df['lat_dest'].isnull()) & (hhi_df['lon_dest'].isnull()),
    1, 0)

#Break out datasets
# Take explicit copies: later cells assign new column values onto hhi_df1, and
# writing into a filtered slice of hhi_df is the pattern pandas flags with
# SettingWithCopyWarning (hidden here because warnings are silenced globally).
hhi_df1 = hhi_df[hhi_df['InputDataKey'] == 1].copy()
hhi_df2 = hhi_df[hhi_df['InputDataKey'] == 0].copy()

#Print # rows per dataset
print('# of rows to be imputed: ',hhi_df1.shape[0])
print('# of rows left alone: ',hhi_df2.shape[0])

# of rows to be imputed:  44
# of rows left alone:  2356


## Step 3: Impute coordinates and distances

In [94]:
#Impute latitude and longitude coordinates
from geopy.geocoders import Photon

# Single shared geocoder client used by extract_coordinates below.
# NOTE(review): timeout=None presumably disables geopy's request timeout, so a
# stalled request could block the cell indefinitely - confirm this is intended.
geolocator = Photon(
    user_agent="measurements",
    timeout=None,
)

def extract_coordinates(location):
    """Geocode a place name using the module-level ``geolocator``.

    Parameters
    ----------
    location : str
        Free-text place name, e.g. ``"Nairobi, Kenya"``.

    Returns
    -------
    tuple
        ``(latitude, longitude)`` taken from the geocoder's result, or
        ``(None, None)`` when ``location`` is missing/blank or the geocoder
        returns no match.
    """
    # Missing cells arrive as NaN/None (or an empty string). Do not send them
    # to the service: they carry no place name, and a NaN could end up being
    # looked up as literal text instead of being treated as "no location".
    if not isinstance(location, str) or not location.strip():
        return None, None

    geocode_result = geolocator.geocode(location, language='en')
    if geocode_result is None:
        return None, None
    return geocode_result.latitude, geocode_result.longitude

def _geocode_column(places):
    """Geocode every value of the Series `places`.

    Returns a float64 DataFrame with columns ['lat', 'lon'] on the same index;
    lookups that produced no result become NaN. The frame always has two
    columns, even when `places` is empty.
    """
    coords = [extract_coordinates(place) for place in places]
    return pd.DataFrame(coords, index=places.index, columns=['lat', 'lon']).astype('float64')


# Built explicitly (rather than .apply(pd.Series)) so the two-column assignment
# also works when there are zero rows left to impute, e.g. on a later re-run.
hhi_df1[['lat_orig', 'lon_orig']] = _geocode_column(hhi_df1['Origin']).to_numpy()
hhi_df1[['lat_dest', 'lon_dest']] = _geocode_column(hhi_df1['Destination']).to_numpy()

#Impute distance between coordinates
import h3

# h3-py 4.x renamed point_dist to great_circle_distance; use whichever the
# installed version provides.
_h3_distance = getattr(h3, 'great_circle_distance', None) or h3.point_dist


def _distance_km(lat_orig, lon_orig, lat_dest, lon_dest):
    """Distance in km between origin and destination; NaN if any coordinate is missing."""
    if any(pd.isna(value) for value in (lat_orig, lon_orig, lat_dest, lon_dest)):
        return np.nan
    return _h3_distance((lat_orig, lon_orig), (lat_dest, lon_dest), unit='km')


hhi_df1['distance_km'] = pd.Series(
    [
        _distance_km(lat_o, lon_o, lat_d, lon_d)
        for lat_o, lon_o, lat_d, lon_d in zip(
            hhi_df1['lat_orig'], hhi_df1['lon_orig'],
            hhi_df1['lat_dest'], hhi_df1['lon_dest'],
        )
    ],
    index=hhi_df1.index,
    dtype='float64',
)

hhi_df1.head()

Unnamed: 0,index,ep_summary,air_date,ep_nums,ep_title,episode,season,year,MoveFromCity,MoveFromCountry,...,Origin,Destination,GeoCategory,lat_orig,lon_orig,lat_dest,lon_dest,distance_km,Skip,InputDataKey
1444,1445.0,A Minnesota couple is excited to find new adve...,24-Nov-17,S115E02,From Minnesota to East Africa,2.0,115.0,2017.0,"Stacy, Minnesota",United States,...,"Stacy, Minnesota","Nairobi, Kenya",All,45.398021,-92.98744,-1.283253,36.817245,13091.295013,Can get data,1
1446,1447.0,"After five years of traveling the globe, a yog...",1-Dec-17,S115E04,Peace in Porsgrunn,4.0,115.0,2017.0,"Orange County, California",United States,...,"Orange County, California","Porsgrunn, Norway",All,33.750038,-117.870493,59.103735,9.736354,8619.218185,Can get data,1
1447,1448.0,A Pennsylvania native jumps at the opportunity...,11-Jan-18,S115E05,Brave New Warsaw,5.0,115.0,2018.0,"Phoenixville, Pennsylvania",United States,...,"Phoenixville, Pennsylvania","Warsaw, Poland",All,40.130382,-75.514913,52.233717,21.071432,6992.338493,Can get data,1
1449,1450.0,After a family sailing adventure in the Caribb...,20-Nov-17,S115E07,Oceanside in Manta,7.0,115.0,2017.0,"Heber City, Utah",United States,...,"Heber City, Utah","Manta, Ecuador",All,40.506463,-111.413296,-0.951761,-80.713859,5558.832234,Can get data,1
1450,1451.0,A couple wants to buy their forever home befor...,7-Dec-17,S115E08,Lands End Paradise,8.0,115.0,2017.0,"Little Rock, Arkansas",United States,...,"Little Rock, Arkansas","Cabo San Lucas, Mexico",All,34.746507,-92.289627,22.893888,-109.92006,2159.057737,Can get data,1


## Step 4: Recombine dataframes

In [95]:
# Stack the imputed rows back on top of the untouched rows, then order the
# combined frame by the 'index' column.
full_df = (
    pd.concat([hhi_df1, hhi_df2])
    .sort_values(by='index')
)

# Report (rows, columns) of the recombined frame, then preview two rows.
print(full_df.shape)
full_df.head(2)

(2400, 23)


Unnamed: 0,index,ep_summary,air_date,ep_nums,ep_title,episode,season,year,MoveFromCity,MoveFromCountry,...,Origin,Destination,GeoCategory,lat_orig,lon_orig,lat_dest,lon_dest,distance_km,Skip,InputDataKey
0,1.0,"After attending college in the United States, ...",20-Feb-06,S01E01,Planting New Costa Rican Roots,1.0,1.0,2006.0,,United States,...,,"Escazu, Costa Rica",Missing Origin City,,,,,,Can not get data,0
1,2.0,Real estate agent Michelle Owens adores her li...,17-Apr-06,S01E02,Belize Bound,2.0,1.0,2006.0,"Charleston, South Carolina",United States,...,"Charleston, South Carolina",,Missing Destination City,,,,,,Can not get data,0


## Step 5: Output results

In [96]:
# This overwrites the same file that Step 1 reads with encoding='latin-1'.
# Writing it back as utf-8 would make the next run decode the file with the
# wrong codec and garble any non-ASCII characters a little more each time, so
# write with the same encoding that was used to read it.
full_df.to_csv('/Users/jonzimmerman/Desktop/Data Projects/House Hunters International/data/data_w_lat_lon_v2.csv', encoding='latin-1', index=False)