## Step 1: Read in local data

In [76]:
import pandas as pd
import numpy as np
import warnings
# Silence all warnings for the remainder of the notebook.
warnings.filterwarnings('ignore')

# Load the episode table; the file is decoded as latin-1.
hhi_df = pd.read_csv(
    '/Users/jonzimmerman/Desktop/Data Projects/House Hunters International/data/data_w_lat_lon_v2.csv',
    encoding='latin-1',
)
# Preview the first two rows.
hhi_df.head(2)

Unnamed: 0,index,ep_summary,air_date,ep_nums,ep_title,episode,season,year,MoveFromCity,MoveFromCountry,...,Origin,Destination,GeoCategory,lat_orig,lon_orig,lat_dest,lon_dest,distance_km,Skip,InputDataKey
0,1.0,"After attending college in the United States, ...",20-Feb-06,S01E01,Planting New Costa Rican Roots,1.0,1.0,2006.0,,United States,...,,"Escazu, Costa Rica",Missing Origin City,,,,,,Can not get data,0
1,2.0,Real estate agent Michelle Owens adores her li...,17-Apr-06,S01E02,Belize Bound,2.0,1.0,2006.0,"Charleston, South Carolina",United States,...,"Charleston, South Carolina",,Missing Destination City,,,,,,Can not get data,0


## Step 2: Create a key flagging rows whose coordinates still need to be imputed

In [77]:
#Create key
# A row is flagged (InputDataKey = 1) when GeoCategory is "All", Skip is
# "Can get data", and all four coordinate columns are still empty.
pending_coord_cols = ['lat_orig', 'lon_orig', 'lat_dest', 'lon_dest']
needs_imputation = (
    (hhi_df['GeoCategory'] == "All")
    & (hhi_df['Skip'] == "Can get data")
    & hhi_df[pending_coord_cols].isnull().all(axis=1)
)
hhi_df['InputDataKey'] = np.where(needs_imputation, 1, 0)

#Break out datasets
# Take explicit copies: later cells assign new column values into hhi_df1, and
# writing into a boolean-mask slice of hhi_df is the chained-assignment pattern
# pandas warns about (the warning is hidden by the global filter above).
hhi_df1 = hhi_df[hhi_df['InputDataKey'] == 1].copy()
hhi_df2 = hhi_df[hhi_df['InputDataKey'] == 0].copy()

#Print # rows per dataset
print('# of rows to be imputed: ',hhi_df1.shape[0])
print('# of rows left alone: ',hhi_df2.shape[0])

# of rows to be imputed:  14
# of rows left alone:  2386


## Step 3: Impute coordinates and distances

In [78]:
#Impute latitude and longitude coordinates
from geopy.geocoders import Photon

# Single Photon client shared by every geocoding call in this notebook.
geolocator = Photon(
    user_agent="measurements",
    timeout=None,
)

def extract_coordinates(location):
    """Geocode a place name into a (latitude, longitude) pair.

    Parameters
    ----------
    location : str
        Free-text place name, e.g. "Atlanta, Georgia".

    Returns
    -------
    tuple
        ``(latitude, longitude)`` taken from the geocoder's result, or
        ``(None, None)`` when ``location`` is not a non-blank string or the
        geocoder returns no match.
    """
    # Missing values (None / NaN) and blank strings cannot be geocoded, so
    # return early instead of sending a meaningless query to the service.
    if not isinstance(location, str) or not location.strip():
        return None, None

    geocode_result = geolocator.geocode(location, language='en')
    if geocode_result is None:
        return None, None
    return geocode_result.latitude, geocode_result.longitude

# Work on an independent copy so the column assignments below never write
# into a filtered slice of hhi_df.
hhi_df1 = hhi_df1.copy()

# Geocode each place-name column into its latitude / longitude columns.
# Building the frame from a list with explicit column names keeps both
# columns present even when there are no rows left to impute.
for place_col, lat_col, lon_col in (
    ('Origin', 'lat_orig', 'lon_orig'),
    ('Destination', 'lat_dest', 'lon_dest'),
):
    coords = [extract_coordinates(place) for place in hhi_df1[place_col]]
    coords_df = pd.DataFrame(
        coords, index=hhi_df1.index, columns=[lat_col, lon_col]
    ).astype('float64')  # failed lookups (None) become NaN
    hhi_df1[lat_col] = coords_df[lat_col]
    hhi_df1[lon_col] = coords_df[lon_col]

#Impute distance between coordinates
import h3

# h3-py 4.x renamed point_dist() to great_circle_distance(); use whichever
# function the installed version provides.
_great_circle_km = getattr(h3, 'great_circle_distance', None) or h3.point_dist

# Rows where any coordinate is missing (geocoder found no match) get NaN
# rather than being passed to the distance function.
distances_km = [
    _great_circle_km((lat_o, lon_o), (lat_d, lon_d), unit='km')
    if pd.notna([lat_o, lon_o, lat_d, lon_d]).all() else np.nan
    for lat_o, lon_o, lat_d, lon_d in hhi_df1[
        ['lat_orig', 'lon_orig', 'lat_dest', 'lon_dest']
    ].itertuples(index=False, name=None)
]
hhi_df1['distance_km'] = pd.Series(distances_km, index=hhi_df1.index, dtype='float64')

hhi_df1.head()

Unnamed: 0,index,ep_summary,air_date,ep_nums,ep_title,episode,season,year,MoveFromCity,MoveFromCountry,...,Origin,Destination,GeoCategory,lat_orig,lon_orig,lat_dest,lon_dest,distance_km,Skip,InputDataKey
1703,1704.0,After falling in love with Poland and its cult...,27-Feb-19,S135E01,"Falling in Love with Wroclaw, Poland",1.0,135.0,2019.0,"Atlanta, Georgia",United States,...,"Atlanta, Georgia","Wroclaw, Poland",All,33.748992,-84.390264,51.108978,17.032669,7871.556517,Can get data,1
1706,1707.0,An energetic dance instructor has decided to l...,5-Feb-19,S135E04,A Taste of a New Life in Prague,4.0,135.0,2019.0,"Cleveland, Ohio",United States,...,"Cleveland, Ohio","Prague, Czech Republic",All,41.499657,-81.693677,50.086423,14.415677,6983.098955,Can get data,1
1707,1708.0,A craving for adventure brings a family of thr...,21-Feb-19,S135E05,A Family Adventure in Budapest,5.0,135.0,2019.0,"San Francisco, California",United States,...,"San Francisco, California","Budapest, Hungary",All,37.779026,-122.419906,47.48139,19.146094,9796.555434,Can get data,1
1722,1723.0,A Florida couple looks to relocate internation...,19-Feb-19,S136E06,"Wine Not Move to Queretaro, Mexico",6.0,136.0,2019.0,"Winter Park, Florida",United States,...,"Winter Park, Florida","Queretaro, Mexico",All,28.597771,-81.351026,20.592774,-100.390225,2117.089246,Can get data,1
1724,1725.0,A couple's dream of living abroad comes true w...,28-Feb-19,S136E08,Beach or Bust,8.0,136.0,2019.0,"Denver, Colorado",United States,...,"Denver, Colorado","Melbourne, Australia",All,39.739236,-104.984862,-37.814245,144.963173,14109.235734,Can get data,1


## Step 4: Recombine dataframes

In [79]:
# Stack the imputed rows and the untouched rows back together, then order
# the combined frame by its 'index' column.
full_df = pd.concat([hhi_df1, hhi_df2])
full_df = full_df.sort_values(by='index')
print(full_df.shape)
full_df.head(2)

(2400, 23)


Unnamed: 0,index,ep_summary,air_date,ep_nums,ep_title,episode,season,year,MoveFromCity,MoveFromCountry,...,Origin,Destination,GeoCategory,lat_orig,lon_orig,lat_dest,lon_dest,distance_km,Skip,InputDataKey
0,1.0,"After attending college in the United States, ...",20-Feb-06,S01E01,Planting New Costa Rican Roots,1.0,1.0,2006.0,,United States,...,,"Escazu, Costa Rica",Missing Origin City,,,,,,Can not get data,0
1,2.0,Real estate agent Michelle Owens adores her li...,17-Apr-06,S01E02,Belize Bound,2.0,1.0,2006.0,"Charleston, South Carolina",United States,...,"Charleston, South Carolina",,Missing Destination City,,,,,,Can not get data,0


In [80]:
#full_df[full_df['Origin']=="Corvallis, Oregon"]

## Step 5: Output results

In [81]:
# This overwrites the same file that the first cell reads with
# encoding='latin-1'. Write it back with that same encoding: saving as UTF-8
# would make the next run decode multi-byte UTF-8 sequences as latin-1 and
# garble any non-ASCII characters a little more on every rerun.
full_df.to_csv(
    '/Users/jonzimmerman/Desktop/Data Projects/House Hunters International/data/data_w_lat_lon_v2.csv',
    encoding='latin-1',
    index=False,
)