In [1]:
%matplotlib inline
import pandas as pd
import seaborn as sns
import numpy as np
import pickle

# import visualization tools
import umap
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import seaborn as sns
from wordcloud import WordCloud, STOPWORDS
import spacy

from sklearn.metrics.pairwise import cosine_similarity

In [26]:
# import pickled data
df_lda_cv = pd.read_pickle('df_lda_cv.pkl')
data_na = pd.read_pickle('data_na.pkl')
trail_df = pd.read_pickle('trail_agg_df.pickle')
doc_topic_lda_cv = pd.read_pickle('doc_topic_lda_cv.pkl')

In [4]:
# create dataframe from doc_topic array
a_df = pd.DataFrame(doc_topic_lda_cv,
             index = data_na.index,
             columns = ["topic_1","topic_2","topic_3","topic_4",
                       "topic_5","topic_6"])

In [7]:
trail_df

Unnamed: 0,trail_name,elevation_gain,hike_attributes,hike_difficulty,hike_region,num_reviews,route_type,stars,total_distance
0,Tin Mine Canyon Trail,715 feet,"[dogs on leash, hiking, walking, partially pav...",MODERATE,Cleveland National Forest,276,Out & Back,4.2,4.5 miles
1,Fern Canyon Scenic Trail,826 feet,"[camping, hiking, nature trips, walking, bird ...",MODERATE,Van Damme Beach State Park,144,Loop,4.5,8.3 miles
2,Folsom Trail,265 feet,"[dogs on leash, kid friendly, hiking, mountain...",MODERATE,Folsom Lake State Recreation Area,121,Out & Back,4.1,9.2 miles
3,Santa Paula Canyon and Punch Bowls Trail,787 feet,"[dogs on leash, camping, hiking, horseback rid...",MODERATE,Los Padres National Forest,631,Out & Back,4.4,7.0 miles
4,Schabarum Trail,305 m,"[dogs on leash, hiking, mountain biking, natur...",MODERATE,Walnut Ranch Park,196,Loop,4.3,5.8 km
...,...,...,...,...,...,...,...,...,...
1020,Griffith Park Old Zoo Loop,419 feet,"[hiking, nature trips, walking, bird watching,...",easy,Griffith Park,193,Loop,4.2,2.7 miles
1021,Brown Mountain Dam Waterfall,656 feet,"[dogs on leash, backpacking, hiking, mountain ...",moderate,Angeles National Forest,123,Out & Back,4.5,8.4 miles
1022,San Diego Sea to Sea Trail via Camino Ruiz Park,403 feet,"[dogs on leash, hiking, mountain biking, bird ...",easy,Los Penasquitos Canyon Preserve,142,Out & Back,4.2,4.8 miles
1023,Partington Cove Trail,347 feet,"[hiking, nature trips, beach, river, views, wa...",moderate,"Big Sur, California",205,Out & Back,4.6,1.1 miles


In [8]:
# functions to clean trail_df data

def elevation(row):
    '''converts elevation in meters to feet'''
    row = row.replace(',','').strip().replace(' m','')
    row = int(row)
    row = row * 3
    return row

def distance(row):
    '''converts distance in km to miles'''
    row = row.strip().replace(',','').replace(' km','')
    row = float(row)
    row = round((row*0.621371),1)
    return row

In [9]:
trail_df['hike_difficulty'] = trail_df['hike_difficulty'].str.lower()

In [10]:
trail_df['difficulty'] = trail_df['hike_difficulty'].map({'easy': 1, 'moderate': 2,'hard':3})

In [11]:
trail_df['distance'] = trail_df['total_distance'].str.replace(' miles','').str.strip().str.replace(',','')

In [12]:
trail_df.distance = trail_df.distance.apply(lambda x: distance(x) if 'km' in x else x)

In [13]:
trail_df['elevation'] = trail_df['elevation_gain'].str.strip().str.replace(' feet','').str.replace(',','')

In [14]:
trail_df.elevation = trail_df.elevation.apply(lambda x: elevation(x) if 'm' in x else x)

In [15]:
trail_sm = trail_df[['trail_name','distance','difficulty','elevation']]

In [16]:
trail_sm = trail_sm.set_index('trail_name')

##  Scale new features

In [17]:
from sklearn import preprocessing
# convert dataframe to array for scaling
X_train = trail_sm.to_numpy(dtype='float')

In [18]:
min_max_scaler = preprocessing.MinMaxScaler()

In [19]:
X_train_minmax = min_max_scaler.fit_transform(X_train)
X_train_minmax

array([[1.66044002e-03, 5.00000000e-01, 1.65647269e-03],
       [3.09445639e-03, 5.00000000e-01, 1.91471492e-03],
       [3.43409185e-03, 5.00000000e-01, 6.09544726e-04],
       ...,
       [1.77365184e-03, 0.00000000e+00, 9.30602635e-04],
       [3.77372731e-04, 5.00000000e-01, 8.00318266e-04],
       [5.35869278e-03, 1.00000000e+00, 6.93764264e-03]])

In [20]:
# convert scaled array back to dataframe
trail_s = pd.DataFrame(X_train_minmax,
             index = trail_sm.index,
             columns = ['distance','difficulty','elevation'])

In [21]:
trail_s

Unnamed: 0_level_0,distance,difficulty,elevation
trail_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Tin Mine Canyon Trail,0.001660,0.5,0.001656
Fern Canyon Scenic Trail,0.003094,0.5,0.001915
Folsom Trail,0.003434,0.5,0.000610
Santa Paula Canyon and Punch Bowls Trail,0.002604,0.5,0.001824
Schabarum Trail,0.001321,0.5,0.002122
...,...,...,...
Griffith Park Old Zoo Loop,0.000981,0.0,0.000968
Brown Mountain Dam Waterfall,0.003132,0.5,0.001519
San Diego Sea to Sea Trail via Camino Ruiz Park,0.001774,0.0,0.000931
Partington Cove Trail,0.000377,0.5,0.000800


### Merge new features with doc topic df

In [23]:
trail_cosine_df = a_df.join(trail_s,how='left')

In [24]:
trail_cosine_df.index = trail_cosine_df.index.str.strip()

In [27]:
df_lda_cv['topic'] = df_lda_cv.idxmax(axis=1)

In [43]:
df_lda_cv['topic'] = df_lda_cv['topic'].str.replace('topic_','')

In [28]:
df_lda_cv

Unnamed: 0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic
San Clemente State Beach,0.016732,0.001399,0.376388,0.001399,0.602682,0.001400,topic_5
Sostomo and Deer Valley Loop Trail,0.000254,0.733161,0.043761,0.000254,0.171032,0.051538,topic_2
Abalone Cove Trail,0.000222,0.000222,0.392668,0.000222,0.163983,0.442682,topic_6
Abeja Loop Trail,0.013848,0.750046,0.006227,0.000389,0.229101,0.000389,topic_2
Ahwingna and Native Oak Loop Trail,0.000128,0.524869,0.000128,0.010950,0.462106,0.001820,topic_2
...,...,...,...,...,...,...,...
Wrights Lake to Grouse Lake,0.679103,0.031326,0.022949,0.088833,0.000425,0.177364,topic_1
Zabriskie Point,0.000385,0.159699,0.777425,0.000386,0.061720,0.000386,topic_3
Zanja Peak Trail,0.000275,0.872422,0.000272,0.000274,0.126484,0.000273,topic_2
Zumwalt Meadow And Roaring River Falls,0.297922,0.000359,0.061063,0.014261,0.214505,0.411890,topic_6


In [29]:
trail_cosine_df = trail_cosine_df.loc[~trail_cosine_df.index.duplicated(keep='first')]

In [30]:
trail_cosine_df.loc['Mount Whitney Trail']

topic_1       0.052074
topic_2       0.000012
topic_3       0.000012
topic_4       0.943545
topic_5       0.004345
topic_6       0.000012
distance      0.008453
difficulty    1.000000
elevation     0.015478
Name: Mount Whitney Trail, dtype: float64

## Recommender

In [31]:
doc_topic2 = trail_cosine_df.to_numpy(dtype='float')

In [372]:
cosine_sim_lda = cosine_similarity(doc_topic_lda_cv)

In [34]:
def recommend(trail, num_recommends, cosine_sim = trail_cosine_df, df = trail_cosine_df):
    '''input: trail name, number of trails to return, cosine similarity matrix, dataframe to use as index
       output: recommended trails, cosine similarity score, topic'''
    indices = pd.Series(list(df.index))
    recommended_trails = []
    idx = indices[indices == trail].index[0]
    score_series = pd.Series(cosine_sim[idx]).sort_values(ascending = False)
    top_n_indices = list(zip(score_series.iloc[1:num_recommends].index, 
                             score_series.iloc[1:num_recommends].values))
    
    for i, score in top_n_indices:
        topic = df_lda_cv.iloc[i]['topic']
        recommended_trails.append((list(indices)[i], score, topic))
        
    return recommended_trails

In [373]:
# CV - LDA
# order without the added features from trail_df
recommend("Mount Whitney Trail",10,cosine_sim=cosine_sim_lda, df=data_na)

[('Clouds Rest Trail from Yosemite Valley', 0.9974778213888202),
 ('Mount Muir via the Mount Whitney Trail', 0.9961405143895997),
 ('Half Dome Trail', 0.9954829403553808),
 ('San Jacinto Peak from The Tramway', 0.9936451551696017),
 ('Mount Shasta via Avalanche Gulch Route', 0.9924709964694631),
 ('Mount San Antonio via Baldy Bowl Trail', 0.991771347725399),
 ('Vivian Creek Trail to San Gorgonio Peak', 0.991596806534552),
 ('Skyline Trail to Aerial Tramway Top Station', 0.9895521092768316),
 ('San Jacinto Peak Middle Route Trail', 0.989002597111029)]

In [32]:
cosine_sim_more = cosine_similarity(trail_cosine_df,trail_cosine_df)

In [45]:
# CV - LDA with new features (elevation,distance,difficulty)
recommend("Mount Whitney Trail", 10, cosine_sim=cosine_sim_more, df=trail_cosine_df)

[('Clouds Rest Trail from Yosemite Valley', 0.9984544542112237, '4'),
 ('Mount Muir via the Mount Whitney Trail', 0.9982548279292811, '4'),
 ('Vivian Creek Trail to San Gorgonio Peak', 0.997485354725001, '4'),
 ('Half Dome Trail', 0.9967095300992956, '4'),
 ('Skyline Trail to Aerial Tramway Top Station', 0.9961144613913573, '4'),
 ('San Jacinto Peak from The Tramway', 0.9960818978739187, '4'),
 ('Mount San Antonio via Baldy Bowl Trail', 0.9937551256418606, '4'),
 ('San Jacinto Peak Middle Route Trail', 0.9931565724694243, '4'),
 ('Mount Shasta via Avalanche Gulch Route', 0.9928907019637475, '4')]

In [37]:
recommend("Tanbark Trail and Tin House", 10, cosine_sim=cosine_sim_more, df=trail_cosine_df)

[('Soberanes Canyon Trail', 0.9993926531834374, 'topic_2'),
 ('Point Sal Overlook', 0.9983932861699566, 'topic_2'),
 ('Juniper to Eagle Peak to Summit Loop', 0.99609200547144, 'topic_2'),
 ('Mori Point and Sweeney Ridge Loop Trail via Skyline College',
  0.9949549008893087,
  'topic_2'),
 ('Black Rock Canyon Trail to Warren Peak', 0.9935367730147641, 'topic_2'),
 ('Whittemore Gulch and Harkins Ridge Loop Trail',
  0.9914195587043442,
  'topic_2'),
 ('Condor Gulch Trail to High Peaks Trail Loop',
  0.9899806240659736,
  'topic_2'),
 ('Tennesse Valley to Muir Beach and Fox Trail',
  0.9876294430189846,
  'topic_2'),
 ('Rodeo Valley Trail To Miwok Trail Loop Via Coastal Trail',
  0.9856947839475881,
  'topic_2')]