In [1]:
import pandas as pd
import numpy as np

import warnings

with warnings.catch_warnings():
    warnings.filterwarnings("ignore",category=DeprecationWarning)
    import nltk
    from nltk.tokenize import RegexpTokenizer
    from nltk.stem.wordnet import WordNetLemmatizer
    from nltk.corpus import stopwords
    from nltk import FreqDist

    import gensim
    from gensim import corpora, models, similarities
    import pyLDAvis
    import pyLDAvis.gensim

  from collections import Sequence
  from collections import Mapping, Set, Iterable, Iterator, defaultdict
  from collections import Mapping, Set, Iterable, Iterator, defaultdict
  from collections import Hashable


In [2]:
cities_raw = pd.read_csv('cities_geo2_df.csv', index_col=0)
cities = cities_raw.loc[:,:'Get out'] # Just text portions of the df

# Reset index
cities = cities.reset_index()
cities.drop(['index'], axis=1, inplace=True)

In [3]:
cities.head()

Unnamed: 0,Country,City,Understand,Get in,Get around,See,Do,Buy,Eat,Sleep,Cope,Stay safe,Get out
0,Afghanistan,Herat,Herat is the second largest city in Afghanista...,The Herat International Airport is situated 15...,,"Ghala Ekhteyaradin, Takht Safar, Bagh Milat, ...",Take a shower in the huge subterranean Hammams.,There are several antique shops on the north ...,Herat has a mixture of traditional and modern ...,"Budget[edit] Mowafaq Hotel, northeast corner ...",,Herat is one of the safer cities in Afghanista...,Chisht-i-Sharif is some 177 km from Herat city...
1,Afghanistan,Kabul,Kabul is a very historic city of the region th...,By plane[edit] Kabul International Airport (IA...,Maps of Kabul are available from Afghanistan I...,Bagh-e Babur (Gardens of Babur). The gardens ...,Kabul Wall. A pleasant hike with rewarding vi...,The Share-e Naw area has some shops. The Kab...,The once thriving restaurant scene in Kabul ha...,"Kabul is not a cheap place to stay, principall...",Read the Scene magazine for restaurant reviews...,Kabul is generally considered one of the safer...,Most expats take any opportunity they can to l...
2,Afghanistan,Jalalabad,,Jalalabad is on the major highway that links K...,,The large reservoir near Sarobi. This is loca...,Go fishing along the river side. Go swimming .,Handi Craft Handicraft is one of most famous ...,Pakora go for special “Pakora” Food in Hindu ...,The government hotel Spingar is on the easter...,,,This article is an outline and needs more co...
3,Afghanistan,Kandahar,Kandahār or Qandahār (Pashto: کندهار ) (Persia...,By plane[edit] Kandahar International Airport ...,,"Kandahar Museum, (Western end of the Eidgah ...",,"Afghanistan International Bank (AIB), Kabul Ba...","There are plenty of food choices in Kandahar, ...","Budget[edit] Armani Hotel, (Two miles from c...",,"As of 2013, the city centre of Kandahar is qui...",Kabul Helmand Harat Zabul Spin Boldak (Spin Bu...
4,Afghanistan,Kunduz,,Most visitors will arrive by road from Pol-e K...,"The city is fairly small. The local roads, li...",There's not much for tourists here.,,,The usual Afghan fare - kebab or lamb with ric...,,,,This article is an outline and needs more ...


In [4]:
# Run once
# nltk.download('wordnet')
# nltk.download('stopwords')
# nltk.download('averaged_perceptron_tagger')

In [5]:
tokenizer = RegexpTokenizer(r'\w+') # retains alphanumeric character
lmtzr = WordNetLemmatizer()
cached_stop_words = stopwords.words("english")

In [6]:
def lemmatize_dropstop_words(text): 
    return [lmtzr.lemmatize(word) for word in text if lmtzr.lemmatize(word) not in cached_stop_words]

In [7]:
cities = cities.replace(np.nan, '', regex=True)

In [8]:
cities.head()

Unnamed: 0,Country,City,Understand,Get in,Get around,See,Do,Buy,Eat,Sleep,Cope,Stay safe,Get out
0,Afghanistan,Herat,Herat is the second largest city in Afghanista...,The Herat International Airport is situated 15...,,"Ghala Ekhteyaradin, Takht Safar, Bagh Milat, ...",Take a shower in the huge subterranean Hammams.,There are several antique shops on the north ...,Herat has a mixture of traditional and modern ...,"Budget[edit] Mowafaq Hotel, northeast corner ...",,Herat is one of the safer cities in Afghanista...,Chisht-i-Sharif is some 177 km from Herat city...
1,Afghanistan,Kabul,Kabul is a very historic city of the region th...,By plane[edit] Kabul International Airport (IA...,Maps of Kabul are available from Afghanistan I...,Bagh-e Babur (Gardens of Babur). The gardens ...,Kabul Wall. A pleasant hike with rewarding vi...,The Share-e Naw area has some shops. The Kab...,The once thriving restaurant scene in Kabul ha...,"Kabul is not a cheap place to stay, principall...",Read the Scene magazine for restaurant reviews...,Kabul is generally considered one of the safer...,Most expats take any opportunity they can to l...
2,Afghanistan,Jalalabad,,Jalalabad is on the major highway that links K...,,The large reservoir near Sarobi. This is loca...,Go fishing along the river side. Go swimming .,Handi Craft Handicraft is one of most famous ...,Pakora go for special “Pakora” Food in Hindu ...,The government hotel Spingar is on the easter...,,,This article is an outline and needs more co...
3,Afghanistan,Kandahar,Kandahār or Qandahār (Pashto: کندهار ) (Persia...,By plane[edit] Kandahar International Airport ...,,"Kandahar Museum, (Western end of the Eidgah ...",,"Afghanistan International Bank (AIB), Kabul Ba...","There are plenty of food choices in Kandahar, ...","Budget[edit] Armani Hotel, (Two miles from c...",,"As of 2013, the city centre of Kandahar is qui...",Kabul Helmand Harat Zabul Spin Boldak (Spin Bu...
4,Afghanistan,Kunduz,,Most visitors will arrive by road from Pol-e K...,"The city is fairly small. The local roads, li...",There's not much for tourists here.,,,The usual Afghan fare - kebab or lamb with ric...,,,,This article is an outline and needs more ...


In [9]:
# Tokenize, lemmatize
cities['Understand_tokens'] = cities['Understand'].str.lower().apply(tokenizer.tokenize).apply(lemmatize_dropstop_words)
cities['See_tokens'] = cities['See'].str.lower().apply(tokenizer.tokenize).apply(lemmatize_dropstop_words)
cities['Do_tokens'] = cities['Do'].str.lower().apply(tokenizer.tokenize).apply(lemmatize_dropstop_words)
cities['Buy_tokens'] = cities['Buy'].str.lower().apply(tokenizer.tokenize).apply(lemmatize_dropstop_words)
cities['Eat_tokens'] = cities['Eat'].str.lower().apply(tokenizer.tokenize).apply(lemmatize_dropstop_words)

In [10]:
cities.head()

Unnamed: 0,Country,City,Understand,Get in,Get around,See,Do,Buy,Eat,Sleep,Cope,Stay safe,Get out,Understand_tokens,See_tokens,Do_tokens,Buy_tokens,Eat_tokens
0,Afghanistan,Herat,Herat is the second largest city in Afghanista...,The Herat International Airport is situated 15...,,"Ghala Ekhteyaradin, Takht Safar, Bagh Milat, ...",Take a shower in the huge subterranean Hammams.,There are several antique shops on the north ...,Herat has a mixture of traditional and modern ...,"Budget[edit] Mowafaq Hotel, northeast corner ...",,Herat is one of the safer cities in Afghanista...,Chisht-i-Sharif is some 177 km from Herat city...,"[herat, second, largest, city, afghanistan, lo...","[ghala, ekhteyaradin, takht, safar, bagh, mila...","[take, shower, huge, subterranean, hammams]","[several, antique, shop, north, side, mosque, ...","[herat, ha, mixture, traditional, modern, food..."
1,Afghanistan,Kabul,Kabul is a very historic city of the region th...,By plane[edit] Kabul International Airport (IA...,Maps of Kabul are available from Afghanistan I...,Bagh-e Babur (Gardens of Babur). The gardens ...,Kabul Wall. A pleasant hike with rewarding vi...,The Share-e Naw area has some shops. The Kab...,The once thriving restaurant scene in Kabul ha...,"Kabul is not a cheap place to stay, principall...",Read the Scene magazine for restaurant reviews...,Kabul is generally considered one of the safer...,Most expats take any opportunity they can to l...,"[kabul, historic, city, region, wa, built, alm...","[bagh, e, babur, garden, babur, garden, surrou...","[kabul, wall, pleasant, hike, rewarding, view,...","[share, e, naw, area, ha, shop, kabul, city, c...","[thriving, restaurant, scene, kabul, ha, suffe..."
2,Afghanistan,Jalalabad,,Jalalabad is on the major highway that links K...,,The large reservoir near Sarobi. This is loca...,Go fishing along the river side. Go swimming .,Handi Craft Handicraft is one of most famous ...,Pakora go for special “Pakora” Food in Hindu ...,The government hotel Spingar is on the easter...,,,This article is an outline and needs more co...,[],"[large, reservoir, near, sarobi, located, way,...","[go, fishing, along, river, side, go, swimming]","[handi, craft, handicraft, one, famous, produc...","[pakora, go, special, pakora, food, hindu, str..."
3,Afghanistan,Kandahar,Kandahār or Qandahār (Pashto: کندهار ) (Persia...,By plane[edit] Kandahar International Airport ...,,"Kandahar Museum, (Western end of the Eidgah ...",,"Afghanistan International Bank (AIB), Kabul Ba...","There are plenty of food choices in Kandahar, ...","Budget[edit] Armani Hotel, (Two miles from c...",,"As of 2013, the city centre of Kandahar is qui...",Kabul Helmand Harat Zabul Spin Boldak (Spin Bu...,"[kandahār, qandahār, pashto, کندهار, persian, ...","[kandahar, museum, western, end, eidgah, durwa...",[],"[afghanistan, international, bank, aib, kabul,...","[plenty, food, choice, kandahar, however, serf..."
4,Afghanistan,Kunduz,,Most visitors will arrive by road from Pol-e K...,"The city is fairly small. The local roads, li...",There's not much for tourists here.,,,The usual Afghan fare - kebab or lamb with ric...,,,,This article is an outline and needs more ...,[],"[much, tourist]",[],[],"[usual, afghan, fare, kebab, lamb, rice, river..."


In [11]:
# Most common words
words = []
for sublist in cities['See_tokens']:
    for item in sublist:
        words.append(item)
        
fdist = FreqDist(words)
fdist.most_common(50)

[('museum', 9761),
 ('edit', 9639),
 ('city', 6420),
 ('park', 5681),
 ('wa', 5456),
 ('one', 4413),
 ('ha', 4158),
 ('also', 3997),
 ('art', 3744),
 ('building', 3690),
 ('temple', 3563),
 ('1', 3229),
 ('located', 2933),
 ('house', 2858),
 ('church', 2801),
 ('built', 2800),
 ('area', 2800),
 ('street', 2716),
 ('old', 2699),
 ('place', 2636),
 ('free', 2614),
 ('garden', 2377),
 ('00', 2338),
 ('many', 2325),
 ('see', 2259),
 ('century', 2224),
 ('2', 2122),
 ('well', 2083),
 ('bus', 2076),
 ('5', 2068),
 ('take', 2055),
 ('town', 2025),
 ('de', 1998),
 ('st', 1996),
 ('10', 1980),
 ('open', 1970),
 ('world', 1966),
 ('year', 1901),
 ('around', 1900),
 ('center', 1879),
 ('history', 1827),
 ('station', 1804),
 ('view', 1753),
 ('day', 1751),
 ('road', 1743),
 ('collection', 1741),
 ('small', 1738),
 ('3', 1730),
 ('square', 1713),
 ('tour', 1700)]

In [12]:
# NLP

# Create dictionaries of unique words in the processed tokenized text
See_dict = corpora.Dictionary(cities['See_tokens'])

In [13]:
# Convert texts to vectors
See_corpus = [See_dict.doc2bow(text) for text in cities['See_tokens']]

In [14]:
# LDA
See_lda = gensim.models.ldamodel.LdaModel(corpus=See_corpus,
                                               id2word=See_dict,
                                               num_topics=30, 
                                               random_state=100,
                                               update_every=1,
                                               chunksize=1000,
                                               passes=1,
                                               alpha='auto',
                                               eta='auto')

In [15]:
# Visualize LDA results
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(See_lda, See_corpus, See_dict)
vis

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [16]:
# Recommendation

# Create vectors that represent topic distribution of each lyric or comment
See_topic_vectors = See_lda[See_corpus]

In [17]:
# Search cities

cities[cities['City'] == 'Beijing']

Unnamed: 0,Country,City,Understand,Get in,Get around,See,Do,Buy,Eat,Sleep,Cope,Stay safe,Get out,Understand_tokens,See_tokens,Do_tokens,Buy_tokens,Eat_tokens
321,China,Beijing,History[edit] Beijing literally means Northern...,By plane[edit] Beijing is generally served by ...,A note on maps Beijing is changing at such...,See the Districts articles for individual list...,See the Districts articles for individual list...,See the Districts articles for individual list...,See the Districts articles for individual list...,See the Districts articles for individual list...,Good bilingual maps are hard to find in Beijin...,"Crime[edit] Overall, Beijing is a very safe ci...",Long distance cyclist-tourists will find natio...,"[history, edit, beijing, literally, mean, nort...","[see, district, article, individual, listing, ...","[see, district, article, individual, listing, ...","[see, district, article, individual, listing, ...","[see, district, article, individual, listing, ..."


In [18]:
input_city = 321

In [19]:
See_sims = []
for i in np.arange(len(See_topic_vectors)):
    sim = gensim.matutils.cossim(See_topic_vectors[input_city], See_topic_vectors[i])
    See_sims.append(sim)
cities['Similarity to input'] = See_sims

In [20]:
cities.sort_values(by=['Similarity to input'], ascending = False).head(10)

Unnamed: 0,Country,City,Understand,Get in,Get around,See,Do,Buy,Eat,Sleep,Cope,Stay safe,Get out,Understand_tokens,See_tokens,Do_tokens,Buy_tokens,Eat_tokens,Similarity to input
321,China,Beijing,History[edit] Beijing literally means Northern...,By plane[edit] Beijing is generally served by ...,A note on maps Beijing is changing at such...,See the Districts articles for individual list...,See the Districts articles for individual list...,See the Districts articles for individual list...,See the Districts articles for individual list...,See the Districts articles for individual list...,Good bilingual maps are hard to find in Beijin...,"Crime[edit] Overall, Beijing is a very safe ci...",Long distance cyclist-tourists will find natio...,"[history, edit, beijing, literally, mean, nort...","[see, district, article, individual, listing, ...","[see, district, article, individual, listing, ...","[see, district, article, individual, listing, ...","[see, district, article, individual, listing, ...",0.999994
1076,India,Patan,,"Micro Bus from Ratnapark in Kathmandu (12 rps,...",The city and its sites are best seen on foot. ...,Durbar Square[edit] NOTICE: Many of the temple...,Just wander around keeping your eyes open for ...,Wooden and Metal Handicrafts. There are many m...,"Cafe de Patan - a pleasant, centraly located r...",While most tourists stay in hotels in Kathmand...,,,Kirtipur is one of the oldest settlements in ...,[],"[durbar, square, edit, notice, many, temple, l...","[wander, around, keeping, eye, open, amazing, ...","[wooden, metal, handicraft, many, metalworking...","[cafe, de, patan, pleasant, centraly, located,...",0.989097
1122,India,Thiruvananthapuram,Sanskrit History[edit] Thiruvananthapuram is...,By plane[edit] Trivandrum International Airpor...,By bus[edit] Malayalam East Fort Thampanoor By...,"Thiruvananthapuram is a historic city, dotted ...",Stroll around the town during the dusk time w...,Haggling is quite acceptable and very much nee...,This guide uses the following price ranges ...,Unless you are in Trivandrum on a business tri...,Tourist Information[edit] There is a tourist i...,Trivandrum is generally a safe city.Although y...,Ponmudi - A pleasant hill resort 61 km away f...,"[sanskrit, history, edit, thiruvananthapuram, ...","[thiruvananthapuram, historic, city, dotted, m...","[stroll, around, town, dusk, time, city, come,...","[haggling, quite, acceptable, much, needed, bu...","[guide, us, following, price, range, typical, ...",0.986906
320,China,Anyang,Anyang is 460 km south of Beijing and is a pre...,"By bus or train. From Beijing, about 3.5 hour...","Bus Y1 runs between Anyang East Train Station,...","Yin Ruins (﻿殷墟; Yīnxū), (taxi fare from down...",,,,"Super 8 Hotel (速8酒店), (next to the ticket ha...",,,This article is an outline and needs more ...,"[anyang, 460, km, south, beijing, prefecture, ...","[yin, ruin, 殷墟, yīnxū, taxi, fare, downtown, 1...",[],[],[],0.968836
678,Côte d'Ivoire,Yamoussoukro,,Every UOB bus from Abijan to Bouake or Man sto...,,"The public gardens, the Fondation Felix Houpho...",,,Chez Georges Hollywood. The Chez Georges Holly...,"Hotel President. edit Hotel Villa des Hautes,...",,,This article is an outline and needs more co...,[],"[public, garden, fondation, felix, houphouet, ...",[],[],"[chez, george, hollywood, chez, george, hollyw...",0.968624
1034,India,Kottayam,"Kottayam is a town in south-central Kerala, sa...",By plane[edit] Kochi International Airport [4]...,Bus - Kottayam has a comprehenhsive local bus...,"Aruvikkuzhi Waterfalls, (18 km from Kottayam...",Boat Races. Lots of boat races with tradition...,,There are numerous restaurants and cafes in Ko...,There are a number of hotels of varying qualit...,,,,"[kottayam, town, south, central, kerala, sandw...","[aruvikkuzhi, waterfall, 18, km, kottayam, tow...","[boat, race, lot, boat, race, traditional, boa...",[],"[numerous, restaurant, cafe, kottayam, town, l...",0.964685
854,Ghana,Kumasi,Kumasi is considered the home of the Ashanti K...,Kumasi is a cosmopolitan city with a good road...,There are many ways to get around Kumasi. All...,Asantehene's Palace. A visit to this former k...,A tour of the Palace grounds explains the hist...,Kumasi has many craft villages nearby. Asoafua...,Budget[edit] Mid-range[edit] Splurge[edit] O'N...,"TUMI Hostel, (Located in Asokwa- Landmark Fou...",,,This is a usable article. It has informatio...,"[kumasi, considered, home, ashanti, king, curr...","[asantehene, palace, visit, former, king, resi...","[tour, palace, ground, explains, history, asha...","[kumasi, ha, many, craft, village, nearby, aso...","[budget, edit, mid, range, edit, splurge, edit...",0.963286
525,China,Tianjin,Climate Jan Feb Mar Apr May Jun Jul ...,By plane[edit] Tianjin Binhai International Ai...,"By bus[edit] Founded in 1904, the Tianjin bus ...",Buildings[edit] The Five Avenues (五大道; Wǔdàdà...,There are a number of attractions of interest ...,Tianjin has both modern shopping malls and dis...,Tianjin cuisine places a heavy focus on seafoo...,"Tianjin Saixiang Hotel (天津赛象酒店), NO.8, Meiyuan...",,Tianjin is known to be as safe as any city in ...,This is a usable article. It has inform...,"[climate, jan, feb, mar, apr, may, jun, jul, a...","[building, edit, five, avenue, 五大道, wǔdàdào, l...","[number, attraction, interest, foreign, travel...","[tianjin, ha, modern, shopping, mall, distinct...","[tianjin, cuisine, place, heavy, focus, seafoo...",0.957724
899,India,Ahmednagar,,Ahmednagar is located in the western central p...,"Walking, bicycling, auto rickshaws (cost money...",Ahmednagar Fort Built by Ahmed Nizam Shah in 1...,Eat at local dhabas or on way form Pune at sar...,"Clothes S k Nari Fashion (Babulal Dembla), ma...","Elements, Iris Premiere, Station Road Little...","Hotel Iris Premiere, Station Road url=""http:/...",,,Pune Aurangabad Paithan Nashik This ar...,[],"[ahmednagar, fort, built, ahmed, nizam, shah, ...","[eat, local, dhabas, way, form, pune, saradwad...","[clothes, k, nari, fashion, babulal, dembla, m...","[element, iris, premiere, station, road, littl...",0.937979
1093,India,Ranchi,"Ranchi is located on the Chotanagpur plateau, ...",By air[edit] The Ranchi Airport (Code: IXR) is...,"Hired cars, autorickshaws and cycle rickshaws ...",Kanke Dam Dhurva Dam- A reservoir on the out...,,Firayalal [5] Kashmir Vastralaya Big Shop [...,"On Main Road[edit] Kaveri Restaurant, GEL Chu...",Hotel Rajdhani Plaza Church Road 09771490335[...,,Do not be carried away by media coverage of st...,"Hundru Falls – about 28 km from Ranchi, on Su...","[ranchi, located, chotanagpur, plateau, height...","[kanke, dam, dhurva, dam, reservoir, outskirt,...",[],"[firayalal, 5, kashmir, vastralaya, big, shop,...","[main, road, edit, kaveri, restaurant, gel, ch...",0.931145
