In [1]:
import pandas as pd
import numpy as np

import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk import FreqDist

  from collections import Sequence


In [2]:
cities_raw = pd.read_csv('cities_geo2_df.csv', index_col=0)
cities = cities_raw.loc[:,:'Get out'] # Just text portions of the df

In [3]:
cities.head()

Unnamed: 0,Country,City,Understand,Get in,Get around,See,Do,Buy,Eat,Sleep,Cope,Stay safe,Get out
0,Afghanistan,Herat,Herat is the second largest city in Afghanista...,The Herat International Airport is situated 15...,,"Ghala Ekhteyaradin, Takht Safar, Bagh Milat, ...",Take a shower in the huge subterranean Hammams.,There are several antique shops on the north ...,Herat has a mixture of traditional and modern ...,"Budget[edit] Mowafaq Hotel, northeast corner ...",,Herat is one of the safer cities in Afghanista...,Chisht-i-Sharif is some 177 km from Herat city...
1,Afghanistan,Kabul,Kabul is a very historic city of the region th...,By plane[edit] Kabul International Airport (IA...,Maps of Kabul are available from Afghanistan I...,Bagh-e Babur (Gardens of Babur). The gardens ...,Kabul Wall. A pleasant hike with rewarding vi...,The Share-e Naw area has some shops. The Kab...,The once thriving restaurant scene in Kabul ha...,"Kabul is not a cheap place to stay, principall...",Read the Scene magazine for restaurant reviews...,Kabul is generally considered one of the safer...,Most expats take any opportunity they can to l...
2,Afghanistan,Jalalabad,,Jalalabad is on the major highway that links K...,,The large reservoir near Sarobi. This is loca...,Go fishing along the river side. Go swimming .,Handi Craft Handicraft is one of most famous ...,Pakora go for special “Pakora” Food in Hindu ...,The government hotel Spingar is on the easter...,,,This article is an outline and needs more co...
3,Afghanistan,Kandahar,Kandahār or Qandahār (Pashto: کندهار ) (Persia...,By plane[edit] Kandahar International Airport ...,,"Kandahar Museum, (Western end of the Eidgah ...",,"Afghanistan International Bank (AIB), Kabul Ba...","There are plenty of food choices in Kandahar, ...","Budget[edit] Armani Hotel, (Two miles from c...",,"As of 2013, the city centre of Kandahar is qui...",Kabul Helmand Harat Zabul Spin Boldak (Spin Bu...
4,Afghanistan,Kunduz,,Most visitors will arrive by road from Pol-e K...,"The city is fairly small. The local roads, li...",There's not much for tourists here.,,,The usual Afghan fare - kebab or lamb with ric...,,,,This article is an outline and needs more ...


In [4]:
# Run once
# nltk.download('wordnet')
# nltk.download('stopwords')
# nltk.download('averaged_perceptron_tagger')

In [5]:
tokenizer = RegexpTokenizer(r'\w+') # retains alphanumeric character
lmtzr = WordNetLemmatizer()
cached_stop_words = stopwords.words("english")

In [6]:
def lemmatize_dropstop_words(text): 
    return [lmtzr.lemmatize(word) for word in text if lmtzr.lemmatize(word) not in cached_stop_words]

In [24]:
cities = cities.replace(np.nan, '', regex=True)

In [25]:
cities.head()

Unnamed: 0,Country,City,Understand,Get in,Get around,See,Do,Buy,Eat,Sleep,Cope,Stay safe,Get out,Understand_tokens
0,Afghanistan,Herat,Herat is the second largest city in Afghanista...,The Herat International Airport is situated 15...,,"Ghala Ekhteyaradin, Takht Safar, Bagh Milat, ...",Take a shower in the huge subterranean Hammams.,There are several antique shops on the north ...,Herat has a mixture of traditional and modern ...,"Budget[edit] Mowafaq Hotel, northeast corner ...",,Herat is one of the safer cities in Afghanista...,Chisht-i-Sharif is some 177 km from Herat city...,"[herat, second, largest, city, afghanistan, lo..."
1,Afghanistan,Kabul,Kabul is a very historic city of the region th...,By plane[edit] Kabul International Airport (IA...,Maps of Kabul are available from Afghanistan I...,Bagh-e Babur (Gardens of Babur). The gardens ...,Kabul Wall. A pleasant hike with rewarding vi...,The Share-e Naw area has some shops. The Kab...,The once thriving restaurant scene in Kabul ha...,"Kabul is not a cheap place to stay, principall...",Read the Scene magazine for restaurant reviews...,Kabul is generally considered one of the safer...,Most expats take any opportunity they can to l...,"[kabul, historic, city, region, wa, built, alm..."
2,Afghanistan,Jalalabad,,Jalalabad is on the major highway that links K...,,The large reservoir near Sarobi. This is loca...,Go fishing along the river side. Go swimming .,Handi Craft Handicraft is one of most famous ...,Pakora go for special “Pakora” Food in Hindu ...,The government hotel Spingar is on the easter...,,,This article is an outline and needs more co...,[]
3,Afghanistan,Kandahar,Kandahār or Qandahār (Pashto: کندهار ) (Persia...,By plane[edit] Kandahar International Airport ...,,"Kandahar Museum, (Western end of the Eidgah ...",,"Afghanistan International Bank (AIB), Kabul Ba...","There are plenty of food choices in Kandahar, ...","Budget[edit] Armani Hotel, (Two miles from c...",,"As of 2013, the city centre of Kandahar is qui...",Kabul Helmand Harat Zabul Spin Boldak (Spin Bu...,"[kandahār, qandahār, pashto, کندهار, persian, ..."
4,Afghanistan,Kunduz,,Most visitors will arrive by road from Pol-e K...,"The city is fairly small. The local roads, li...",There's not much for tourists here.,,,The usual Afghan fare - kebab or lamb with ric...,,,,This article is an outline and needs more ...,[]


In [26]:
# Tokenize, lemmatize
cities['Understand_tokens'] = cities['Understand'].str.lower().apply(tokenizer.tokenize).apply(lemmatize_dropstop_words)
cities['See_tokens'] = cities['See'].str.lower().apply(tokenizer.tokenize).apply(lemmatize_dropstop_words)
cities['Do_tokens'] = cities['Do'].str.lower().apply(tokenizer.tokenize).apply(lemmatize_dropstop_words)
cities['Buy_tokens'] = cities['Buy'].str.lower().apply(tokenizer.tokenize).apply(lemmatize_dropstop_words)
cities['Eat_tokens'] = cities['Eat'].str.lower().apply(tokenizer.tokenize).apply(lemmatize_dropstop_words)

In [27]:
cities.head()

Unnamed: 0,Country,City,Understand,Get in,Get around,See,Do,Buy,Eat,Sleep,Cope,Stay safe,Get out,Understand_tokens,See_tokens,Do_tokens,Buy_tokens,Eat_tokens
0,Afghanistan,Herat,Herat is the second largest city in Afghanista...,The Herat International Airport is situated 15...,,"Ghala Ekhteyaradin, Takht Safar, Bagh Milat, ...",Take a shower in the huge subterranean Hammams.,There are several antique shops on the north ...,Herat has a mixture of traditional and modern ...,"Budget[edit] Mowafaq Hotel, northeast corner ...",,Herat is one of the safer cities in Afghanista...,Chisht-i-Sharif is some 177 km from Herat city...,"[herat, second, largest, city, afghanistan, lo...","[ghala, ekhteyaradin, takht, safar, bagh, mila...","[take, shower, huge, subterranean, hammams]","[several, antique, shop, north, side, mosque, ...","[herat, ha, mixture, traditional, modern, food..."
1,Afghanistan,Kabul,Kabul is a very historic city of the region th...,By plane[edit] Kabul International Airport (IA...,Maps of Kabul are available from Afghanistan I...,Bagh-e Babur (Gardens of Babur). The gardens ...,Kabul Wall. A pleasant hike with rewarding vi...,The Share-e Naw area has some shops. The Kab...,The once thriving restaurant scene in Kabul ha...,"Kabul is not a cheap place to stay, principall...",Read the Scene magazine for restaurant reviews...,Kabul is generally considered one of the safer...,Most expats take any opportunity they can to l...,"[kabul, historic, city, region, wa, built, alm...","[bagh, e, babur, garden, babur, garden, surrou...","[kabul, wall, pleasant, hike, rewarding, view,...","[share, e, naw, area, ha, shop, kabul, city, c...","[thriving, restaurant, scene, kabul, ha, suffe..."
2,Afghanistan,Jalalabad,,Jalalabad is on the major highway that links K...,,The large reservoir near Sarobi. This is loca...,Go fishing along the river side. Go swimming .,Handi Craft Handicraft is one of most famous ...,Pakora go for special “Pakora” Food in Hindu ...,The government hotel Spingar is on the easter...,,,This article is an outline and needs more co...,[],"[large, reservoir, near, sarobi, located, way,...","[go, fishing, along, river, side, go, swimming]","[handi, craft, handicraft, one, famous, produc...","[pakora, go, special, pakora, food, hindu, str..."
3,Afghanistan,Kandahar,Kandahār or Qandahār (Pashto: کندهار ) (Persia...,By plane[edit] Kandahar International Airport ...,,"Kandahar Museum, (Western end of the Eidgah ...",,"Afghanistan International Bank (AIB), Kabul Ba...","There are plenty of food choices in Kandahar, ...","Budget[edit] Armani Hotel, (Two miles from c...",,"As of 2013, the city centre of Kandahar is qui...",Kabul Helmand Harat Zabul Spin Boldak (Spin Bu...,"[kandahār, qandahār, pashto, کندهار, persian, ...","[kandahar, museum, western, end, eidgah, durwa...",[],"[afghanistan, international, bank, aib, kabul,...","[plenty, food, choice, kandahar, however, serf..."
4,Afghanistan,Kunduz,,Most visitors will arrive by road from Pol-e K...,"The city is fairly small. The local roads, li...",There's not much for tourists here.,,,The usual Afghan fare - kebab or lamb with ric...,,,,This article is an outline and needs more ...,[],"[much, tourist]",[],[],"[usual, afghan, fare, kebab, lamb, rice, river..."


In [34]:
# Most common words
words = []
for sublist in cities['See_tokens']:
    for item in sublist:
        words.append(item)
        
fdist = FreqDist(words)
fdist.most_common(50)

[('museum', 9761),
 ('edit', 9639),
 ('city', 6420),
 ('park', 5681),
 ('wa', 5456),
 ('one', 4413),
 ('ha', 4158),
 ('also', 3997),
 ('art', 3744),
 ('building', 3690),
 ('temple', 3563),
 ('1', 3229),
 ('located', 2933),
 ('house', 2858),
 ('church', 2801),
 ('built', 2800),
 ('area', 2800),
 ('street', 2716),
 ('old', 2699),
 ('place', 2636),
 ('free', 2614),
 ('garden', 2377),
 ('00', 2338),
 ('many', 2325),
 ('see', 2259),
 ('century', 2224),
 ('2', 2122),
 ('well', 2083),
 ('bus', 2076),
 ('5', 2068),
 ('take', 2055),
 ('town', 2025),
 ('de', 1998),
 ('st', 1996),
 ('10', 1980),
 ('open', 1970),
 ('world', 1966),
 ('year', 1901),
 ('around', 1900),
 ('center', 1879),
 ('history', 1827),
 ('station', 1804),
 ('view', 1753),
 ('day', 1751),
 ('road', 1743),
 ('collection', 1741),
 ('small', 1738),
 ('3', 1730),
 ('square', 1713),
 ('tour', 1700)]

In [35]:
# NLP