In [1]:
import pandas as pd
import numpy as np
import re
import seaborn as sns
import matplotlib.pyplot as plt
import time
from IPython.display import clear_output
%matplotlib inline  

import warnings

with warnings.catch_warnings():
    warnings.filterwarnings("ignore",category=DeprecationWarning)
    import nltk
    from nltk.tokenize import RegexpTokenizer
    from nltk.stem.wordnet import WordNetLemmatizer
    from nltk.corpus import stopwords, words
    from nltk import FreqDist

    import gensim
    from gensim import corpora, models, similarities
    import pyLDAvis
    import pyLDAvis.gensim

  from collections import Sequence
  from collections import Mapping, Set, Iterable, Iterator, defaultdict
  from collections import Mapping, Set, Iterable, Iterator, defaultdict
  from collections import Hashable


In [2]:
# Load the city table; the CSV's first column is read as the index so it does
# not show up as a data column.
cities = pd.read_csv('cities_geo2_df.csv', index_col=0)

# Reset index
# drop=True discards the old index directly. The previous reset_index() +
# drop(['index']) pair only worked when the index was unnamed, and would have
# removed a genuine column if one happened to be called 'index'.
cities = cities.reset_index(drop=True)

In [3]:
#cities.head()

In [4]:
# Run once
# nltk.download('wordnet')
# nltk.download('stopwords')
# nltk.download('averaged_perceptron_tagger')
# nltk.download('words')

In [5]:
# Shared text-processing objects used by the tokenising cells below.
# The pattern \w+ matches runs of word characters (letters, digits, underscore),
# so punctuation and whitespace act as separators and are discarded.
tokenizer = RegexpTokenizer(r'\w+') # keeps runs of word characters only
# WordNet-based lemmatizer; called with its default part of speech.
lmtzr = WordNetLemmatizer()
# NLTK's English stop-word list; a later cell extends this same list object
# with project-specific stop words.
cached_stop_words = stopwords.words("english")

In [37]:
_ENGLISH_VOCAB = None  # filled on first use by _english_vocab()


def _english_vocab():
    """Return the NLTK English word list as a frozenset, building it only once."""
    global _ENGLISH_VOCAB
    if _ENGLISH_VOCAB is None:
        # Imported locally under an alias so that a notebook variable named
        # `words` (one is created in the "Most common words" cell) cannot
        # shadow the corpus reader and break this function on re-run.
        from nltk.corpus import words as nltk_words
        _ENGLISH_VOCAB = frozenset(nltk_words.words())
    return _ENGLISH_VOCAB


def lemmatize_dropstop_words(text):
    """Lemmatize a list of tokens, keeping only useful English words.

    A token is kept when it is longer than two characters, is present in the
    NLTK English word list, and its lemma is not in ``cached_stop_words``.

    Parameters
    ----------
    text : iterable of str
        Tokens of one document (already lower-cased and tokenized).

    Returns
    -------
    list of str
        The lemmas of the tokens that passed all three filters, in input order.
    """
    vocab = _english_vocab()
    # Snapshot the stop words per call: the list is extended in a later cell,
    # so it must not be frozen at definition time. A set makes each membership
    # test O(1) instead of a scan of the list.
    stop_set = set(cached_stop_words)

    # Progress display: show the document being processed, replacing the
    # previous one in the cell output.
    print(text)
    clear_output(wait=True)

    kept = []
    for word in text:
        # Cheap filters first so the lemmatizer runs only on candidate words.
        if len(word) <= 2 or word not in vocab:
            continue
        lemma = lmtzr.lemmatize(word)  # lemmatize once, reuse for test and result
        if lemma not in stop_set:
            kept.append(lemma)
    return kept

def remove_num(text):
    """Replace every run of digits in ``text`` with a single space.

    Parameters
    ----------
    text : str
        Input string.

    Returns
    -------
    str
        ``text`` with each maximal digit sequence replaced by ``" "``.
    """
    # Raw string: "\d" in a normal string literal is an invalid escape sequence
    # that newer Python versions warn about.
    return re.sub(r"\d+", " ", text)

In [34]:
# Replace missing values with empty strings so the .str accessor and the
# tokenizer can run on every row. fillna is the direct way to do this; the
# previous replace(np.nan, '', regex=True) passed a regex flag that has no
# meaning for a NaN pattern.
cities = cities.fillna('')

In [35]:
# Add my stop words from txt file
# atleast_1d + ravel keeps the result a flat list of strings whether the file
# holds one word (genfromtxt returns a 0-d array, whose tolist() is a bare str
# that extend() would split into characters), one line, or several lines.
my_stop_words = [
    word.strip()
    for word in np.atleast_1d(
        np.genfromtxt('my_stop_words.txt', dtype='str', delimiter=', ')
    ).ravel().tolist()
    if word.strip()
]
# Only append words that are not already present, so re-running this cell does
# not keep growing the shared stop-word list with duplicates.
already_known = set(cached_stop_words)
cached_stop_words.extend(
    word for word in dict.fromkeys(my_stop_words) if word not in already_known
)

In [36]:
# Lower-case the 'See' text, split it into word tokens, then lemmatize and filter
cities['See_tokens'] = (
    cities['See']
    .str.lower()
    .apply(tokenizer.tokenize)
    .apply(lemmatize_dropstop_words)
)

ghala
bagh
the


KeyboardInterrupt: 

In [None]:
# Lower-case the 'Do' text, split it into word tokens, then lemmatize and filter
cities['Do_tokens'] = (
    cities['Do']
    .str.lower()
    .apply(tokenizer.tokenize)
    .apply(lemmatize_dropstop_words)
)

In [None]:
# Pool tokens
def pool_tokens(row):
    """Return the row's 'See_tokens' followed by its 'Do_tokens' as one sequence."""
    see_part = row['See_tokens']
    do_part = row['Do_tokens']
    return see_part + do_part

# Row-wise: combine each city's See and Do tokens into a single token list
cities['Pooled_tokens'] = cities.apply(pool_tokens, axis='columns')

In [None]:
# Most common words
# Flatten every city's pooled tokens into one list. The list is deliberately
# not called `words`: that name is the nltk.corpus.words reader imported in the
# first cell, and rebinding it breaks any re-run of the tokenising cells.
all_tokens = [token for tokens in cities['Pooled_tokens'] for token in tokens]

fdist = FreqDist(all_tokens)
fdist.most_common(20)

In [None]:
# Persist the processed frame, including its final index, for later stages
processed_path = 'cities_text_processed_df.csv'
cities.to_csv(processed_path)

# LDA model

In [None]:
# NLP

# Build the token <-> integer id vocabulary from every city's pooled tokens
Pooled_dict = corpora.Dictionary(documents=cities['Pooled_tokens'])

In [None]:
# Filter extreme values
# Mutates Pooled_dict in place. Only no_below is passed, so every other
# threshold (including no_above, see the disabled argument) is gensim's default.
# NOTE(review): per the gensim docs no_below keeps tokens found in at least
# that many documents — confirm against the installed version.
Pooled_dict.filter_extremes(no_below=15)#, no_above=0.5)

In [None]:
# Encode each city's token list as a bag-of-words vector over Pooled_dict
Pooled_corpus = list(map(Pooled_dict.doc2bow, cities['Pooled_tokens']))

In [None]:
# LDA
# Training hyper-parameters, gathered in one place so they are easy to scan
lda_settings = dict(
    num_topics=50,
    random_state=100,
    update_every=1,
    chunksize=1000,
    passes=1,
    alpha='auto',
    eta='auto',
)
Pooled_lda = models.LdaModel(corpus=Pooled_corpus, id2word=Pooled_dict, **lda_settings)

# Interactive topic visualisation, rendered inline as the cell's output
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(Pooled_lda, Pooled_corpus, Pooled_dict)
vis

In [None]:
# Recommendation

# Topic distribution of each city's pooled See/Do text under the LDA model:
# one entry per document of Pooled_corpus, in the same order as the rows of `cities`.
Pooled_topic_vectors = Pooled_lda[Pooled_corpus]

In [None]:
# Look up the row(s) for a city by exact name
cities.loc[cities['City'].eq('New York City')]

In [None]:
# Position of the query city, used to index the topic-vector corpora below.
# NOTE(review): presumably the row of 'New York City' returned by the lookup
# cell above — confirm; a hard-coded position goes stale if the input CSV changes.
input_city = 2527

In [None]:
# Cosine similarity between the query city's topic vector and every city's.
# The query vector is fetched once: Pooled_topic_vectors is the lazy result of
# Pooled_lda[Pooled_corpus], so indexing it inside the loop repeated the same
# lookup (and the model work behind it) on every iteration.
query_vector = Pooled_topic_vectors[input_city]
Pooled_sims = [
    gensim.matutils.cossim(query_vector, city_vector)
    for city_vector in Pooled_topic_vectors
]
cities['Similarity to input'] = Pooled_sims

In [None]:
#cities.sort_values(by=['Similarity to input'], ascending = False).head(10)

In [None]:
# Calculating all pairwise cosine similarities
# The original row-by-row version took ~10 hours; set RERUN = True to recompute,
# otherwise the cached CSV is loaded.
RERUN = False

if RERUN:
    start_time = time.time()

    # Materialise the topic vectors once. Pooled_topic_vectors is the lazy
    # result of Pooled_lda[Pooled_corpus]; indexing it inside the double loop
    # repeated the lookup for the same documents over and over.
    topic_vectors = list(Pooled_topic_vectors)
    n_docs = len(topic_vectors)

    # Preallocate the full matrix instead of growing a DataFrame with
    # pd.concat on every iteration (which copies all previous columns each time).
    cosims = np.empty((n_docs, n_docs))
    for i in range(n_docs):
        print(i)  # progress indicator, overwritten in place
        clear_output(wait=True)
        # Cosine similarity is symmetric: compute the upper triangle only and
        # mirror each value into the lower triangle.
        for j in range(i, n_docs):
            sim = gensim.matutils.cossim(topic_vectors[i], topic_vectors[j])
            cosims[i, j] = sim
            cosims[j, i] = sim

    # Rows and columns are both labelled 0..n_docs-1.
    cosims_df = pd.DataFrame(cosims)

    elapsed_time = time.time() - start_time
    print(elapsed_time)
    cosims_df.to_csv('data/cos_sims.csv')
    cosims_stacked = cosims_df.stack()
    cosims_stacked.to_csv('data/cos_sims_stacked.csv') # Save a stacked version that can be put on SQL server
else:
    # NOTE: column labels read back from CSV are strings ('0', '1', ...),
    # whereas the freshly computed frame above has integer column labels.
    cosims_df = pd.read_csv('data/cos_sims.csv', index_col = 0)

# Feature-engineered keyword Bag-of-Words model

In [None]:
# TF-IDF

In [None]:
# NLP

# Rebuild the vocabulary from the pooled tokens, this time also capping how
# common a token may be. This rebinds Pooled_dict and Pooled_corpus, which the
# LDA section above also assigns.
Pooled_dict = corpora.Dictionary(documents=cities['Pooled_tokens'])
Pooled_dict.filter_extremes(no_below=15, no_above=0.3)

# Encode each city's token list as a bag-of-words vector over the new vocabulary
Pooled_corpus = list(map(Pooled_dict.doc2bow, cities['Pooled_tokens']))

In [None]:
# Fit TF-IDF weights on the bag-of-words corpus, then apply them to that same corpus
tfidf = models.TfidfModel(corpus=Pooled_corpus)
Pooled_corpus_tfidf = tfidf[Pooled_corpus]

In [None]:
# TF-IDF LDA
# Training hyper-parameters, gathered in one place so they are easy to scan
tfidf_lda_settings = dict(
    num_topics=50,
    random_state=100,
    update_every=1,
    chunksize=1000,
    passes=1,
    alpha='auto',
    eta='auto',
)
Pooled_tfidf_lda = models.LdaModel(
    corpus=Pooled_corpus_tfidf,
    id2word=Pooled_dict,
    **tfidf_lda_settings
)

# Interactive topic visualisation, rendered inline as the cell's output
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(Pooled_tfidf_lda, Pooled_corpus_tfidf, Pooled_dict)
vis

In [None]:
# TF-IDF recs
# Topic distribution of every city under the TF-IDF-trained LDA model
Pooled_tfidf_topic_vectors = Pooled_tfidf_lda[Pooled_corpus_tfidf]

# Fetch the query city's vector once instead of re-indexing the lazy corpus on
# every loop iteration, then compare it with each city's vector in row order.
query_vector = Pooled_tfidf_topic_vectors[input_city]
Pooled_sims = [
    gensim.matutils.cossim(query_vector, city_vector)
    for city_vector in Pooled_tfidf_topic_vectors
]
# Overwrites the similarity column written by the plain-LDA recommendation cell
cities['Similarity to input'] = Pooled_sims