# Notebook for topic modeling 

# 0. Imports

In [1]:
## load packages 
import pandas as pd
import re
import numpy as np

## nltk imports
import nltk
nltk.download("stopwords")
from nltk.tokenize import word_tokenize, wordpunct_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

## sklearn imports
from sklearn.feature_extraction.text import CountVectorizer

## lda 
from gensim import corpora
import gensim

## visualization - if you have issues installing comment out
import pyLDAvis.gensim as gensimvis
# alternate: import pyLDAvis.gensim_models as gensimvis 
import pyLDAvis
#pyLDAvis.enable_notebook()

## print mult things
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

## random
import random

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/rebeccajohnson/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# 0. Load data

In [2]:
ab = pd.read_csv("../../../public_data/airbnb_text.zip")
ab.head()


Unnamed: 0,id,name,name_upper,neighbourhood_group,price
0,2539,Clean & quiet apt home by the park,CLEAN & QUIET APT HOME BY THE PARK,Brooklyn,149
1,2595,Skylit Midtown Castle,SKYLIT MIDTOWN CASTLE,Manhattan,225
2,3647,THE VILLAGE OF HARLEM....NEW YORK !,THE VILLAGE OF HARLEM....NEW YORK !,Manhattan,150
3,3831,Cozy Entire Floor of Brownstone,COZY ENTIRE FLOOR OF BROWNSTONE,Brooklyn,89
4,5022,Entire Apt: Spacious Studio/Loft by central park,ENTIRE APT: SPACIOUS STUDIO/LOFT BY CENTRAL PARK,Manhattan,80


# 1. Preprocess documents

In this case, each name/name_upper, or listing title, we're treating as a document

## 1.1 Load stopwords list and augment with our own custom ones

In [3]:
list_stopwords = stopwords.words("english")
custom_words_toadd = ['apartment', 'new york', 'nyc',
                      'bronx', 'brooklyn',
                     'manhattan', 'queens', 
                      'staten island']
list_stopwords_new = list_stopwords + custom_words_toadd


## 1.2 Remove stopwords from lowercase version of corpus


In [4]:
## convert to lowercase and a list
corpus_lower = ab.name.str.lower().to_list()

## use wordpunct tokenize and filter out with one
example_listing = corpus_lower[3]
wordpunct_tokenize(example_listing)
nostop_listing = [word 
                for word in wordpunct_tokenize(example_listing) 
                if word not in list_stopwords_new]


['cozy', 'entire', 'floor', 'of', 'brownstone']

## 1.3 stem and remove non-alpha

Other contexts we may want to leave digits in

In [5]:
## initialize stemmer
porter = PorterStemmer()

## apply to one by iterating
## over the tokens in the list
example_listing_preprocess = [porter.stem(token) 
                            for token in nostop_listing 
                            if token.isalpha() and 
                            len(token) > 2]


## 1.4 Activity 1

The above example performed preprocessing on a single Airbnb listing. We want to generalize this preprocessing across all listings.

- Embed step two (remove stopwords) and step three (stem) into one or two functions that take in a raw string (eg the raw text of an Airbnb review) and return a preprocessed string 
- Apply the function to all the texts in `corpus_lower`

In [6]:
def process_step1(one_str):   
    try:
        nostop_listing1 = [word for word in wordpunct_tokenize(one_str)
                          if word not in list_stopwords_new]
        clean_listing = [porter.stem(word) for word in nostop_listing1
                        if word.isalpha() 
                        and len(word) > 3]
        clean_listing_str = " ".join(clean_listing)
        return(clean_listing_str)
    except:
        return("")

cleaned_listings = [process_step1(one_listing) for one_listing in 
                   corpus_lower]


In [7]:
ab['proc_name'] = cleaned_listings


# 2. Create a document-term matrix and do some basic diagnostics (more manual approach)

Here we'll create a DTM first using the raw documents; in the activity, you'll create one using the preprocessed docs
that you created in activity 1

## 2.1 Define the dtm function and select data to transform into a document-term matrix

In [8]:
## function I'm providing
def create_dtm(list_of_strings, metadata):
    vectorizer = CountVectorizer(lowercase = True)
    dtm_sparse = vectorizer.fit_transform(list_of_strings)
    dtm_dense_named = pd.DataFrame(dtm_sparse.todense(),
                columns=vectorizer.get_feature_names())
    metadata.columns = ["metadata_" + col for col in metadata.columns]
    dtm_dense_named_withid = pd.concat([metadata.reset_index(), 
                                        dtm_dense_named], axis = 1)
    return(dtm_dense_named_withid)

In [9]:
## filter out na's
## for shorter runtime, random sampling of 1000
## get metadata for those
## and also renaming price col since it's likely to be corpus word
ab_small = ab.loc[~ab.name.isnull(),
           ['id', 'neighbourhood_group', 'price',
            'name']].copy().rename(columns = {'price':
            'price_rawdata'}).sample(n = 1000, random_state = 9899)

ab_small['name_lower'] = ab_small['name'].str.lower()
ab_small.head()

Unnamed: 0,id,neighbourhood_group,price_rawdata,name,name_lower
22540,18227529,Manhattan,140,Live in New York Near Central Park and Columbi...,live in new york near central park and columbi...
47531,35794273,Brooklyn,130,Hope Garden,hope garden
33906,26858196,Brooklyn,75,AWESOME 2 BEDS - QUEEN + SOFA - NEXT TO METRO,awesome 2 beds - queen + sofa - next to metro
12047,9369514,Brooklyn,25,STARTUP CHEAP PLACE BROOKLYN,startup cheap place brooklyn
2908,1669149,Manhattan,250,Beautiful Modern Midtown Apartment,beautiful modern midtown apartment


## 2.2 Execute the dtm function to create the document-term matrix

In [10]:
## example application on raw lowercase texts; 
dtm_nopre = create_dtm(list_of_strings= ab_small.name_lower,
                metadata = 
                ab_small[['id', 'neighbourhood_group', 'price_rawdata']])

dtm_nopre.head()

Unnamed: 0,index,metadata_id,metadata_neighbourhood_group,metadata_price_rawdata,10,100,1000,1000sq,10292,10ft,...,交通便利,位于北上远离开辆,家庭式獨立衛生間套房g,温馨小筑,简单的四房一厅两卫生间,纽约之家,走路四分钟到地铁站,건물,따뜻한,작은
0,22540,18227529,Manhattan,140,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,47531,35794273,Brooklyn,130,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,33906,26858196,Brooklyn,75,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,12047,9369514,Brooklyn,25,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2908,1669149,Manhattan,250,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## 2.3 Use that matrix/column sums to get basic summary stats of top words

In [11]:
## summing each col
top_terms = dtm_nopre[[col for col in dtm_nopre.columns
                      if "metadata" not in col and
                      col != "index"]].sum(axis = 0)

## sorting from most frequent to least frequent
top_terms.sort_values(ascending = False)

in            357
room          204
bedroom       164
private       146
cozy          134
             ... 
fitness         1
fl              1
flatbushbk      1
flatiron        1
작은              1
Length: 979, dtype: int64

## 2.4 Activity 2: repeat the above but using the preprocessed text data

- Stick with the same random sample of 1000 `ab_small`
- Apply the preprocessing steps from activity 1 to create a new column in `ab_small` with the 
preprocessed text (if you got stuck on that, try just removing stopwords)
- Use the `create_dtm` function to create a document-term matrix from the preprocessed data
-  Take the sum of each of the term columns to find the top words 

In [12]:
ab_small['processed'] = [process_step1(one_list)
                         for one_list in ab_small.name_lower.to_list()]

In [13]:
metadata_list = ['id', 'neighbourhood_group', 'price_rawdata']
dtm_pre = create_dtm(list_of_strings= ab_small.processed,
                metadata = 
                ab_small[metadata_list])


dtm_pre_termsonly = dtm_pre[[col for col in dtm_pre.columns if 
                            "metadata" not in col and col != "index"]]

dtm_pre_termsonly.sum().sort_values(ascending = False)

room         208
bedroom      167
privat       146
cozi         134
studio        94
            ... 
industri       1
indoor         1
independ       1
incred         1
走路四分钟到地铁站      1
Length: 642, dtype: int64

# 3. Use gensim to more automatically preprocess/estimate a topic model

## 3.1 Creating the objects to feed the LDA modeling function

Different outputs described below: 
- Tokenized and preprocessed text 
- Dictionary 
- Corpus 

In [14]:
## Step 1: re-tokenize and store in list
## here, i'm doing with the raw random sample of text
## in activity, you should do with the preprocessed texts
text_raw_tokens = [wordpunct_tokenize(one_text) 
                for one_text in 
                ab_small.name_lower]

## Step 2: use gensim create dictionary - gets all unique words across documents
text_raw_dict = corpora.Dictionary(text_raw_tokens)

## Step 3: filter out very rare and very common words
## here, i'm using the threshold that a word needs to appear in at least
## 5% of docs but not more than 95%
## this is an integer count of docs so i round
lower_bound = round(ab_small.shape[0]*0.05)
upper_bound = round(ab_small.shape[0]*0.95)

### apply filtering to dictionary
text_raw_dict.filter_extremes(no_below = lower_bound,
                             no_above = upper_bound)

# Step 4: apply dictionary to TOKENIZED texts
## this creates a mapping between each word 
## in a specific listing and the key in the dictionary
## for words that remain in the filtered dictionary
## output is a list where len(list) == n documents
## and each element in the list is a list of tuples
## containing the mappings
corpus_fromdict = [text_raw_dict.doc2bow(one_text) 
                   for one_text in text_raw_tokens]



##  3.2 Estimating the model

In [15]:
## Step 5: we're finally ready to estimate the model!
## full documentation here - https://radimrehurek.com/gensim/models/ldamodel.html
## here, we're feed the lda function (1) the corpus we created from the dictionary
## (2) a parameter we decide on for the number of topics,
## (3) the dictionary itself,
## (4) parameter for number of passes through training data
## (5) parameter that returns, for each word remaining in dict, the 
## topic probabilities
## see documentation for many other arguments you can vary
ldamod = gensim.models.ldamodel.LdaModel(corpus_fromdict, 
                                num_topics = 4, id2word=text_raw_dict, 
                                passes=6, alpha = 'auto',
                                per_word_topics = True, random_state = 91988)


## 3.3  Seeing what topics the estimated model discovers

In [16]:
## Post-model 1: explore corpus-wide summary of topics
### getting the topics and top words; can retrieve diff top words
topics = ldamod.print_topics(num_words = 10)
for topic in topics:
    print(topic)


(0, '0.218*"in" + 0.122*"apartment" + 0.101*"room" + 0.089*"the" + 0.076*"of" + 0.055*"manhattan" + 0.044*"apt" + 0.044*"near" + 0.038*"to" + 0.036*"park"')
(1, '0.134*"bedroom" + 0.110*"in" + 0.110*"," + 0.104*"-" + 0.086*"1" + 0.077*"spacious" + 0.070*"east" + 0.046*"to" + 0.040*"apartment" + 0.039*"sunny"')
(2, '0.168*"cozy" + 0.109*"," + 0.094*"/" + 0.081*"park" + 0.077*"room" + 0.065*"with" + 0.058*"2" + 0.047*"!" + 0.039*"-" + 0.039*"and"')
(3, '0.156*"private" + 0.124*"." + 0.099*"studio" + 0.092*"in" + 0.083*"room" + 0.057*"!" + 0.044*"to" + 0.043*"brooklyn" + 0.039*"manhattan" + 0.037*"/"')


In [17]:
    
## Post-model 2: explore topics associated with each document
### for each item in our original dictionary, get list of topic probabilities
l=[ldamod.get_document_topics(item) for item in corpus_fromdict]
### print result
text_raw_tokens[0:5]
l[0:5]

[['live',
  'in',
  'new',
  'york',
  'near',
  'central',
  'park',
  'and',
  'columbia',
  'u',
  '.'],
 ['hope', 'garden'],
 ['awesome',
  '2',
  'beds',
  '-',
  'queen',
  '+',
  'sofa',
  '-',
  'next',
  'to',
  'metro'],
 ['startup', 'cheap', 'place', 'brooklyn'],
 ['beautiful', 'modern', 'midtown', 'apartment']]

[[(0, 0.7291896), (1, 0.041319154), (2, 0.031489335), (3, 0.19800192)],
 [(0, 0.24624164), (1, 0.2782907), (2, 0.212672), (3, 0.2627957)],
 [(0, 0.043327797), (1, 0.8722155), (2, 0.03763539), (3, 0.04682137)],
 [(0, 0.11719994), (1, 0.13181882), (2, 0.09832003), (3, 0.6526612)],
 [(0, 0.65151316), (1, 0.13048814), (2, 0.0974924), (3, 0.12050633)]]

### Visualizing 

In [18]:

lda_display = gensimvis.prepare(ldamod, corpus_fromdict, text_raw_dict)
pyLDAvis.display(lda_display)

## 3.4 Activity 3

- Preprocess the texts
- Repeat the preprocessing steps and running of the topic model with preprocessed texts (can also play around with other parameters like n_topics)- what seems to produce useful topics?



In [19]:
# your code here
ab_small = ab_small[ab_small.processed != ""].copy()

tokenized_text = [wordpunct_tokenize(one_text) 
                for one_text in 
                ab_small.processed]


In [20]:
## preprocess and estimate topicmod
### create dictionary
text_proc_dict = corpora.Dictionary(tokenized_text)
### filter dictionary- using 2% as bounds
text_proc_dict.filter_extremes(no_below = round(ab_small.shape[0]*0.02),
                             no_above = round(ab_small.shape[0]*0.98))

### create corpus from dictionary
corpus_fromdict_proc = [text_proc_dict.doc2bow(one_text) 
                       for one_text in tokenized_text]


In [21]:
### estimate model
n_topics = 3
ldamod_proc = gensim.models.ldamodel.LdaModel(corpus_fromdict_proc, 
                                         num_topics = n_topics, id2word=text_proc_dict, 
                                         passes=6, alpha = 'auto',
                                        per_word_topics = True, random_state = 91988)

### print topics and words
topics = ldamod_proc.print_topics(num_words = 15)
for topic in topics:
    print(topic)
    

(0, '0.161*"room" + 0.122*"bedroom" + 0.113*"privat" + 0.072*"cozi" + 0.063*"spaciou" + 0.048*"park" + 0.037*"near" + 0.036*"home" + 0.031*"larg" + 0.030*"beauti" + 0.028*"view" + 0.028*"central" + 0.025*"charm" + 0.024*"luxuri" + 0.024*"bushwick"')
(1, '0.132*"east" + 0.099*"villag" + 0.098*"modern" + 0.093*"williamsburg" + 0.072*"brownston" + 0.062*"quiet" + 0.058*"garden" + 0.047*"harlem" + 0.043*"suit" + 0.035*"bright" + 0.031*"beauti" + 0.026*"loft" + 0.026*"near" + 0.025*"cozi" + 0.021*"studio"')
(2, '0.112*"studio" + 0.089*"sunni" + 0.062*"cozi" + 0.060*"heart" + 0.053*"locat" + 0.051*"west" + 0.044*"side" + 0.041*"upper" + 0.040*"close" + 0.039*"train" + 0.039*"prime" + 0.036*"midtown" + 0.033*"time" + 0.033*"park" + 0.031*"loft"')


In [22]:
### visualize
pyLDAvis.enable_notebook()
lda_display_proc = gensimvis.prepare(ldamod_proc, corpus_fromdict_proc, text_proc_dict)
pyLDAvis.display(lda_display_proc)

# Additional summaries of topics and documents 

What if we want to find which topics are associated with higher listing prices?

In [23]:
## get topic probabilities by doc and find mean listing by topic
### get document topics - list of list tuples
topic_probs_bydoc =[ldamod_proc.get_document_topics(item) for item in corpus_fromdict_proc]

## each document has a list containing topic, probability
## tuples- example w/ first document
one_list_tup = topic_probs_bydoc[0]

## create a long for dataframe by flattening the list
topic_probs_bydoc_long = pd.DataFrame([t for lst in topic_probs_bydoc for t in lst],
                                     columns = ['topic', 'probability'])

## add id var- we're repeating each id in the original data k times
## for the number of topics
topic_probs_bydoc_long['doc_id'] = list(np.concatenate([[one_id] * 
                                    n_topics for one_id in ab_small.id]).flat)

## pivot to wide format
topic_probs_bydoc_wide = pd.pivot_table(topic_probs_bydoc_long, index = ['doc_id'],
                        columns = ['topic']).reset_index().reset_index(drop = True)
topic_probs_bydoc_wide.columns = ['doc_id'] + ["topic_" + str(i) for i in np.arange(0, n_topics)]

## merge with original data using doc id
topic_wmeta = pd.merge(topic_probs_bydoc_wide,
                      ab_small,
                      left_on = 'doc_id',
                      right_on = 'id')

## create indicator for listing's top topic
topic_wmeta['toptopic'] = topic_wmeta[[col for col in topic_wmeta.columns if 
                                    "topic_" in col]].idxmax(axis=1)
topic_wmeta.sample(n = 5, random_state = 555)

## group by topic and find mean price
topic_wmeta.groupby('toptopic').agg({'price_rawdata': np.mean})

## group by borough and topic -- higher price for some also reflects
## diff borough composition
topic_wmeta.groupby(['toptopic', 
                    'neighbourhood_group']).agg({'price_rawdata': np.mean})

Unnamed: 0,doc_id,topic_0,topic_1,topic_2,id,neighbourhood_group,price_rawdata,name,name_lower,processed,toptopic
487,19691179,0.362415,0.252236,0.385349,19691179,Manhattan,125,Sun-filled Lower East Side Guest Room,sun-filled lower east side guest room,fill lower east side guest room,topic_2
635,24712037,0.288124,0.128003,0.583873,24712037,Brooklyn,198,Stylish Design Apartment in the Heart of Brooklyn,stylish design apartment in the heart of brooklyn,stylish design heart,topic_2
713,28165121,0.561971,0.056605,0.381424,28165121,Brooklyn,70,"Large, Sunny Retreat- B/Q subway, Prospect Park!","large, sunny retreat- b/q subway, prospect park!",larg sunni retreat subway prospect park,topic_0
262,10105477,0.287589,0.130877,0.581534,10105477,Manhattan,97,Midtown Manhattan,midtown manhattan,midtown,topic_2
941,35009512,0.528191,0.351617,0.120192,35009512,Brooklyn,123,"One bedroom in Williamsburg, Brooklyn","one bedroom in williamsburg, brooklyn",bedroom williamsburg,topic_0


Unnamed: 0_level_0,price_rawdata
toptopic,Unnamed: 1_level_1
topic_0,146.550432
topic_1,203.54902
topic_2,156.869347


Unnamed: 0_level_0,Unnamed: 1_level_0,price_rawdata
toptopic,neighbourhood_group,Unnamed: 2_level_1
topic_0,Bronx,49.2
topic_0,Brooklyn,122.79
topic_0,Manhattan,199.262963
topic_0,Queens,91.196262
topic_0,Staten Island,116.857143
topic_1,Bronx,82.5
topic_1,Brooklyn,147.976744
topic_1,Manhattan,256.592593
topic_1,Queens,126.0
topic_2,Bronx,200.0
