In [1]:
import os
import numpy as np
import pandas as pd
from time import time

import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns
# Global plot typography: math font set, text font family and base font size,
# applied in one call.
matplotlib.rcParams.update({
    'mathtext.fontset': 'cm',
    'font.family': 'STIXGeneral',
    "font.size": 26,
})

# Root folder holding the review data. Set the AIRBNB_DATA_DIR environment
# variable to point somewhere else (e.g. "D:/") instead of editing this cell;
# without it the original default location is used.
data_dir = os.environ.get("AIRBNB_DATA_DIR", "/home/stavros/DATA/AirbnbReviews")

In [2]:
# Select the city and resolve the folder that holds its files.
area = "nyc"
area_dir = os.path.join(data_dir, area)

# Read the sampled review file for that city.
reviews_path = os.path.join(area_dir, "nyc_reviews_nostopwords_en_20000samples.csv")
reviews = pd.read_csv(reviews_path)

# Quick look at the size and the available columns.
print(reviews.shape)
print(reviews.columns)

(20000, 8)
Index(['Unnamed: 0', 'listing_id', 'id', 'date', 'reviewer_id',
       'reviewer_name', 'comments', 'normalized_comments'],
      dtype='object')


In [3]:
# Keep only reviews that still have text after normalization.
clean_reviews = reviews[reviews.normalized_comments.notnull()]
print(clean_reviews.shape)

# Flag reviews whose raw text mentions "canceled" (presumably Airbnb's
# automated cancellation notices -- TODO confirm) and drop them.
# The mask is True for the rows being removed, so its name matches its meaning;
# na=False treats a missing comment as "not automated" rather than raising.
is_automated = clean_reviews.comments.str.contains("canceled", regex=False, na=False)
clean_reviews = clean_reviews[~is_automated]
print(clean_reviews.shape)

(20000, 8)
(19739, 8)


Tokenize the documents

In [4]:
from nltk.tokenize import RegexpTokenizer

# Tokenizer that extracts runs of word characters.
tokenizer = RegexpTokenizer(r'\w+')

# Tokenize every normalized review and, in the same pass, discard tokens that
# are only one character long.
docs = [
    [token for token in tokenizer.tokenize(review) if len(token) > 1]
    for review in clean_reviews.normalized_comments
]

Create dictionary and bag-of-words corpus

In [6]:
from gensim import corpora, models
# Build the gensim dictionary (vocabulary) from the tokenized reviews.
dictionary = corpora.Dictionary(docs)
# Vocabulary size before any pruning.
print(len(dictionary))
# Optional pruning of the vocabulary, currently disabled.
# NOTE(review): no_above=0. looks like it would discard every token -- confirm
# the intended document-fraction threshold before re-enabling.
#dictionary.filter_extremes(no_below=10, no_above=0.)
#print(len(dictionary))

14933


In [8]:
# Turn each tokenized review into its bag-of-words representation.
corpus = list(map(dictionary.doc2bow, docs))

# Look up one id before reading `id2token`; presumably gensim fills that
# reverse mapping lazily on first lookup -- verify against the gensim docs.
_ = dictionary[0]
id2word = dictionary.id2token

Check how many times each word in the dictionary appears

In [5]:
from collections import Counter

# Count how often each token occurs across all tokenized reviews.
# Counter is a dict subclass, so len() and .items() behave as with a plain
# dict and keys stay in first-seen order.
mydict = Counter(word for doc in docs for word in doc)
print(len(mydict))

14933


Note that this has the same length (14933) as the dictionary created by `gensim`.

In [10]:
# Split the frequency table into two parallel sequences:
# words[i] occurs appearances[i] times in the corpus.
words = list(mydict.keys())
appearances = np.array(list(mydict.values()))

# Frequency threshold: words occurring more often than this are reported as
# one group, the rest are broken down by exact count.
start = 10

apps = (appearances > start).sum()
print("Appears more than {}: {}".format(start, apps))
# Count down from the threshold itself so the breakdown always lines up with
# the "more than" bucket above, whatever value `start` is given.
for i in range(start, 0, -1):
    apps = (appearances == i).sum()
    print("Appears {} times: {}".format(i, apps))

Appears more than 10: 3030
Appears 10 times: 145
Appears 9 times: 166
Appears 8 times: 174
Appears 7 times: 231
Appears 6 times: 310
Appears 5 times: 398
Appears 4 times: 598
Appears 3 times: 951
Appears 2 times: 1836
Appears 1 times: 7094


In [19]:
# Collect the words whose total number of occurrences exceeds 10.
words_more_than10 = {
    word for word, count in zip(words, appearances) if count > 10
}
print(len(words_more_than10))

3030


In [40]:
# Spot check: is the word "bathroom" among the words that appear more than 10 times?
"bathroom" in words_more_than10

True

Apply `gensim` LDA model

In [11]:
# Fit a 3-topic LDA model with 20 passes over the bag-of-words corpus.
# The Dictionary itself is passed as id2word, so the fit does not depend on
# `dictionary.id2token` having been filled by a `dictionary[0]` lookup in
# another cell. random_state fixes the RNG so the topics are reproducible
# across kernel restarts.
model = models.LdaModel(corpus=corpus, id2word=dictionary, num_topics=3,
                        passes=20, random_state=0)

In [12]:
# Show each topic as (topic id, weighted list of its highest-weight words).
model.print_topics()

[(0,
  '0.029*"not" + 0.015*"room" + 0.012*"apartment" + 0.010*"stay" + 0.010*"place" + 0.010*"us" + 0.009*"get" + 0.008*"bed" + 0.007*"night" + 0.007*"good"'),
 (1,
  '0.040*"stay" + 0.036*"great" + 0.029*"place" + 0.023*"host" + 0.019*"apartment" + 0.017*"location" + 0.015*"would" + 0.014*"clean" + 0.014*"recommend" + 0.011*"nice"'),
 (2,
  '0.023*"subway" + 0.021*"walk" + 0.020*"restaurant" + 0.016*"great" + 0.014*"manhattan" + 0.013*"close" + 0.012*"minute" + 0.010*"station" + 0.010*"apartment" + 0.009*"park"')]