In [39]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import linear_model
from sklearn import preprocessing
import matplotlib.pyplot as plt
#from haversine import haversine 
from progressbar import ProgressBar
import yaml

## for text field processing
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn import datasets, feature_extraction, decomposition
from nltk.stem.porter import PorterStemmer

import gensim
from gensim import corpora
import re
import stop_words
from stop_words import get_stop_words
import operator

## for topic visualization
import pyLDAvis
import pyLDAvis.gensim

In [40]:
listings = pd.read_csv('data/airbnb_listings.csv')

In [41]:
#Import the for loop progress bar
pbar = ProgressBar()

In [42]:
## Finding sentiment intensity score for description field
## saving "compound" score to listings dataframe

listings["Description_Length"] = np.nan
sent_int = SentimentIntensityAnalyzer()

for i in range(len(listings["description"])):
    s = listings['description'][i]
    
    ## generating sentiment intensity score for descriptions
    sentiment = sent_int.polarity_scores(s)
    listings.loc[i, "Sentiment_intensity"] = sentiment["compound"]
    
    ## Findig the length of the description
    listings.loc[i,"Description_Length"] = len(listings["description"][i].split())

In [43]:
## Topic modelling on description using LDA
stop = get_stop_words('en')
add_stop_word = ['etc', 'well', 'll', 'get', 'within, ''please', 'good', 'many', 'l', 'g', 'c', 
                 'want']
stop.extend(add_stop_word)
desc = []

## Creating a list of description
desc_list = [listings["description"][i] for i in range(len(listings["description"]))]

for i in desc_list:
    ## tokanizing descrption into words
    desc_list_w = nltk.tokenize.word_tokenize(i)
    
    ## converting all words to lower case
    desc_lower = [w.lower() for w in desc_list_w]
    
    ## removing numerical values and special characters
    desc_words = [w for w in desc_lower if re.search('^[a-z]+$',w)]
    
    ## removing stop words
    desc_stopremoved = [w for w in desc_words if w not in stop]
    
    ## add tokens to list
    desc.append(desc_stopremoved)

In [44]:
## Gensim's Dictionary encapsulates the mapping between words and their integer ids. 
desc_dict = corpora.Dictionary(desc)

## filter out words which appear in less than 20 documents or more than 15% documents
desc_dict.filter_extremes(no_below=20, no_above=0.15) 
print(desc_dict)

## creating bag of words
corpus = [desc_dict.doc2bow(d) for d in desc]
len(corpus)

Dictionary(7556 unique tokens: ['sunlit', 'brownstone', 'loveliest', 'brooklyn', 'promenade']...)


74111

In [54]:
## training an LDA model
lda_model = gensim.models.LdaModel(corpus, alpha='auto', num_topics=5, id2word=desc_dict)

In [55]:
## printing topics for lda model
lda_model.show_topics(num_topics=5,num_words=30)

[(0,
  '0.011*"brooklyn" + 0.007*"east" + 0.007*"nyc" + 0.006*"around" + 0.006*"square" + 0.006*"trains" + 0.005*"best" + 0.005*"minute" + 0.005*"shops" + 0.005*"right" + 0.005*"williamsburg" + 0.005*"cozy" + 0.005*"york" + 0.005*"min" + 0.005*"lots" + 0.005*"apt" + 0.005*"francisco" + 0.005*"village" + 0.005*"light" + 0.005*"san" + 0.005*"like" + 0.005*"easy" + 0.004*"time" + 0.004*"side" + 0.004*"central" + 0.004*"shared" + 0.004*"amazing" + 0.004*"safe" + 0.004*"everything" + 0.004*"perfect"'),
 (1,
  '0.008*"fully" + 0.007*"closet" + 0.007*"dining" + 0.007*"bedrooms" + 0.007*"use" + 0.006*"towels" + 0.006*"microwave" + 0.006*"table" + 0.006*"floors" + 0.006*"mattress" + 0.006*"cable" + 0.006*"air" + 0.006*"small" + 0.006*"bath" + 0.005*"guest" + 0.005*"sofa" + 0.005*"shower" + 0.005*"equipped" + 0.005*"unit" + 0.005*"dryer" + 0.005*"furnished" + 0.005*"shared" + 0.005*"renovated" + 0.005*"beds" + 0.005*"modern" + 0.004*"high" + 0.004*"couch" + 0.004*"desk" + 0.004*"appliances" + 0.

In [56]:
# Visualize the topics
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, desc_dict)
pyLDAvis.show(vis)

#http://127.0.0.1:8889/#topic=0&lambda=1&term=


Note: if you're in the IPython notebook, pyLDAvis.show() is not the best command
      to use. Consider using pyLDAvis.display(), or pyLDAvis.enable_notebook().
      See more information at http://pyLDAvis.github.io/quickstart.html .

You must interrupt the kernel to end this command

Serving to http://127.0.0.1:8889/    [Ctrl-C to exit]


127.0.0.1 - - [31/Oct/2018 13:54:18] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [31/Oct/2018 13:54:18] "GET /LDAvis.css HTTP/1.1" 200 -
127.0.0.1 - - [31/Oct/2018 13:54:18] "GET /d3.js HTTP/1.1" 200 -
127.0.0.1 - - [31/Oct/2018 13:54:18] "GET /LDAvis.js HTTP/1.1" 200 -



stopping Server...


In [60]:
## generating topic describution over each description

listings["topic_general"] = 0.0        ## for topic 0
listings["Topic_utilities"] = 0.0      ## for topic 1
listings["Topic_Transport"] = 0.0      ## for topic 2
listings["Topic_listing_desc"] = 0.0   ## for topic 3
listings["Topic_attractions"] = 0.0    ## for topic 4

## defining topic_name to assign topic names
def topic_name(i):
        switcher={
                0:'topic_general',
                1:'Topic_utilities',
                2:'Topic_Transport',
                3:'Topic_listing_desc',
                4:'Topic_attractions',
             }
        return switcher.get(i,"Invalid")
        
for d in range(len(listings["description"])):
    bow = desc_dict.doc2bow(desc[d])
    t = lda_model.get_document_topics(bow)
    
    for i in range(len(t)):
        #print("description", d, ":", t[i][0], t[i][1])
        temp = topic_name(t[i][0])
        listings.loc[d,temp] = t[i][1]
    #print("------")

In [62]:
listings.head(-10)

Unnamed: 0,id,log_price,property_type,room_type,amenities,accommodates,bathrooms,bed_type,cancellation_policy,cleaning_fee,...,zipcode,bedrooms,beds,Description_Length,Sentiment_intensity,topic_general,Topic_utilities,Topic_Transport,Topic_listing_desc,Topic_attractions
0,6901257,5.010635,Apartment,Entire home/apt,"{""Wireless Internet"",""Air conditioning"",Kitche...",3,1.0,Real Bed,strict,True,...,11201,1.0,1.0,31.0,0.8658,0.940199,0.021884,0.012032,0.013794,0.012090
1,6304928,5.129899,Apartment,Entire home/apt,"{""Wireless Internet"",""Air conditioning"",Kitche...",7,1.0,Real Bed,strict,True,...,10019,3.0,3.0,168.0,0.9755,0.335603,0.185079,0.448822,0.028165,0.000000
2,7919400,4.976734,Apartment,Entire home/apt,"{TV,""Cable TV"",""Wireless Internet"",""Air condit...",5,1.0,Real Bed,moderate,True,...,10027,1.0,3.0,167.0,0.9420,0.000000,0.822422,0.169054,0.000000,0.000000
3,13418779,6.620073,House,Entire home/apt,"{TV,""Cable TV"",Internet,""Wireless Internet"",Ki...",4,1.0,Real Bed,flexible,True,...,94117.0,2.0,2.0,78.0,0.8513,0.968196,0.011978,0.000000,0.000000,0.000000
4,3808709,4.744932,Apartment,Entire home/apt,"{TV,Internet,""Wireless Internet"",""Air conditio...",2,1.0,Real Bed,moderate,True,...,20009,0.0,1.0,115.0,0.9401,0.361290,0.000000,0.628243,0.000000,0.000000
5,12422935,4.442651,Apartment,Private room,"{TV,""Wireless Internet"",Heating,""Smoke detecto...",2,1.0,Real Bed,strict,True,...,94131,1.0,1.0,164.0,0.9564,0.482964,0.456538,0.000000,0.000000,0.055427
6,11825529,4.418841,Apartment,Entire home/apt,"{TV,Internet,""Wireless Internet"",""Air conditio...",3,1.0,Real Bed,moderate,True,...,90292,1.0,1.0,158.0,0.9915,0.069379,0.000000,0.000000,0.064509,0.861585
7,13971273,4.787492,Condominium,Entire home/apt,"{TV,""Cable TV"",""Wireless Internet"",""Wheelchair...",2,1.0,Real Bed,moderate,True,...,90015,1.0,1.0,164.0,0.9089,0.000000,0.000000,0.383790,0.345340,0.263159
8,180792,4.787492,House,Private room,"{TV,""Cable TV"",""Wireless Internet"",""Pets live ...",2,1.0,Real Bed,moderate,True,...,94121,1.0,1.0,172.0,0.9913,0.048999,0.569324,0.027008,0.352264,0.000000
9,5385260,3.583519,House,Private room,"{""Wireless Internet"",""Air conditioning"",Kitche...",2,1.0,Real Bed,moderate,True,...,91748,1.0,1.0,121.0,0.6369,0.000000,0.000000,0.000000,0.975819,0.000000


In [65]:
## write df to csv file
#listings.columns
listings.to_csv("data/airbnb_listings_LDA.csv", encoding='utf-8', index=False)