In [1]:
import numpy as np
import nltk
import pandas as pd
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
import warnings
warnings.filterwarnings('ignore')

import pyLDAvis
import pyLDAvis.gensim
from nltk.corpus import stopwords

from pprint import pprint


# Import data

In [3]:
filename = "data/airbnb.csv"

df = pd.read_csv(filename)
display(df.head(5))

Unnamed: 0,id,log_price,property_type,room_type,amenities,accommodates,bathrooms,bed_type,cancellation_policy,cleaning_fee,...,latitude,longitude,name,neighbourhood,number_of_reviews,review_scores_rating,thumbnail_url,zipcode,bedrooms,beds
0,6901257,5.010635,Apartment,Entire home/apt,"{""Wireless Internet"",""Air conditioning"",Kitche...",3,1.0,Real Bed,strict,True,...,40.696524,-73.991617,Beautiful brownstone 1-bedroom,Brooklyn Heights,2,100.0,https://a0.muscache.com/im/pictures/6d7cbbf7-c...,11201.0,1.0,1.0
1,6304928,5.129899,Apartment,Entire home/apt,"{""Wireless Internet"",""Air conditioning"",Kitche...",7,1.0,Real Bed,strict,True,...,40.766115,-73.98904,Superb 3BR Apt Located Near Times Square,Hell's Kitchen,6,93.0,https://a0.muscache.com/im/pictures/348a55fe-4...,10019.0,3.0,3.0
2,7919400,4.976734,Apartment,Entire home/apt,"{TV,""Cable TV"",""Wireless Internet"",""Air condit...",5,1.0,Real Bed,moderate,True,...,40.80811,-73.943756,The Garden Oasis,Harlem,10,92.0,https://a0.muscache.com/im/pictures/6fae5362-9...,10027.0,1.0,3.0
3,13418779,6.620073,House,Entire home/apt,"{TV,""Cable TV"",Internet,""Wireless Internet"",Ki...",4,1.0,Real Bed,flexible,True,...,37.772004,-122.431619,Beautiful Flat in the Heart of SF!,Lower Haight,0,,https://a0.muscache.com/im/pictures/72208dad-9...,94117.0,2.0,2.0
4,3808709,4.744932,Apartment,Entire home/apt,"{TV,Internet,""Wireless Internet"",""Air conditio...",2,1.0,Real Bed,moderate,True,...,38.925627,-77.034596,Great studio in midtown DC,Columbia Heights,4,40.0,,20009.0,0.0,1.0


In [4]:
text_col = 'description'

# Class

In [7]:
class NLP():
    def preprocess(self, df, lemmatize = False):
        """
        df: text pd.Series
        
        Standard text data preprocessing on text data column
        Returns the preprocessed column
        """
        # Preprocessing
        data = df.copy(deep = True)
        # To lowercase
        data = data.apply(lambda x: x.lower())
        # Remove stop words
        stop = stopwords.words('english')
        data = data.apply(lambda x: " ".join(x for x in x.split() if x not in stop))
        if lemmatize:
            from nltk.stem.wordnet import WordNetLemmatizer 
            lem = WordNetLemmatizer()
            data = data.apply(lambda x: " ".join([lem.lemmatize(word) for word in x.split()]))
        # Remove top 10 most frequent words
        freq = pd.Series(' '.join(data).split()).value_counts()[:10]
        print("Most frequent words, removed:\n", freq)
        freq = list(freq.index)
        data = data.apply(lambda x: " ".join(x for x in x.split() if x not in freq))
        # Tokenize, to_lowercase
        data = data.apply(lambda x: gensim.utils.simple_preprocess(x))
        # Make bigram
        bigram = gensim.models.Phrases(data, min_count=5, threshold=100)
        bigram_mod = gensim.models.phrases.Phraser(bigram)
        data = data.apply(lambda x: bigram_mod[x])

        return data 
    
    def calc_length(self, df):
        """
        Compute length of text data and number of words
        """
        df['number_of_words'] = data.apply(lambda x: len(x.split(" ")))
        df['length_of_string'] = data.apply(lambda x: len(x))
    
    
    def build_lda_model(self, df, n_topics = 10):
        """
        df: text pd.Series
        n_topics: number of topics
        
        Build a LDA model, assign as a variable to self
        Print main topics
        """
        np.random.seed(10)
        sample = df.sample(len(df)//10)
        id2word = corpora.Dictionary(sample)
        corpus = [id2word.doc2bow(text) for text in sample]
        lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                                   id2word=id2word,
                                                   num_topics=n_topics, 
                                                   random_state=100,
                                                   update_every=1,
                                                   chunksize=100,
                                                   passes=10,
                                                   alpha='auto',
                                                   per_word_topics=True)
        self.lda_model = lda_model
        self.id2word = id2word
        self.corpus = corpus
        
        pprint(lda_model.print_topics(n_topics,5))
    
    def calc_topic_distribution(self, df):
        """
        df: text pd.Series
        classify: if True, classify each doc according its most probable topic
        
        Calculates the probability of each document to belong to a topic 
        Returns df with new features
        """
        def dist2array(distribution,n_topics):
            """Auxiliary function: Returns array of topic distribution by filling in topics that have 0% probability"""
            mydict = dict(distribution)
            ret = np.array([0 if mydict.get(i)==None else  mydict.get(i) for i in range(n_topics)])
            return ret

        def get_topic_dist_text(text, model):
            """text is a string
            Returns Panda DataFrame of topic distribution"""
            """text is a panda dataframe, index = dates, value = X-th articles of day X """
            # Vectorize
            query = text
            query = model.id2word.doc2bow(query)
            test = model.get_document_topics(query)# Predict topic
            return dist2array(test,model.num_topics)

        def get_topic_dist_df(df, model):
            """df is a panda dataframe, index = dates, value = X-th articles of day X """
            td = np.zeros((len(df),model.num_topics))
            for i in range(len(df)):
                td[i] = get_topic_dist_text(df[i],model)    
            return td
        topic_dist =  get_topic_dist_df(df, self.lda_model)
        return topic_dist

nlp = NLP()

# Preprocessing


In [8]:
df['processed_text'] = nlp.preprocess(df[text_col], lemmatize = True)
display(df['processed_text'].sample(10))

Most frequent words, removed:
 room         57800
apartment    56772
bedroom      49210
private      38386
living       34564
one          34006
located      33255
minute       32941
2            32933
block        32792
dtype: int64


25605    [level, condo, baths, balcony, le, mile, beach...
23355    [sunny, julie, balcony, quiet, green, street, ...
59626    [chic, red, painted, rent, beautiful, spanish,...
60569    [big, furnished, astoria, ave, th, st, ft, fem...
21570    [整洁, 优雅的房间设计风格, 并配有网络, 洗衣机_烘干机, 尽情享受厨房自助用餐, 散步...
43945    [beautiful, townhouse, heart, silverlake, next...
73404    [great, newly, renovated, apartmentl, away, ti...
37927    [warm, homey, two, view, city, front, view, tw...
40106    [great, high, ceiling, situated, prime, locati...
62116    [place, close, south, street, seaport, wall, s...
Name: processed_text, dtype: object

# LDA

Create LDA model using random 10% of words

In [9]:
warnings.filterwarnings('ignore')
nlp.build_lda_model(df['processed_text'])

[(0,
  '0.038*"park" + 0.032*"restaurants" + 0.029*"walking" + 0.026*"distance" + '
  '0.022*"great"'),
 (1,
  '0.035*"boston" + 0.024*"suite" + 0.020*"comfortably" + 0.016*"north" + '
  '0.016*"child"'),
 (2,
  '0.027*"kitchen" + 0.027*"bed" + 0.020*"bathroom" + 0.019*"full" + '
  '0.018*"space"'),
 (3,
  '0.136*"place" + 0.054*"love" + 0.050*"you" + 0.039*"good" + 0.039*"close"'),
 (4,
  '0.027*"please" + 0.022*"guest" + 0.019*"clean" + 0.018*"free" + '
  '0.017*"check"'),
 (5,
  '0.041*"hollywood" + 0.035*"beach" + 0.025*"parking" + 0.024*"la" + '
  '0.021*"mile"'),
 (6, '0.039*"tv" + 0.025*"wifi" + 0.023*"unit" + 0.022*"fully" + 0.022*"new"'),
 (7,
  '0.032*"away" + 0.023*"min" + 0.023*"train" + 0.023*"walk" + 0.019*"subway"'),
 (8,
  '0.034*"dc" + 0.020*"include" + 0.016*"capitol" + 0.014*"white" + '
  '0.011*"design"'),
 (9,
  '0.035*"san_francisco" + 0.027*"bay" + 0.025*"harlem" + 0.017*"bridge" + '
  '0.015*"分钟"')]


### Calculate Topic distribution

In [10]:
topic_dist = nlp.calc_topic_distribution(df['processed_text'])
topic_dist[:5]

array([[0.34016129, 0.0159883 , 0.24625915, 0.02173886, 0.03498174,
        0.03460427, 0.05211523, 0.16235243, 0.01326873, 0.07853001],
       [0.15565577, 0.        , 0.30799562, 0.08517253, 0.02905184,
        0.01122754, 0.09187645, 0.28773046, 0.        , 0.02212713],
       [0.1006758 , 0.        , 0.41348863, 0.        , 0.07947843,
        0.01256281, 0.21594732, 0.10533304, 0.01488947, 0.04452284],
       [0.19895428, 0.02405594, 0.23016661, 0.01152891, 0.02576772,
        0.03257952, 0.04824295, 0.41686451, 0.        , 0.        ],
       [0.29223424, 0.        , 0.22674462, 0.05201141, 0.05869599,
        0.06174876, 0.03700904, 0.24033335, 0.02081039, 0.        ]])

### Use topic distribution as features

In [13]:
df = pd.concat([df, pd.DataFrame(topic_dist, 
                columns = ['DistTopic'+str(i) for i in range(nlp.lda_model.num_topics)])], axis = 1)
display(df.head(5))

Unnamed: 0,id,log_price,property_type,room_type,amenities,accommodates,bathrooms,bed_type,cancellation_policy,cleaning_fee,...,DistTopic0,DistTopic1,DistTopic2,DistTopic3,DistTopic4,DistTopic5,DistTopic6,DistTopic7,DistTopic8,DistTopic9
0,6901257,5.010635,Apartment,Entire home/apt,"{""Wireless Internet"",""Air conditioning"",Kitche...",3,1.0,Real Bed,strict,True,...,0.340161,0.015988,0.246259,0.021739,0.034982,0.034604,0.052115,0.162352,0.013269,0.07853
1,6304928,5.129899,Apartment,Entire home/apt,"{""Wireless Internet"",""Air conditioning"",Kitche...",7,1.0,Real Bed,strict,True,...,0.155656,0.0,0.307996,0.085173,0.029052,0.011228,0.091876,0.28773,0.0,0.022127
2,7919400,4.976734,Apartment,Entire home/apt,"{TV,""Cable TV"",""Wireless Internet"",""Air condit...",5,1.0,Real Bed,moderate,True,...,0.100676,0.0,0.413489,0.0,0.079478,0.012563,0.215947,0.105333,0.014889,0.044523
3,13418779,6.620073,House,Entire home/apt,"{TV,""Cable TV"",Internet,""Wireless Internet"",Ki...",4,1.0,Real Bed,flexible,True,...,0.198954,0.024056,0.230167,0.011529,0.025768,0.03258,0.048243,0.416865,0.0,0.0
4,3808709,4.744932,Apartment,Entire home/apt,"{TV,Internet,""Wireless Internet"",""Air conditio...",2,1.0,Real Bed,moderate,True,...,0.292234,0.0,0.226745,0.052011,0.058696,0.061749,0.037009,0.240333,0.02081,0.0


### Classification using topic distribution (assigned to most likely topic)

In [14]:
df['Topic'] = np.argmax(topic_dist, axis = 1)
display(df.head(5))

Unnamed: 0,id,log_price,property_type,room_type,amenities,accommodates,bathrooms,bed_type,cancellation_policy,cleaning_fee,...,DistTopic1,DistTopic2,DistTopic3,DistTopic4,DistTopic5,DistTopic6,DistTopic7,DistTopic8,DistTopic9,Topic
0,6901257,5.010635,Apartment,Entire home/apt,"{""Wireless Internet"",""Air conditioning"",Kitche...",3,1.0,Real Bed,strict,True,...,0.015988,0.246259,0.021739,0.034982,0.034604,0.052115,0.162352,0.013269,0.07853,0
1,6304928,5.129899,Apartment,Entire home/apt,"{""Wireless Internet"",""Air conditioning"",Kitche...",7,1.0,Real Bed,strict,True,...,0.0,0.307996,0.085173,0.029052,0.011228,0.091876,0.28773,0.0,0.022127,2
2,7919400,4.976734,Apartment,Entire home/apt,"{TV,""Cable TV"",""Wireless Internet"",""Air condit...",5,1.0,Real Bed,moderate,True,...,0.0,0.413489,0.0,0.079478,0.012563,0.215947,0.105333,0.014889,0.044523,2
3,13418779,6.620073,House,Entire home/apt,"{TV,""Cable TV"",Internet,""Wireless Internet"",Ki...",4,1.0,Real Bed,flexible,True,...,0.024056,0.230167,0.011529,0.025768,0.03258,0.048243,0.416865,0.0,0.0,7
4,3808709,4.744932,Apartment,Entire home/apt,"{TV,Internet,""Wireless Internet"",""Air conditio...",2,1.0,Real Bed,moderate,True,...,0.0,0.226745,0.052011,0.058696,0.061749,0.037009,0.240333,0.02081,0.0,0


### Save

In [15]:
df.to_csv(filename.split(".")[0] + "_LDA.csv")

### Example

In [16]:
for i in range(nlp.lda_model.num_topics):
    print("-------------\nTopic %i:" % i)
    print(nlp.lda_model.print_topic(i))
    print("5 examples:")
    print(df[df.Topic == i].sort_values('DistTopic'+str(i), axis =0)[text_col].sample(5).values)
    #for k in range(5):
    #    print(df.sort_values('Topic'+str(i),axis = 0, ascending = 0)[col].iloc[5])

-------------
Topic 0:
0.038*"park" + 0.032*"restaurants" + 0.029*"walking" + 0.026*"distance" + 0.022*"great" + 0.022*"restaurant" + 0.020*"neighborhood" + 0.019*"walk" + 0.019*"heart" + 0.018*"bars"
5 examples:
["Spacious 1 bedroom on Bedford Ave.   A 10 minute walk to the JMZ and the L train and one stop from Manhattan.  Located near the best bars, restaurants and cafe's. There is a huge fully stocked kitchen. The living room has a TV with apple TV access. Lots of windows with tons of natural light! Please feel free to email me with any questions. Williamsburg is a great walking neighborhood with access to tons of great shopping, restaurants and bars. 10 minutes walk to the L and JMZ trains.  B62 bus stop is directly in front of the building and will take you straight to the L. This is a walk up building so there is no elevator access."
 "Our home is in Columbia Heights and walkable to a big playground, a shopping center with Target, the 14th Street Corridor for great restaurants, a

['Large newly remodeled Edwardian home on a quiet street in sunny Mission district. Steps from Valencia St\'s trendy restaurants, galleries, & fun shops.  Just a couple blocks to Dolores Park, public transit (BART & Muni) & gourmet groceries. - Large single family classic Edwardian home  - Wood grain moldings, doors, etc.  - wood floors with Turkish & Persian rugs - 4 large, comfortable bedrooms - 2 brand new full bathrooms (as of 2016) - Huge kitchen, fully stocked with cookware, spices & other essentials - Formal dining room - outdoor deck - small yard - Free WIFI  - 60" Flat screen HD TV in Living Room  - deck & small yard - washer & dryer - board games & toys - XBox We will provide a key and instructions and will let you in or will leave key in lockbox. Located in the thriving "Valencia Street Corridor," this area has become one of San Francisco\'s most exciting neighborhoods with many of the city\'s best restaurants, most interesting shops and galleries - and fun live music venues

### Visualization

In [None]:
import pyLDAvis
import pyLDAvis.gensim
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(nlp.lda_model, nlp.corpus, nlp.id2word)
vis