In [1]:
import os
import re
import numpy as np
import pandas as pd
import nltk
import plotly as py
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.graph_objs as go
import plotly.figure_factory as ff
from plotly.figure_factory import create_distplot
import statsmodels.formula.api as smf
from statsmodels.graphics.gofplots import ProbPlot
py.offline.init_notebook_mode(connected=True)
%matplotlib inline






In [2]:
#read in the data
nyrv = pd.read_csv('./Datasets/2018_NY_March_reviews.csv')




In [3]:
nyrv.shape


(896208, 6)

In [4]:
nyrv.head()




Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments
0,2515,198,2008-10-13,2603,Jenny,Stephanie was a wonderful host! Her apartment ...
1,2515,859,2009-03-08,8455,Roland,Such a wonderful place and very close to the m...
2,2515,1083,2009-03-25,9759,Cem,I just got back from a trip to NYC during whic...
3,2515,1107,2009-03-27,9193,Holly,Stephanie's offered all the most important thi...
4,2515,2175,2009-05-09,7048,Alessandra,"Stephanie was really nice, ftiendly and helpfu..."


In [5]:
nyrv.isnull().sum()



listing_id         0
id                 0
date               0
reviewer_id        0
reviewer_name      0
comments         855
dtype: int64

In [6]:
#dropping the rows that don't have comments
nyrv = nyrv[nyrv.comments.notnull()]



In [7]:
nyrv.shape

(895353, 6)

In [8]:
nyrv.columns

Index(['listing_id', 'id', 'date', 'reviewer_id', 'reviewer_name', 'comments'], dtype='object')

In [9]:
nydf = pd.read_csv('./Datasets/2018_NY_March_Cleaned_listings.csv')




Columns (0,15,27,38,43,44,49,51,55,56,61,62,65) have mixed types. Specify dtype option on import or set low_memory=False.



In [10]:
nylisting = nydf.copy()



In [11]:
nylisting.name



0                         Stay at Chez Chic budget room #1
1                       Clean & quiet apt home by the park
2                                    Skylit Midtown Castle
3                       ++ Brooklyn Penthouse Guestroom ++
4                      THE VILLAGE OF HARLEM....NEW YORK !
5                          Cozy Entire Floor of Brownstone
6           Available! Perfect location! Entire apartment!
7                                             Midtown West
8                                          BlissArtsSpace!
9                        Spacious Brooklyn Duplex + Garden
10                                Quiet Slice of Manhattan
11                        Large Furnished Room Near B'way 
12                      Cozy Clean Guest Room - Family Apt
13                      Cute & Cozy Lower East Side 1 bdrm
14                        Beautiful 1br on Upper West Side
15                      Upper West Side-Unique One Bedroom
16                         Central Manhattan/near Broadw

In [12]:
# nylisting['totaldes'] = nylisting.name + ' ' + nylisting.description + ' ' + nylisting.amenities
nylisting['totaldes'] = nylisting.description + ' ' + nylisting.amenities




## Preprocessing for NLP Modelling

In [13]:
nystring = nylisting.copy()



In [14]:
#cleaning the description
nystring.totaldes = nystring.totaldes.astype(str)
nystring.totaldes = nystring.totaldes.map(lambda x: re.sub(r'[^A-Za-z0-9-]', ' ', x))
#drop a row with id in as word string
nystring = nystring.drop([5846])
#drop a row with id as NaN
nystring = nystring.drop(nystring[nystring.id.isnull()].index)
nystring.totaldes[0]




'Step into our artistic spacious apartment and enjoy your artistic Guest room with original artwork from NY artists  Shared with my little family however we often out and you won t see us much  across the street from Central Park - the busy city minutes away but sleeping in quiet at night  -PLEASE BOOK DIRECTLY  NO NEED TO SEND A REQUEST FOR DATES CALENDAR IS UP TO DATE ALL AIRBNB RESERVATIONS WILL BE HONORED Nice  comfortable  and clean private guest room with shared bathroom  2 people max  - full size bed  In very nice apartment on central Park North 4th floor walk-up  same place as Chez chic  2  max capacity of the rooms 2 people   You will share the apt with me and my little family  Daily cleaning in common areas  Located one block from Subway 2 3 B C on 110th street  Bus M1 2 3 4 at the corner  central park across the street   Your room  full size bed  sleeps two   desk  Digital Tv DVD  wifi internet  A C  closet and desk  Sheets Towels provided  Iron air dryer provided  Separate 

In [35]:
from nltk.corpus import wordnet
#filled in nans for the listings that are not in English
nystring.totaldes = [i if not wordnet.synsets(i) else np.nan for i in nystring.totaldes]
#only select the not null rows
nystring = nystring[nystring.totaldes.notnull()]

In [42]:
#cleaning the id column
nystring.id = nystring.id.map(lambda x: str(int(x)))
nystring.id = nystring.id.map(lambda x: re.sub(r'\..*', '', x))




In [43]:
#tokenize the words
nystring.totaldes = nystring.totaldes.map(lambda x: nltk.tokenize.word_tokenize(x))




In [44]:
#import stopwords corpus
from nltk.corpus import stopwords
wordfilter = stopwords.words('english')





In [45]:
#filtering the stop words out
def stopfilter(wordlist):
    return [word for word in wordlist if word.lower() not in wordfilter]

nystring.totaldes = nystring.totaldes.map(lambda x: stopfilter(x))





In [46]:
from nltk.stem.wordnet import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

# from nltk.stem import PorterStemmer
# stemmer = PorterStemmer()





In [47]:
# writing a function for lemmatizing
def lemmatizetext(wordlist):
    return ' '.join([lemmatizer.lemmatize(word) for word in wordlist])

# def stemtext(wordlist):
#     return ' '.join([stemmer.stem(word) for word in wordlist])


#lemmatizing the docs
nystring.totaldes = nystring.totaldes.map(lambda x: lemmatizetext(x))






In [48]:
#selecting the columns needed
wantedlist = ['id', 'totaldes']
nydoc = nystring[wantedlist]
nydoc = nydoc[nydoc.totaldes != 'nan']
nydoc.totaldes = nydoc.totaldes.map(lambda x: x.lower())
nydoc



Unnamed: 0,id,totaldes
0,2515,step artistic spacious apartment enjoy artisti...
1,2539,renovated apt home elevator building spacious ...
2,2595,find romantic getaway beautiful spacious skyli...
3,3330,spacious clean furnished master bedroom privat...
4,3647,welcome international urban community spacious...
5,3831,urban retreat enjoy 500 f floor 1899 brownston...
6,4989,heart message availability quiet cool quirky 1...
7,5054,perfect location manhattan midtown west hell k...
8,5121,hello everyone thanks visiting bliss art space...
9,5136,welcome stay lovely 2 br baby room duplex sout...


## Build a Doc2vec Model Using Gensim

In [49]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.utils import simple_preprocess
import multiprocessing



In [50]:
#make an additional copy for modelling purposes
nytrain = nydoc.copy()



In [51]:
#train test split
np.random.seed(0)
train_index = np.random.choice(nytrain.index, round(len(nytrain.totaldes)*0.9), replace = False)
test_index = np.array([i for i in nytrain.index if i not in train_index])
traindoc = nytrain.loc[train_index]
testdoc = nytrain.loc[test_index]



In [52]:
# reading from the data frame

class MyDocs(object):
    def __iter__(self):
        for i in range(traindoc.shape[0]):
            yield TaggedDocument(words=simple_preprocess(traindoc.iloc[i,1]), tags=[traindoc.iloc[i,0]])





In [53]:
%%time

cores = multiprocessing.cpu_count()


if not os.path.exists('./models/listingdoc2vec14.model'):
    print("start traing doc2vec model...")
    
    
    documents = MyDocs()
    doc2vec_model = Doc2Vec(dm=1, dbow_words=1, vector_size=200, window=8, min_count=20, workers=cores)
    doc2vec_model.build_vocab(documents)
    doc2vec_model.train(documents, total_examples=doc2vec_model.corpus_count, epochs=doc2vec_model.epochs)
    
    
    if not os.path.exists('models'):
        os.makedirs('models')
        doc2vec_model.save('./models/listingdoc2vec14.model')
    else:
        doc2vec_model.save('./models/listingdoc2vec14.model')
else:
    doc2vec_model = Doc2Vec.load('./models/listingdoc2vec14.model')





start traing doc2vec model...
CPU times: user 1min 44s, sys: 3.78 s, total: 1min 48s
Wall time: 1min 22s


In [54]:
#search function for key works to simulate as userinput

def search(word):
    deslist = []
    for i in nytrain.totaldes:
        if word in i:
            deslist.append(i)
    return len(deslist)





In [55]:
search('pet')


10152

In [65]:
#build up userinput from search results
# userinput1 = 'Pet Allowed'
# userinput2 = 'Private Room'
# userinput3 = 'close to transit'
# userinput4 = 'close to bars'
# userinput5 = 'close to restaurant'
# userinputstr = userinput1 + ' ' + userinput2 + ' ' + userinput3 + ' ' + userinput4 + ' ' + userinput5


userinputstr = 'museum art shopping venue close subway'
userinput = userinputstr.lower().split()
userinput






['museum', 'art', 'shopping', 'venue', 'close', 'subway']

In [66]:
'''infer_vector(doc_words, alpha=0.1, min_alpha=0.0001, steps=5)
Infer a vector for given post-bulk training document.

Parameters:	
doc_words – Document should be a list of (word) tokens.
alpha (float) – The initial learning rate.
min_alpha (float) – Learning rate will linearly drop to min_alpha as training progresses.
steps (int) – Number of times to train the new document.
Returns:	
Returns the inferred vector for the new document.

Return type:	
obj: numpy.ndarray'''

userinfervec = doc2vec_model.infer_vector(userinput, alpha = 0.025, steps = 20)
userinfervec






array([-0.04282821,  0.01399806, -0.00942801, -0.00960673, -0.01244333,
        0.02223339,  0.01278641, -0.03229283,  0.00403872,  0.05350864,
        0.01362791,  0.00715162,  0.00935183, -0.01664816,  0.02310275,
        0.00178177, -0.02860916, -0.00423063, -0.01336904,  0.03966158,
       -0.02208095, -0.00685683,  0.02650885,  0.02742096,  0.01607043,
       -0.00807034, -0.00040977, -0.01848562,  0.01379892,  0.00327406,
        0.00871912, -0.03677406, -0.0248527 , -0.01171209, -0.00326014,
        0.02581477, -0.03683186, -0.00367591,  0.03449985, -0.01427505,
       -0.02490515,  0.01226493,  0.00270831,  0.01142488, -0.04886009,
        0.00295733, -0.01044742, -0.01349846, -0.01341922,  0.01021059,
        0.013609  ,  0.01168783, -0.01467723, -0.02479945,  0.02145223,
       -0.04345706,  0.01923957,  0.02485066,  0.00987666,  0.03066565,
       -0.01729358,  0.01463003,  0.01771353, -0.03279404, -0.03806274,
        0.00259941,  0.01870586,  0.02625408, -0.03308732,  0.01

In [69]:
simresult = doc2vec_model.docvecs.most_similar([userinfervec], topn=200)
simresult




[('15445777', 0.9151263236999512),
 ('15266254', 0.9098832607269287),
 ('13007629', 0.9072914123535156),
 ('3737373', 0.9069225788116455),
 ('23184876', 0.9025716781616211),
 ('4733873', 0.9010303616523743),
 ('19283022', 0.9009561538696289),
 ('10102206', 0.9007312059402466),
 ('8323765', 0.899396538734436),
 ('16798531', 0.8987194299697876),
 ('10881719', 0.8986095190048218),
 ('7809878', 0.8979320526123047),
 ('9193694', 0.8957302570343018),
 ('20227728', 0.8939012289047241),
 ('23358432', 0.8906205892562866),
 ('21771395', 0.8904698491096497),
 ('20167771', 0.88914954662323),
 ('21633235', 0.8887969851493835),
 ('19825578', 0.8886778354644775),
 ('10019335', 0.8883532285690308),
 ('21836849', 0.8877915143966675),
 ('20556267', 0.8865605592727661),
 ('18194759', 0.8861331343650818),
 ('18858192', 0.8857489824295044),
 ('7696843', 0.885594367980957),
 ('1856514', 0.8849499821662903),
 ('16455521', 0.8844437599182129),
 ('14738146', 0.8842806816101074),
 ('21886461', 0.884168267250061

In [70]:
#Show results based on tags - listing ids
result = pd.DataFrame()

for i in simresult:
    result = result.append(nystring[nystring.id == i[0]])

[(i,k) for i,k in enumerate(result.totaldes)]




[(0,
  'Midtown Studio close United Nations Turtle bay TV Wifi Air conditioning Kitchen Heating Family kid friendly Smoke detector Carbon monoxide detector Essentials Lock bedroom door Hangers Hair dryer Iron'),
 (1,
  'place good couple solo adventurer business traveler TV Wifi Air conditioning Kitchen Gym Elevator Heating Family kid friendly Essentials Hangers Hair dryer Iron'),
 (2,
  'place close Joe Shanghai love place location place good solo adventurer TV Internet Wifi Air conditioning Kitchen Heating Washer Dryer Smoke detector Essentials Shampoo Lock bedroom door Iron Laptop friendly workspace translation missing en hosting amenity 49 translation missing en hosting amenity 50'),
 (3,
  '2BDs recently renovated loft Website hidden Airbnb located heart Williamsburg coolest neighborhood Website hidden Airbnb quiet street plenty parking 1 block away L train 1 stop East Village Wifi Air conditioning Kitchen Pets allowed Buzzer wireless intercom Heating Family kid friendly Washer Dr

In [61]:
# #outputing the docvecs
# x = np.zeros((48852,200))
# for i in range(48852):
#     x[i,:] = doc2vec_model.docvecs[i]

# docvecdf = pd.DataFrame(x)
# docvecdf.to_csv('./Datasets/docvecdf.csv', index = False)



## Assessing Model

To assess our new model, we'll first infer new vectors for each document of the training corpus, compare the inferred vectors with the training corpus, and then returning the rank of the document based on self-similarity. Basically, we're pretending as if the training corpus is some new unseen data and then seeing how they compare with the trained model. The expectation is that we've likely overfit our model (i.e., all of the ranks will be less than 2) and so we should be able to find similar documents very easily. Additionally, we'll keep track of the second ranks for a comparison of less similar documents. 

In [62]:
def read_corpus(file, tokens_only=False):
    for i in range(file.shape[0]):
        if tokens_only:
            yield simple_preprocess(file.iloc[i,1])
        else:
            # For training data, add tags
            yield TaggedDocument(simple_preprocess(file.iloc[i,1]), file.iloc[i,0])



In [63]:
train_corpus = list(read_corpus(traindoc))
test_corpus = list(read_corpus(testdoc, tokens_only = True))



In [64]:
ranks = []
second_ranks = []
for doc_id in range(len(train_corpus)):
    inferred_vector = doc2vec_model.infer_vector(train_corpus[doc_id].words)
    sims = doc2vec_model.docvecs.most_similar([inferred_vector], topn=len(doc2vec_model.docvecs))
    rank = [docid for docid, sim in sims].index(doc_id)
    ranks.append(rank)
    
    second_ranks.append(sims[1])




ValueError: 0 is not in list

In [None]:
for doc_id in range(len(train_corpus)):
    print(doc_id)

In [None]:
doc_id.index

In [None]:
[docid for docid, sim in sims].index(doc_id)


Let's count how each document ranks with respect to the training corpus 

In [None]:
collections.Counter(ranks)  # Results vary due to random seeding and very small corpus

Basically, greater than 95% of the inferred documents are found to be most similar to itself and about 5% of the time it is mistakenly most similar to another document. the checking of an inferred-vector against a training-vector is a sort of 'sanity check' as to whether the model is behaving in a usefully consistent manner, though not a real 'accuracy' value.

This is great and not entirely surprising. We can take a look at an example:

In [None]:
print('Document ({}): «{}»\n'.format(doc_id, ' '.join(train_corpus[doc_id].words)))
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model)
for label, index in [('MOST', 0), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1)]:
    print(u'%s %s: «%s»\n' % (label, sims[index], ' '.join(train_corpus[sims[index][0]].words)))
    


Notice above that the most similar document is has a similarity score of ~80% (or higher). However, the similarity score for the second ranked documents should be significantly lower (assuming the documents are in fact different) and the reasoning becomes obvious when we examine the text itself

In [None]:
# Pick a random document from the test corpus and infer a vector from the model
doc_id = random.randint(0, len(train_corpus) - 1)

# Compare and print the most/median/least similar documents from the train corpus
print('Train Document ({}): «{}»\n'.format(doc_id, ' '.join(train_corpus[doc_id].words)))
sim_id = second_ranks[doc_id]
print('Similar Document {}: «{}»\n'.format(sim_id, ' '.join(train_corpus[sim_id[0]].words)))




## Testing the Model

Using the same approach above, we'll infer the vector for a randomly chosen test document, and compare the document to our model by eye.

In [None]:
# Pick a random document from the test corpus and infer a vector from the model
doc_id = random.randint(0, len(test_corpus) - 1)
inferred_vector = model.infer_vector(test_corpus[doc_id])
sims = model.docvecs.most_similar([inferred_vector], topn=len(model.docvecs))

# Compare and print the most/median/least similar documents from the train corpus
print('Test Document ({}): «{}»\n'.format(doc_id, ' '.join(test_corpus[doc_id])))
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model)
for label, index in [('MOST', 0), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1)]:
    print(u'%s %s: «%s»\n' % (label, sims[index], ' '.join(train_corpus[sims[index][0]].words)))

    

