In [1]:
# Importing the required libaries
import numpy as np
import pandas as pd
# NLP libraries
from nltk.corpus import stopwords 
import string
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import word_tokenize 
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Loading the listings data set

listings_data = pd.read_csv('/home/rreinhaus/code/rreinhaus/listings.csv')

In [4]:
# creating description and superhost dataframe file

superhost_description = listings_data[['id','host_is_superhost', 'description']] 
superhost_description = superhost_description[superhost_description['host_is_superhost'] == 't']
superhost_description.drop(columns=['host_is_superhost'], inplace=True)

In [12]:
# upgrading the stopwords seen manually

additional_words = ['flat', 'stay',
                    'would','london','br',
                    'di', 'la','b',
                   'molto','casa','il', 
                    'un','con','una',
                   'de', 'et','br','la','très',
                   'est','le','und', 'en',
                   'lovely', 'place', 'really','recommend',
                   'host', 'room','apartment','highly recommended',
                    'studio', 'house', 'bedroom', 'notting','hill','home', 'thing','note'
                   ]

In [13]:
new_stop = additional_words + stopwords.words('english')

In [14]:
# standard cleaning function
def clean(text):
    for punctuation in string.punctuation:
        text = str(text)
        text = text.replace(punctuation, ' ') # Remove Punctuation
    lowercased = text.lower() # Lower Case
    tokenized = word_tokenize(lowercased) # Tokenize
    words_only = [word for word in tokenized if word.isalpha()] # Remove numbers
    stop_words = set(new_stop) # Make stopword list
    without_stopwords = [word for word in words_only if not word in stop_words] # Remove Stop Words
    lemma=WordNetLemmatizer() # Initiate Lemmatizer
    lemmatized = [lemma.lemmatize(word) for word in without_stopwords] # Lemmatize
    return lemmatized

In [15]:
superhost_description['clean_description']= superhost_description['description'].apply(clean)
superhost_description['clean_description'] = superhost_description['clean_description'].astype('str')

In [16]:
# Creating the model
from sklearn.decomposition import LatentDirichletAllocation

vectorizer = TfidfVectorizer(min_df=0.01, max_df=0.6,ngram_range = (2,2))

data_vectorized = vectorizer.fit_transform(superhost_description['clean_description'])

lda_model = LatentDirichletAllocation(n_components=1)

lda_vectors = lda_model.fit_transform(data_vectorized)

In [17]:
# Checking the result
def print_topics(model, vectorizer):
    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        print([(vectorizer.get_feature_names()[i], topic[i])
                        for i in topic.argsort()[:-10 - 1:-1]])
        

print_topics(lda_model, vectorizer)

Topic 0:
[('guest access', 500.6374922506565), ('minute walk', 438.59608879823924), ('double bed', 314.3629507036205), ('min walk', 306.3642195818883), ('fully equipped', 258.2390663521639), ('walking distance', 234.17633786158166), ('open plan', 230.99228833634507), ('tube station', 224.29974914117753), ('transport link', 196.74974283900156), ('equipped kitchen', 194.97747302840224)]


In [18]:
keywords = {}
for idx, topic in enumerate(lda_model.components_):
    for i in topic.argsort()[:-10 - 1:-1]:
        keywords[vectorizer.get_feature_names()[i]] = topic[i]

In [20]:
# Creating the final csv file to deploy on Google Cloud

solution = []

for key in keywords.keys():
    solution.append(key)
    
keywords_final={}
keywords_final['keywords'] = solution
nlp_description = pd.DataFrame(keywords_final) 
description_london = nlp_description.to_csv('description_london.csv')