In [1]:
# Importing the required libaries
import numpy as np
import pandas as pd

# NLP libraries

from nltk.corpus import stopwords 
import string
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import word_tokenize 
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Loading the listings data set

listings_data = pd.read_csv('/home/rreinhaus/code/rreinhaus/listings.csv')

In [6]:
# creating title and superhost dataframe file

superhost_title = listings_data[['id','host_is_superhost', 'name']] 
superhost_title = superhost_title[superhost_title['host_is_superhost'] == 't']

In [9]:
superhost_title.drop(columns=['host_is_superhost'], inplace=True)
superhost_title['title'] = superhost_title['name']
superhost_title.drop(columns=['name'], inplace=True)

In [63]:
# upgrading the stopwords seen manually

additional_words = ['flat', 'stay',
                    'would','london','br',
                    'di', 'la','b',
                   'molto','casa','il', 
                    'un','con','una',
                   'de', 'et','br','la','très',
                   'est','le','und', 'en',
                   'lovely', 'place', 'really','recommend',
                   'host', 'room','apartment','highly recommended',
                    'studio', 'house', 'bedroom', 'notting','hill','home'
                   ]

In [64]:
new_stop = additional_words + stopwords.words('english')

In [65]:
# standard cleaning function
def clean(text):
    for punctuation in string.punctuation:
        text = str(text)
        text = text.replace(punctuation, ' ') # Remove Punctuation
    lowercased = text.lower() # Lower Case
    tokenized = word_tokenize(lowercased) # Tokenize
    words_only = [word for word in tokenized if word.isalpha()] # Remove numbers
    stop_words = set(new_stop) # Make stopword list
    without_stopwords = [word for word in words_only if not word in stop_words] # Remove Stop Words
    lemma=WordNetLemmatizer() # Initiate Lemmatizer
    lemmatized = [lemma.lemmatize(word) for word in without_stopwords] # Lemmatize
    return lemmatized

In [66]:
superhost_title['clean_title']= superhost_title['title'].apply(clean)
superhost_title['clean_title'] = superhost_title['clean_title'].astype('str')

In [71]:
# Creating the model
from sklearn.decomposition import LatentDirichletAllocation

vectorizer = TfidfVectorizer(min_df=0.01, max_df=0.6,ngram_range = (1,1))

data_vectorized = vectorizer.fit_transform(superhost_title['clean_title'])

lda_model = LatentDirichletAllocation(n_components=1)

lda_vectors = lda_model.fit_transform(data_vectorized)

In [72]:
# Checking the result
def print_topics(model, vectorizer):
    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        print([(vectorizer.get_feature_names()[i], topic[i])
                        for i in topic.argsort()[:-10 - 1:-1]])
        

print_topics(lda_model, vectorizer)

Topic 0:
[('double', 711.349302759554), ('bed', 516.514244795802), ('central', 463.4720756516208), ('private', 448.38933604766953), ('garden', 426.5251510990988), ('spacious', 398.4714736644875), ('modern', 376.8614868647855), ('bright', 334.8275231714085), ('near', 312.5965828544845), ('cosy', 305.40396296539996)]


In [73]:
keywords = {}
for idx, topic in enumerate(lda_model.components_):
    for i in topic.argsort()[:-10 - 1:-1]:
        keywords[vectorizer.get_feature_names()[i]] = topic[i]

In [74]:
# Creating the final csv file to deploy on Google Cloud

solution = []

for key in keywords.keys():
    solution.append(key)
    
keywords_final={}
keywords_final['keywords'] = solution
nlp_title = pd.DataFrame(keywords_final) 
title_london = nlp_title.to_csv('title_london.csv')