In [1]:
# Importing the required libaries
import numpy as np
import pandas as pd

# NLP libraries
from nltk.corpus import stopwords 
import string
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import word_tokenize 
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Loading the listings data set

listings_data = pd.read_csv('/home/rreinhaus/code/rreinhaus/listings.csv')

In [44]:
listings_data['neighborhood_overview']

0        Finsbury Park is a friendly melting pot commun...
1                                           It is Chelsea.
2        Location, location, location! You won't find b...
3        Fulham is 'villagey' and residential – a real ...
4        Barnet is one of the largest boroughs in Londo...
                               ...                        
66636                                                  NaN
66637                                                  NaN
66638                                                  NaN
66639    A two-minute walk from Notting Hill Gate stati...
66640                                                  NaN
Name: neighborhood_overview, Length: 66641, dtype: object

In [7]:
# creating description and superhost dataframe file

superhost_description = listings_data[['id','host_is_superhost', 'description', 'neighbourhood_cleansed']] 
superhost_description = superhost_description[superhost_description['host_is_superhost'] == 't']
superhost_description.drop(columns=['host_is_superhost'], inplace=True)

In [13]:
regions = []

for neighbourhood in superhost_description.neighbourhood_cleansed.unique():
    regions.append(neighbourhood)

    
df_regions = {}
for index, region in enumerate(regions):
    df_regions[region] = superhost_description[superhost_description['neighbourhood_cleansed'] == region]

{'Haringey':              id                                        description  \
 7         36660  <b>The space</b><br />This room is located on ...   
 71        84532  our flat is overlooking the river Lea and it's...   
 256      314066  Hi everyone,<br />I am Talat and I live with m...   
 327      362026  Double room in charming Edwardian house built ...   
 429      401330  I provide accommodation in a cosy and quiet fl...   
 ...         ...                                                ...   
 63790  52606076  Spacious 3 bed 2 bath flat in London, on Victo...   
 65214  53179164  We offer our lovely ground flat while we go on...   
 65322  53224833  Get comfortable and enjoy plenty of extra spac...   
 65426  53258563  A trendy, clean and spacious ground floor flat...   
 66638  53711668  A mix of modern and antique decor with 2 bed b...   
 
       neighbourhood_cleansed  
 7                   Haringey  
 71                  Haringey  
 256                 Haringey  
 327  

In [14]:
len(regions)

33

In [15]:
len(df_regions)

33

In [16]:
# upgrading the stopwords seen manually

additional_words = ['flat', 'stay',
                    'would','london','br',
                    'di', 'la','b',
                   'molto','casa','il', 
                    'un','con','una',
                   'de', 'et','br','la','très',
                   'est','le','und', 'en',
                   'lovely', 'place', 'really','recommend',
                   'host', 'room','apartment','highly recommended',
                    'studio', 'house', 'bedroom', 'notting','hill','home', 'thing','note'
                   ]

In [17]:
new_stop = additional_words + stopwords.words('english')

In [18]:
# standard cleaning function
def clean(text):
    for punctuation in string.punctuation:
        text = str(text)
        text = text.replace(punctuation, ' ') # Remove Punctuation
    lowercased = text.lower() # Lower Case
    tokenized = word_tokenize(lowercased) # Tokenize
    words_only = [word for word in tokenized if word.isalpha()] # Remove numbers
    stop_words = set(new_stop) # Make stopword list
    without_stopwords = [word for word in words_only if not word in stop_words] # Remove Stop Words
    lemma=WordNetLemmatizer() # Initiate Lemmatizer
    lemmatized = [lemma.lemmatize(word) for word in without_stopwords] # Lemmatize
    return lemmatized

In [37]:
for df in df_regions.items():
    df[1]['clean_description']= df[1]['description'].apply(clean)
    df[1]['clean_description']= df[1]['clean_description'].astype('str')
   
    # Creating the model
    from sklearn.decomposition import LatentDirichletAllocation

    vectorizer = TfidfVectorizer(min_df=0.01, max_df=0.6,ngram_range = (2,2))

    data_vectorized = vectorizer.fit_transform(df[1]['clean_description'])

    lda_model = LatentDirichletAllocation(n_components=1)

    lda_vectors = lda_model.fit_transform(data_vectorized)
    
    keywords = {}
    for idx, topic in enumerate(lda_model.components_):
        for i in topic.argsort()[:-10 - 1:-1]:
            keywords[vectorizer.get_feature_names()[i]] = topic[i]
    
    # Creating the final csv file to deploy on Google Cloud

    solution = []

    for key in keywords.keys():
        solution.append(key)

    keywords_final={}
    keywords_final['keywords'] = solution
    nlp_description = pd.DataFrame(keywords_final) 
    description_london = nlp_description.to_csv(f'{df[0]}.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[1]['clean_description']= df[1]['description'].apply(clean)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[1]['clean_description']= df[1]['clean_description'].astype('str')


# Trial Versions

In [15]:
superhost_description['clean_description']= superhost_description['description'].apply(clean)
superhost_description['clean_description'] = superhost_description['clean_description'].astype('str')

In [16]:
# Creating the model
from sklearn.decomposition import LatentDirichletAllocation

vectorizer = TfidfVectorizer(min_df=0.01, max_df=0.6,ngram_range = (2,2))

data_vectorized = vectorizer.fit_transform(superhost_description['clean_description'])

lda_model = LatentDirichletAllocation(n_components=1)

lda_vectors = lda_model.fit_transform(data_vectorized)

In [17]:
# Checking the result
def print_topics(model, vectorizer):
    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        print([(vectorizer.get_feature_names()[i], topic[i])
                        for i in topic.argsort()[:-10 - 1:-1]])
        

print_topics(lda_model, vectorizer)

Topic 0:
[('guest access', 500.6374922506565), ('minute walk', 438.59608879823924), ('double bed', 314.3629507036205), ('min walk', 306.3642195818883), ('fully equipped', 258.2390663521639), ('walking distance', 234.17633786158166), ('open plan', 230.99228833634507), ('tube station', 224.29974914117753), ('transport link', 196.74974283900156), ('equipped kitchen', 194.97747302840224)]


In [18]:
keywords = {}
for idx, topic in enumerate(lda_model.components_):
    for i in topic.argsort()[:-10 - 1:-1]:
        keywords[vectorizer.get_feature_names()[i]] = topic[i]

In [20]:
# Creating the final csv file to deploy on Google Cloud

solution = []

for key in keywords.keys():
    solution.append(key)
    
keywords_final={}
keywords_final['keywords'] = solution
nlp_description = pd.DataFrame(keywords_final) 
description_london = nlp_description.to_csv('description_london.csv')

In [38]:
data_sample = pd.read_csv('Camden.csv')
data_sample.columns

Index(['keywords'], dtype='object')

In [39]:
data_sample

Unnamed: 0,keywords
0,guest access
1,minute walk
2,fully equipped
3,double bed
4,camden town
5,covent garden
6,walking distance
7,hampstead heath
8,king cross
9,tube station


In [32]:
data_sample.drop(columns='Unnamed: 0', inplace=True)

In [33]:
data_sample

Unnamed: 0,keywords
0,guest access
1,minute walk
2,fully equipped
3,double bed
4,camden town
5,covent garden
6,walking distance
7,hampstead heath
8,king cross
9,tube station
