# Comment Keywords Per Neighbourhood

In [13]:
# Importing the required libaries
import numpy as np
import pandas as pd
import seaborn as sns

# Getting all the data
data = pd.read_csv('/home/rreinhaus/code/rreinhaus/reviews.csv')

# Loading the listings data set

listings_data = pd.read_csv('/home/rreinhaus/code/rreinhaus/listings.csv')

In [14]:
# removing all the uncessary columns

smaller_list= listings_data[['id','host_id','host_is_superhost', 'neighbourhood_cleansed', 'number_of_reviews', 'review_scores_rating']].copy()

In [15]:
# Creating a superhost data set with at least one review

superhost_data = smaller_list[(smaller_list['number_of_reviews'] > 1) & (smaller_list['host_is_superhost'] == 't')].copy()

In [16]:
# Filtering the superhosts with at least 4.5 rating

superhost_data = superhost_data[['id','review_scores_rating','neighbourhood_cleansed']][superhost_data['review_scores_rating'] > 4.5]
superhost_data['listing_id'] = superhost_data['id']
superhost_data.drop(columns=['id'], inplace=True)

In [17]:
# making reviews dataset smaller
small_data = data[['listing_id', 'id','comments']]

In [18]:
# merging the superhost dataset with review dataset

superhost_reviews = superhost_data.merge(small_data, on='listing_id')

In [19]:
superhost_reviews

Unnamed: 0,review_scores_rating,neighbourhood_cleansed,listing_id,id,comments
0,4.85,Haringey,36660,96819,I stayed at Agri's B&B for 4 nights and it was...
1,4.85,Haringey,36660,102920,"Great b/b,great hospitality,felt like we were ..."
2,4.85,Haringey,36660,106485,We absolutely loved our time with Roger and Ag...
3,4.85,Haringey,36660,112474,"Wonderful experience, great B&B to stay at. Ag..."
4,4.85,Haringey,36660,116159,One of the best travel experience with Airbnb ...
...,...,...,...,...,...
377694,4.67,Southwark,53531883,506353018354715555,Fantastic host
377695,4.67,Southwark,53531883,508497075256836787,"Miyo's place was clean, easy to find, and in a..."
377696,4.67,Southwark,53531883,510665342365647380,"Great place! Very nice host, super clean room ..."
377697,5.00,Hackney,53570165,509962011624850771,Erica's flat is the perfect place to stay if y...


In [20]:
# NLP libraries
from nltk.corpus import stopwords 
import string
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import word_tokenize 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline

In [21]:
# upgrading the stopwords seen manually

additional_words = ['great', 'stay',
                    'would','london','br',
                    'di', 'la','b',
                   'molto','casa','il', 
                    'un','con','una',
                   'de', 'et','br','la','très',
                   'est','le','und', 'en',
                   'lovely', 'place', 'really','recommend',
                   'host', 'room','apartment','highly recommended',
                   'everything perfect','everything','recommend','need', 'perfect', 'highly','thanks', 'thank you','next','time'
                   ]

In [22]:
# creating ultimate stop words with mutiple languages 
    
ultimate_stop = stopwords.words('english') + additional_words+ stopwords.words('french') + stopwords.words('german') + stopwords.words('italian') + stopwords.words('spanish')

In [23]:
# cleaning the text data

def clean(text):
    for punctuation in string.punctuation:
        text = str(text)
        text = text.replace(punctuation, ' ') # Remove Punctuation
    lowercased = text.lower() # Lower Case
    tokenized = word_tokenize(lowercased) # Tokenize
    words_only = [word for word in tokenized if word.isalpha()] # Remove numbers
    stop_words = set(ultimate_stop) # Make stopword list
    without_stopwords = [word for word in words_only if not word in stop_words] # Remove Stop Words
    lemma=WordNetLemmatizer() # Initiate Lemmatizer
    lemmatized = [lemma.lemmatize(word) for word in without_stopwords] # Lemmatize
    return lemmatized

In [24]:
regions = []

for neighbourhood in superhost_reviews.neighbourhood_cleansed.unique():
    regions.append(neighbourhood)
    
df_regions = {}
for index, region in enumerate(regions):
    df_regions[region] = superhost_reviews[superhost_reviews['neighbourhood_cleansed'] == region]

In [31]:
len(df_regions)

33

In [32]:
for df in df_regions.items():
    df[1]['clean_comments']= df[1]['comments'].apply(clean)
    df[1]['clean_comments']= df[1]['clean_comments'].astype('str')
   
    # Creating the model
    from sklearn.decomposition import LatentDirichletAllocation

    vectorizer = TfidfVectorizer(min_df=0.01, max_df=0.6,ngram_range = (2,2))

    data_vectorized = vectorizer.fit_transform(df[1]['clean_comments'])

    lda_model = LatentDirichletAllocation(n_components=1)

    lda_vectors = lda_model.fit_transform(data_vectorized)
    
    keywords = {}
    for idx, topic in enumerate(lda_model.components_):
        for i in topic.argsort()[:-10 - 1:-1]:
            keywords[vectorizer.get_feature_names()[i]] = topic[i]
    
    # Creating the final csv file to deploy on Google Cloud

    solution = []

    for key in keywords.keys():
        solution.append(key)

    keywords_final={}
    keywords_final['keywords'] = solution
    nlp_description = pd.DataFrame(keywords_final) 
    description_london = nlp_description.to_csv(f'{df[0]}_comments.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[1]['clean_comments']= df[1]['comments'].apply(clean)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[1]['clean_comments']= df[1]['clean_comments'].astype('str')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[1]['clean_comments']= df[1]['comments'].apply(clean)
A value is trying to be set on

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[1]['clean_comments']= df[1]['comments'].apply(clean)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[1]['clean_comments']= df[1]['clean_comments'].astype('str')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[1]['clean_comments']= df[1]['comments'].apply(clean)
A value is trying to be set on

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[1]['clean_comments']= df[1]['comments'].apply(clean)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[1]['clean_comments']= df[1]['clean_comments'].astype('str')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[1]['clean_comments']= df[1]['comments'].apply(clean)
A value is trying to be set on

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[1]['clean_comments']= df[1]['comments'].apply(clean)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[1]['clean_comments']= df[1]['clean_comments'].astype('str')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[1]['clean_comments']= df[1]['comments'].apply(clean)
A value is trying to be set on

In [38]:
data_sample = pd.read_csv('comments_keywords/Camden_comments.csv')
data_sample

Unnamed: 0,keywords
0,tube station
1,minute walk
2,walking distance
3,king cross
4,good location
5,camden town
6,clean comfortable
7,well located
8,well equipped
9,bed comfortable


# Neighbourhood Overview Keywords

In [39]:
# creating description and superhost dataframe file

superhost_neighbourhood = listings_data[['id','host_is_superhost', 'neighborhood_overview', 'neighbourhood_cleansed']] 
superhost_neighbourhood = superhost_neighbourhood[superhost_neighbourhood['host_is_superhost'] == 't']
superhost_neighbourhood.drop(columns=['host_is_superhost'], inplace=True)

In [40]:
regions = []

for neighbourhood in superhost_neighbourhood.neighbourhood_cleansed.unique():
    regions.append(neighbourhood)

    
df_regions = {}
for index, region in enumerate(regions):
    df_regions[region] = superhost_neighbourhood[superhost_neighbourhood['neighbourhood_cleansed'] == region]

In [41]:
for df in df_regions.items():
    df[1]['clean_neighborhood_overview']= df[1]['neighborhood_overview'].apply(clean)
    df[1]['clean_neighborhood_overview']= df[1]['clean_neighborhood_overview'].astype('str')
   
    # Creating the model
    from sklearn.decomposition import LatentDirichletAllocation

    vectorizer = TfidfVectorizer(min_df=0.01, max_df=0.6,ngram_range = (2,2))

    data_vectorized = vectorizer.fit_transform(df[1]['clean_neighborhood_overview'])

    lda_model = LatentDirichletAllocation(n_components=1)

    lda_vectors = lda_model.fit_transform(data_vectorized)
    
    keywords = {}
    for idx, topic in enumerate(lda_model.components_):
        for i in topic.argsort()[:-10 - 1:-1]:
            keywords[vectorizer.get_feature_names()[i]] = topic[i]
    
    # Creating the final csv file to deploy on Google Cloud

    solution = []

    for key in keywords.keys():
        solution.append(key)

    keywords_final={}
    keywords_final['keywords'] = solution
    nlp_description = pd.DataFrame(keywords_final) 
    description_london = nlp_description.to_csv(f'{df[0]}_neighbourhood.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[1]['clean_neighborhood_overview']= df[1]['neighborhood_overview'].apply(clean)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[1]['clean_neighborhood_overview']= df[1]['clean_neighborhood_overview'].astype('str')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[1]['clean_neighborhood_overview

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[1]['clean_neighborhood_overview']= df[1]['neighborhood_overview'].apply(clean)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[1]['clean_neighborhood_overview']= df[1]['clean_neighborhood_overview'].astype('str')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[1]['clean_neighborhood_overview

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[1]['clean_neighborhood_overview']= df[1]['neighborhood_overview'].apply(clean)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[1]['clean_neighborhood_overview']= df[1]['clean_neighborhood_overview'].astype('str')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[1]['clean_neighborhood_overview

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[1]['clean_neighborhood_overview']= df[1]['neighborhood_overview'].apply(clean)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[1]['clean_neighborhood_overview']= df[1]['clean_neighborhood_overview'].astype('str')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[1]['clean_neighborhood_overview