# 1. Analysis of Nashville Reviews


In [1]:
# Loading required libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime
import string #for punctuation
from pycontractions  import Contractions
# import swifter
from emoji import replace_emoji
import re
import pattern
from pattern.en import lemma
from keybert import KeyBERT
import json
from wordcloud import WordCloud

## Reading Reviews

In [2]:
# Quarterly review snapshots, listed in scrape-date order
review_csv_paths = [
    'data/2021-12-17/reviews1.csv',
    'data/2022-03-20/reviews2.csv',
    'data/2022-06-13/reviews3.csv',
    'data/2022-09-15/reviews4.csv',
]
# Read each snapshot (parsing `date` as datetime) and stack them row-wise
# into a single frame with a fresh 0..n-1 index
quarterly_frames = [pd.read_csv(path, parse_dates=['date']) for path in review_csv_paths]
df_reviews = pd.concat(quarterly_frames, axis=0, ignore_index=True)
df_reviews.head()

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments
0,6422,1927,2009-04-30,14100,Melissa,I can't say enough about how wonderful it was ...
1,6422,3867,2009-06-11,17413,Raquel,Michelle and Collier's home is wonderful! They...
2,6422,4159,2009-06-17,20253,Ulrike,I spent one night at Michele's home and felt j...
3,6422,5724,2009-07-18,22544,Phil,Michele and Collier are two of the loveliest p...
4,6422,11891,2009-09-29,33409,Claire,We had the most lovely time staying with Miche...


In [3]:
# Sanity check: (rows, columns) of the concatenated snapshots
loaded_shape = df_reviews.shape
print(f'After loading shape of the dataframe: {loaded_shape}')

After loading shape of the dataframe: (1756645, 6)


In [4]:
# The four snapshots were stacked on top of each other, so identical reviews can
# appear more than once. Rows that agree on listing_id, id, date and reviewer_id
# are collapsed to their last occurrence and the index is renumbered.
duplicate_key = ['listing_id', 'id', 'date', 'reviewer_id']
df_reviews = df_reviews.drop_duplicates(subset=duplicate_key, keep='last', ignore_index=True)

In [5]:
# Sanity check: (rows, columns) left once duplicate rows are gone
dedup_shape = df_reviews.shape
print(f'After removing duplicates shape of the dataframe: {dedup_shape}')

After removing duplicates shape of the dataframe: (549142, 6)


**Pre-processing the review text to clean it**

In [6]:
# Count the missing values in every column of the reviews dataframe
df_reviews.isna().sum()

listing_id         0
id                 0
date               0
reviewer_id        0
reviewer_name      1
comments         201
dtype: int64

In [7]:
# Show the full row(s) where the reviewer name is missing
missing_name = df_reviews['reviewer_name'].isna()
df_reviews[missing_name]

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments
65579,3710914,115989408,2016-11-27,24562860,,this house is amazing. My family really love t...


In [8]:
# Look for other reviews written by the same reviewer_id: a second row for this
# reviewer on another listing might carry the missing name
from_same_reviewer = df_reviews['reviewer_id'].eq(24562860)
df_reviews[from_same_reviewer]

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments
65579,3710914,115989408,2016-11-27,24562860,,this house is amazing. My family really love t...


In [9]:
# Show the reviews whose comment text is missing
missing_comment = df_reviews['comments'].isna()
df_reviews[missing_comment]

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments
936,6276671,741973821,2021-03-24,344950784,Gregory,
1021,8208036,173588705,2017-07-23,110869532,Damian,
1845,10747246,262378718,2018-05-08,17239158,Andrew,
3943,13884573,768146925,2021-06-08,22306297,Joe,
4539,15706425,731712909,2021-02-15,57774661,George,
...,...,...,...,...,...,...
531353,559383276976021385,703642146679840059,2022-08-28,349777746,Athanasios,
533615,590653850968352802,689124790736796620,2022-08-08,423756434,Melissa,
535343,606740002544154300,683290083229480594,2022-07-31,407130766,Addison,
535778,598489759351216400,683343309732917421,2022-07-31,421250044,Shelby,


In [10]:
# Load each quarterly snapshot into its own frame so a single reviewer can be
# traced through the individual files
df_reviews_1, df_reviews_2, df_reviews_3, df_reviews_4 = (
    pd.read_csv(path, parse_dates=['date'])
    for path in (
        'data/2021-12-17/reviews1.csv',
        'data/2022-03-20/reviews2.csv',
        'data/2022-06-13/reviews3.csv',
        'data/2022-09-15/reviews4.csv',
    )
)

In [11]:
# Print, snapshot by snapshot, every row written by reviewer 344950784
# (the first reviewer in the null-comment listing)
for snapshot in (df_reviews_1, df_reviews_2, df_reviews_3, df_reviews_4):
    print(snapshot[snapshot['reviewer_id'] == 344950784])


       listing_id         id       date  reviewer_id reviewer_name comments
49809     6276671  741973821 2021-03-24    344950784       Gregory      NaN
Empty DataFrame
Columns: [listing_id, id, date, reviewer_id, reviewer_name, comments]
Index: []
Empty DataFrame
Columns: [listing_id, id, date, reviewer_id, reviewer_name, comments]
Index: []
Empty DataFrame
Columns: [listing_id, id, date, reviewer_id, reviewer_name, comments]
Index: []


In [12]:
# Inspect one reviewer/listing pair from the null-comment listing in the first
# snapshot, to see what other reviews that guest left for the same listing.
# When repeating this on df_reviews_2..4, wrap each comparison in parentheses:
# `&` binds tighter than `==`.
same_reviewer = df_reviews_1['reviewer_id'] == 110869532
same_listing = df_reviews_1['listing_id'] == 8208036
df_reviews_1[same_reviewer & same_listing]


Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments
69490,8208036,173588705,2017-07-23,110869532,Damian,
69509,8208036,235428746,2018-02-16,110869532,Damian,"Great location next to Vanderbilt, Belmont, an..."


In [13]:
# Rows without any comment text are dropped before the text is processed
df_reviews = df_reviews.dropna(subset=['comments'])
print(f'The dataframe shape after dropping comments null values : {df_reviews.shape}')

The dataframe shape after dropping comments null values : (548941, 6)


In [19]:
# pycontractions model used by the cleaning step to expand contractions.
# Background: https://medium.com/@lukei_3514/dealing-with-contractions-in-nlp-d6174300876b
w2v_path = 'GoogleNews-vectors-negative300.bin'
cont = Contractions(w2v_path)
cont.load_models()

In [22]:
# Pre-processing function applied to every review comment

# Matches the HTML line-break tag in its common spellings: <br>, <br/>, <br />
_BR_TAG = re.compile(r'<br\s*/?>')
# Translation table that deletes every ASCII punctuation character
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)


def clean(text):
    """Normalise one raw review comment.

    Steps, in order: lowercase the text; turn HTML line-break tags into
    spaces; strip emoji; expand contractions with the module-level
    pycontractions model ``cont``; delete ASCII punctuation; lemmatize each
    whitespace-separated token with ``pattern.en.lemma``.

    Parameters
    ----------
    text : str
        Raw comment text.

    Returns
    -------
    str
        The cleaned comment, tokens joined by single spaces.
    """
    # Convert to lowercase
    text = text.lower()

    # Replace line breaks with a space rather than nothing: deleting the tag
    # outright would glue the words on either side of it into one token
    # ("place<br/>would" -> "placewould"). The extra whitespace is collapsed
    # by the split() in the final step.
    text = _BR_TAG.sub(' ', text)

    # Remove emoji
    text = replace_emoji(text, '')

    # Expand contractions; expand_texts works on a batch, so wrap the single
    # text in a list and take the first result
    text = list(cont.expand_texts([text], precise=True))[0]

    # Remove punctuation
    text = text.translate(_PUNCT_TABLE)

    # Lemmatize token by token and re-join with single spaces
    return ' '.join(lemma(word) for word in text.split())

In [26]:
# Run every comment through clean() and store the result in a new column
df_reviews['clean_comments'] = df_reviews['comments'].map(clean)

# Checkpoint: save the frame, cleaned comments included, so it can be recovered later
df_reviews.to_csv('comments.csv', index=False)

In [28]:
# KeyBERT model used to extract keyphrases
kw_model = KeyBERT()
# Convert the dataframe to a list of row dicts first; looping over plain dicts
# is faster than looping over the dataframe
df_dict = df_reviews.to_dict(orient='records')
# Flat list of the keyphrases found across all comments
keyphrases = []
# One output row per review: the original column values plus the keyphrases
full_data_list = []
for row in df_dict:
    # Keyphrases of one or two words for this review, English stop words removed
    k = kw_model.extract_keywords(
        row['clean_comments'],
        keyphrase_ngram_range=(1, 2),
        stop_words='english',
    )
    # The first element of each returned entry is the keyphrase itself
    keyphrases.extend(a[0] for a in k)
    # The full extraction result is stored as a JSON string in the new column
    full_data_list.append([*row.values(), json.dumps(k)])

output_columns = [*df_reviews.columns, 'keyphrases']
df_reviews_keyphrases_data = pd.DataFrame(data=full_data_list, columns=output_columns)
# NOTE: this file is written without a '.csv' extension, unlike 'comments.csv'
df_reviews_keyphrases_data.to_csv('comments_with_keyphrases', index=False)