In [1]:
import sys, os
# Only run the Drive setup when the notebook is executing inside Google Colab.
if 'google.colab' in sys.modules:
    # Mount Google Drive so the project files are reachable from the Colab VM.
    from google.colab import drive
    drive.mount('/content/gdrive')
    # Project folder on Drive; later cells read CSV files via paths relative to it.
    path_to_file = '/content/gdrive/My Drive/BT5153_Project'
    print(path_to_file)
    # Make the project folder the current working directory.
    os.chdir(path_to_file)
    # Shell escape: print the working directory to confirm the chdir took effect.
    !pwd

Mounted at /content/gdrive
/content/gdrive/My Drive/BT5153_Project
/content/gdrive/My Drive/BT5153_Project


In [70]:
# Install the third-party packages imported in the next cell (texthero, langdetect, pandarallel).
# NOTE(review): versions are unpinned; the recorded run resolved pandarallel to 1.6.1 —
# consider pinning all three for reproducibility.
!pip install texthero
!pip install langdetect
!pip install pandarallel

Collecting pandarallel
  Downloading pandarallel-1.6.1.tar.gz (12 kB)
Building wheels for collected packages: pandarallel
  Building wheel for pandarallel (setup.py) ... [?25l[?25hdone
  Created wheel for pandarallel: filename=pandarallel-1.6.1-py3-none-any.whl size=16265 sha256=672f440827aa0246427bae2dd0c7368a786fd9a0dfd806c925ad699883b55b12
  Stored in directory: /root/.cache/pip/wheels/e8/81/8f/29aafd7d671a4e9db4e976d71728870173f0b3f48ccba0bc32
Successfully built pandarallel
Installing collected packages: pandarallel
Successfully installed pandarallel-1.6.1


In [71]:
#Import Packages
import warnings

import nltk
import numpy as np
import pandas as pd
import texthero as hero
from langdetect import DetectorFactory, detect
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from pandarallel import pandarallel
from sklearn.feature_extraction.text import CountVectorizer
# Silence every Python warning for the rest of the session.
# NOTE(review): this blanket filter also hides pandas warnings raised by later cells —
# consider a narrower filter.
warnings.filterwarnings("ignore")

# Download the NLTK resources used below: the VADER lexicon for SentimentIntensityAnalyzer
# and the WordNet data (wordnet, omw-1.4) for WordNetLemmatizer.
nltk.download('vader_lexicon')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [4]:
#Load the review export; the path is relative to the current working directory
review_exp = pd.read_csv("data/reviews_exp.csv")

In [5]:
#Store the word count of each comment: split on whitespace, then count the resulting tokens
#(the recorded output shows the lengths as floats, e.g. 149.0)
review_exp['comments_len'] = review_exp['comments'].str.split().str.len()

In [6]:
#Keep only comments with more than 5 words (comments of exactly 5 words are excluded too)
#NOTE(review): no language filtering happens in this cell; languages are detected further down
review_exp = review_exp.loc[review_exp['comments_len'] > 5]

In [7]:
#Remove duplicates, if any: rows are duplicates when they share id, listing_id, reviewer_id and comments_len
review_exp = review_exp.drop_duplicates(subset = ['id', 'listing_id', 'reviewer_id', 'comments_len'])

In [8]:
#Summarise how many reviews each reviewer has written
#The mean is about 1.16 and the 75th percentile is 1, so reviewers with more than 2 reviews are treated as key reviewers
review_exp.groupby(by=['reviewer_id']).count().sort_values(by='listing_id', ascending=False).describe()

Unnamed: 0,listing_id,id,date,reviewer_name,comments,comments_len
count,832653.0,832653.0,832653.0,832653.0,832653.0,832653.0
mean,1.16369,1.16369,1.16369,1.16369,1.16369,1.16369
std,0.755315,0.755315,0.755315,0.755315,0.755315,0.755315
min,1.0,1.0,1.0,1.0,1.0,1.0
25%,1.0,1.0,1.0,1.0,1.0,1.0
50%,1.0,1.0,1.0,1.0,1.0,1.0
75%,1.0,1.0,1.0,1.0,1.0,1.0
max,92.0,92.0,92.0,92.0,92.0,92.0


In [9]:
#Count rows per reviewer (most active first), then keep the ids of reviewers with more than 2 reviews as an array
review_grouped = (
    review_exp
    .groupby(by=['reviewer_id'])
    .count()
    .sort_values(by='listing_id', ascending=False)
)
# reviewer_id is the index of the grouped frame, so the matching index labels are the ids
key_reviewer = review_grouped.loc[review_grouped['listing_id'] > 2].index.values

In [10]:
#Flag each review with 1 when its author is a key reviewer, 0 otherwise
review_exp['key_reviewer'] = review_exp['reviewer_id'].isin(key_reviewer).astype(int)

In [11]:
# Display the filtered reviews, now including the comments_len and key_reviewer columns
review_exp

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments,comments_len,key_reviewer
0,13913,80770,2010-08-18,177109,Michael,My girlfriend and I hadn't known Alina before ...,149.0,0
1,13913,367568,2011-07-11,19835707,Mathias,Alina was a really good host. The flat is clea...,32.0,0
2,13913,529579,2011-09-13,1110304,Kristin,Alina is an amazing host. She made me feel rig...,68.0,0
3,13913,595481,2011-10-03,1216358,Camilla,"Alina's place is so nice, the room is big and ...",86.0,0
4,13913,612947,2011-10-09,490840,Jorik,"Nice location in Islington area, good for shor...",17.0,0
...,...,...,...,...,...,...,...,...
1042999,53622933,510698532655225551,2021-12-05,47886779,Shameel,Gregory is an absolutely amazing host! He went...,71.0,0
1043000,53629457,509962566515134799,2021-12-04,322726852,Stella,Those considering the aptm as a last minute bo...,74.0,1
1043001,53656459,511460888308184263,2021-12-06,3765545,Harsha,One of the worst places I have ever stayed... ...,41.0,0
1043002,53657036,510753099078490860,2021-12-05,404879596,Matthew,An exceptional little apartment for a short st...,16.0,0


In [12]:
#Create new dataframe for review cleaning
# Take an explicit copy: later cells add columns to reviews_cleaned, and assigning into a
# slice of review_exp is chained assignment (its warning is hidden by the global filter).
reviews_cleaned = review_exp[["comments"]].copy()
reviews_cleaned

Unnamed: 0,comments
0,My girlfriend and I hadn't known Alina before ...
1,Alina was a really good host. The flat is clea...
2,Alina is an amazing host. She made me feel rig...
3,"Alina's place is so nice, the room is big and ..."
4,"Nice location in Islington area, good for shor..."
...,...
1042999,Gregory is an absolutely amazing host! He went...
1043000,Those considering the aptm as a last minute bo...
1043001,One of the worst places I have ever stayed... ...
1043002,An exceptional little apartment for a short st...


In [13]:
#define lemmatization function
def lemmatize_words(word_list, lemmatizer=None):
    """Lemmatize every token in ``word_list``.

    Parameters
    ----------
    word_list : iterable of str
        Tokens to lemmatize.
    lemmatizer : object, optional
        Any object exposing ``lemmatize(word)``. When omitted, a single
        ``WordNetLemmatizer`` is created on first use and reused by later
        calls, instead of building a new instance for every comment.

    Returns
    -------
    list of str
        The lemmas, in the same order as the input tokens.
    """
    # NOTE(review): lemmatize() is called without a part-of-speech tag; the recorded
    # output shows "was" -> "wa" and "as" -> "a" as a consequence.
    if lemmatizer is None:
        # Cache the default lemmatizer on the function object so it is built only once.
        lemmatizer = getattr(lemmatize_words, "_shared", None)
        if lemmatizer is None:
            lemmatizer = lemmatize_words._shared = WordNetLemmatizer()
    return [lemmatizer.lemmatize(word) for word in word_list]

In [15]:
#lemmatize words

# Tokenizer that keeps runs of word characters and discards everything else
tokenizer = RegexpTokenizer(r'\w+')

# Tokenize, lemmatize and re-join each comment with single spaces, all in one pass
reviews_cleaned['comment_lemmatized'] = (
    reviews_cleaned['comments']
    .astype(str)
    .apply(lambda text: " ".join(lemmatize_words(tokenizer.tokenize(text))))
)

reviews_cleaned

Unnamed: 0,comments,comment_lemmatized
0,My girlfriend and I hadn't known Alina before ...,My girlfriend and I hadn t known Alina before ...
1,Alina was a really good host. The flat is clea...,Alina wa a really good host The flat is clean ...
2,Alina is an amazing host. She made me feel rig...,Alina is an amazing host She made me feel righ...
3,"Alina's place is so nice, the room is big and ...",Alina s place is so nice the room is big and c...
4,"Nice location in Islington area, good for shor...",Nice location in Islington area good for short...
...,...,...
1042999,Gregory is an absolutely amazing host! He went...,Gregory is an absolutely amazing host He went ...
1043000,Those considering the aptm as a last minute bo...,Those considering the aptm a a last minute boo...
1043001,One of the worst places I have ever stayed... ...,One of the worst place I have ever stayed very...
1043002,An exceptional little apartment for a short st...,An exceptional little apartment for a short st...


In [16]:
#get sentiment function
def sent_analysis(review, analyzer=None):
    """Return the VADER polarity scores for one review.

    Parameters
    ----------
    review : str
        Text to score.
    analyzer : object, optional
        Any object exposing ``polarity_scores(text)``. When omitted, a single
        ``SentimentIntensityAnalyzer`` is created on first use and reused by
        later calls, so it is not rebuilt for every review.

    Returns
    -------
    dict
        Whatever ``polarity_scores`` returns for ``review``.
    """
    if analyzer is None:
        # Cache the default analyzer on the function object so it is built only once.
        analyzer = getattr(sent_analysis, "_shared", None)
        if analyzer is None:
            analyzer = sent_analysis._shared = SentimentIntensityAnalyzer()
    return analyzer.polarity_scores(review)

In [17]:
# get sentiment polarity score

# Score every lemmatized comment; each result is a dict of VADER scores.
# NOTE(review): the scored text has punctuation removed and contractions split
# (e.g. "hadn t" in the output above); VADER is documented as sensitive to both,
# so consider scoring the raw `comments` column instead — confirm before changing.
sent_scores = reviews_cleaned['comment_lemmatized'].apply(sent_analysis)

# Build the score frame once from the list of dicts (``.apply(pd.Series)`` creates a
# Series per row) and select the keys by name so the mapping to the output columns
# does not depend on dict ordering.
sent_frame = pd.DataFrame(sent_scores.tolist(), index=reviews_cleaned.index)
reviews_cleaned[['negative','neutral','positive','compound']] = sent_frame[['neg','neu','pos','compound']]

reviews_cleaned

Unnamed: 0,comments,comment_lemmatized,negative,neutral,positive,compound
0,My girlfriend and I hadn't known Alina before ...,My girlfriend and I hadn t known Alina before ...,0.019,0.770,0.212,0.9862
1,Alina was a really good host. The flat is clea...,Alina wa a really good host The flat is clean ...,0.000,0.756,0.244,0.8122
2,Alina is an amazing host. She made me feel rig...,Alina is an amazing host She made me feel righ...,0.000,0.699,0.301,0.9722
3,"Alina's place is so nice, the room is big and ...",Alina s place is so nice the room is big and c...,0.000,0.730,0.270,0.9774
4,"Nice location in Islington area, good for shor...",Nice location in Islington area good for short...,0.000,0.517,0.483,0.9027
...,...,...,...,...,...,...
1042999,Gregory is an absolutely amazing host! He went...,Gregory is an absolutely amazing host He went ...,0.000,0.688,0.312,0.9732
1043000,Those considering the aptm as a last minute bo...,Those considering the aptm a a last minute boo...,0.027,0.819,0.154,0.8946
1043001,One of the worst places I have ever stayed... ...,One of the worst place I have ever stayed very...,0.265,0.735,0.000,-0.9048
1043002,An exceptional little apartment for a short st...,An exceptional little apartment for a short st...,0.000,1.000,0.000,0.0000


In [18]:
# reviews_cleaned.to_csv('nc_sentiment.csv', index = False)

In [19]:
#Reload the saved sentiment features; this replaces the reviews_cleaned frame computed above
#NOTE(review): the file is read from 'computed_features/nc_sentiment.csv', while the commented-out
#export in the previous cell writes 'nc_sentiment.csv' — confirm the two refer to the same file
reviews_cleaned = pd.read_csv('computed_features/nc_sentiment.csv')
reviews_cleaned

Unnamed: 0,comments,comment_lemmatized,negative,neutral,positive,compound
0,My girlfriend and I hadn't known Alina before ...,My girlfriend and I hadn t known Alina before ...,0.019,0.770,0.212,0.9862
1,Alina was a really good host. The flat is clea...,Alina wa a really good host The flat is clean ...,0.000,0.756,0.244,0.8122
2,Alina is an amazing host. She made me feel rig...,Alina is an amazing host She made me feel righ...,0.000,0.699,0.301,0.9722
3,"Alina's place is so nice, the room is big and ...",Alina s place is so nice the room is big and c...,0.000,0.730,0.270,0.9774
4,"Nice location in Islington area, good for shor...",Nice location in Islington area good for short...,0.000,0.517,0.483,0.9027
...,...,...,...,...,...,...
986682,Gregory is an absolutely amazing host! He went...,Gregory is an absolutely amazing host He went ...,0.000,0.688,0.312,0.9732
986683,Those considering the aptm as a last minute bo...,Those considering the aptm a a last minute boo...,0.027,0.819,0.154,0.8946
986684,One of the worst places I have ever stayed... ...,One of the worst place I have ever stayed very...,0.265,0.735,0.000,-0.9048
986685,An exceptional little apartment for a short st...,An exceptional little apartment for a short st...,0.000,1.000,0.000,0.0000


In [21]:
#join to original dataframe

# Keep one feature row per distinct comment before joining. With repeated comment text on
# both sides, the merge would emit every left/right pairing (k x k rows for a comment that
# occurs k times) only for drop_duplicates() to collapse them again afterwards.
# Assumes rows sharing the same comment text carry identical sentiment features — TODO confirm.
sent_features = reviews_cleaned.drop_duplicates(subset='comments')

# validate='many_to_one' makes pandas raise if the feature side still has repeated keys.
# NOTE(review): this is an inner join on free text, so reviews without an exact text match
# in the feature file are dropped silently.
df_with_sent = review_exp.merge(sent_features, on = 'comments', how = 'inner', validate = 'many_to_one')
df_with_sent = df_with_sent.drop_duplicates().reset_index(drop=True)
df_with_sent

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments,comments_len,key_reviewer,comment_lemmatized,negative,neutral,positive,compound
0,13913,80770,2010-08-18,177109,Michael,My girlfriend and I hadn't known Alina before ...,149.0,0,My girlfriend and I hadn t known Alina before ...,0.019,0.770,0.212,0.9862
1,13913,367568,2011-07-11,19835707,Mathias,Alina was a really good host. The flat is clea...,32.0,0,Alina wa a really good host The flat is clean ...,0.000,0.756,0.244,0.8122
2,13913,529579,2011-09-13,1110304,Kristin,Alina is an amazing host. She made me feel rig...,68.0,0,Alina is an amazing host She made me feel righ...,0.000,0.699,0.301,0.9722
3,13913,595481,2011-10-03,1216358,Camilla,"Alina's place is so nice, the room is big and ...",86.0,0,Alina s place is so nice the room is big and c...,0.000,0.730,0.270,0.9774
4,13913,612947,2011-10-09,490840,Jorik,"Nice location in Islington area, good for shor...",17.0,0,Nice location in Islington area good for short...,0.000,0.517,0.483,0.9027
...,...,...,...,...,...,...,...,...,...,...,...,...,...
961584,53622933,510698532655225551,2021-12-05,47886779,Shameel,Gregory is an absolutely amazing host! He went...,71.0,0,Gregory is an absolutely amazing host He went ...,0.000,0.688,0.312,0.9732
961585,53629457,509962566515134799,2021-12-04,322726852,Stella,Those considering the aptm as a last minute bo...,74.0,1,Those considering the aptm a a last minute boo...,0.027,0.819,0.154,0.8946
961586,53656459,511460888308184263,2021-12-06,3765545,Harsha,One of the worst places I have ever stayed... ...,41.0,0,One of the worst place I have ever stayed very...,0.265,0.735,0.000,-0.9048
961587,53657036,510753099078490860,2021-12-05,404879596,Matthew,An exceptional little apartment for a short st...,16.0,0,An exceptional little apartment for a short st...,0.000,1.000,0.000,0.0000


In [25]:
# df_with_sent.to_csv('nc_reviews_with_sentiment.csv', index = False)

In [72]:
#get different languages

# langdetect is non-deterministic by default; fix its seed before any worker starts so that
# re-running the cell assigns the same language to the same comment.
DetectorFactory.seed = 0

pandarallel.initialize(progress_bar=False)
# Detect the language of every comment; the result is a language code such as 'en'.
df_with_sent['lang'] = df_with_sent['comments'].parallel_apply(detect)
df_with_sent

INFO: Pandarallel will run on 1 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments,comments_len,key_reviewer,comment_lemmatized,negative,neutral,positive,compound,sentiment,lang
0,13913,80770,2010-08-18,177109,Michael,My girlfriend and I hadn't known Alina before ...,149.0,0,My girlfriend and I hadn t known Alina before ...,0.019,0.770,0.212,0.9862,Positive,en
1,13913,367568,2011-07-11,19835707,Mathias,Alina was a really good host. The flat is clea...,32.0,0,Alina wa a really good host The flat is clean ...,0.000,0.756,0.244,0.8122,Positive,en
2,13913,529579,2011-09-13,1110304,Kristin,Alina is an amazing host. She made me feel rig...,68.0,0,Alina is an amazing host She made me feel righ...,0.000,0.699,0.301,0.9722,Positive,en
3,13913,595481,2011-10-03,1216358,Camilla,"Alina's place is so nice, the room is big and ...",86.0,0,Alina s place is so nice the room is big and c...,0.000,0.730,0.270,0.9774,Positive,en
4,13913,612947,2011-10-09,490840,Jorik,"Nice location in Islington area, good for shor...",17.0,0,Nice location in Islington area good for short...,0.000,0.517,0.483,0.9027,Positive,en
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
961584,53622933,510698532655225551,2021-12-05,47886779,Shameel,Gregory is an absolutely amazing host! He went...,71.0,0,Gregory is an absolutely amazing host He went ...,0.000,0.688,0.312,0.9732,Positive,en
961585,53629457,509962566515134799,2021-12-04,322726852,Stella,Those considering the aptm as a last minute bo...,74.0,1,Those considering the aptm a a last minute boo...,0.027,0.819,0.154,0.8946,Positive,en
961586,53656459,511460888308184263,2021-12-06,3765545,Harsha,One of the worst places I have ever stayed... ...,41.0,0,One of the worst place I have ever stayed very...,0.265,0.735,0.000,-0.9048,Negative,en
961587,53657036,510753099078490860,2021-12-05,404879596,Matthew,An exceptional little apartment for a short st...,16.0,0,An exceptional little apartment for a short st...,0.000,1.000,0.000,0.0000,Neutral,en


In [73]:
# df_with_sent.to_csv('nc_reviews_with_sent_and_lang.csv', index = False)

In [74]:
# Two-column view: each comment next to its detected language
comment_lang = df_with_sent.loc[:, ['comments', 'lang']]
comment_lang

Unnamed: 0,comments,lang
0,My girlfriend and I hadn't known Alina before ...,en
1,Alina was a really good host. The flat is clea...,en
2,Alina is an amazing host. She made me feel rig...,en
3,"Alina's place is so nice, the room is big and ...",en
4,"Nice location in Islington area, good for shor...",en
...,...,...
961584,Gregory is an absolutely amazing host! He went...,en
961585,Those considering the aptm as a last minute bo...,en
961586,One of the worst places I have ever stayed... ...,en
961587,An exceptional little apartment for a short st...,en


In [75]:
# comment_lang.to_csv('comment_languages.csv', index = False)

In [80]:
# Number of reviews per detected language
df_with_sent['lang'].value_counts()

en       868266
fr        38034
es        17760
de        12881
it         8494
ko         4665
pt         3224
nl         2494
ru          971
da          774
sv          644
no          466
pl          439
cs          394
ca          369
fi          254
zh-cn       243
ro          170
tr          161
hu          140
af          136
el          129
he          116
ja           71
ar           66
sk           35
tl           26
hr           22
zh-tw        22
th           18
id           18
vi           16
et           12
sl           12
so           12
cy           11
bg           10
lt            4
uk            3
lv            3
sq            2
hi            1
sw            1
Name: lang, dtype: int64

In [82]:
#define ranges for different sentiments

def get_sent(sent_val, threshold=0.05):
    """Map a compound sentiment score to a sentiment label.

    The two boundaries are symmetric and inclusive: a score of exactly
    ``threshold`` is 'Positive' and a score of exactly ``-threshold`` is
    'Negative'. (Previously ``0.05`` itself fell into 'Neutral' while
    ``-0.05`` was 'Negative'.)

    Parameters
    ----------
    sent_val : float
        Compound score to classify.
    threshold : float, optional
        Absolute score at or beyond which a review stops being neutral.
        Defaults to 0.05.

    Returns
    -------
    str
        'Positive', 'Negative' or 'Neutral'. A NaN score compares false on
        both checks and is therefore labelled 'Neutral'.
    """
    if sent_val >= threshold:
        return 'Positive'
    if sent_val <= -threshold:
        return 'Negative'
    return 'Neutral'

In [83]:
#get different sentiments

# Label every review from its compound score
df_with_sent['sentiment'] = df_with_sent['compound'].map(get_sent)
df_with_sent

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments,comments_len,key_reviewer,comment_lemmatized,negative,neutral,positive,compound,sentiment,lang
0,13913,80770,2010-08-18,177109,Michael,My girlfriend and I hadn't known Alina before ...,149.0,0,My girlfriend and I hadn t known Alina before ...,0.019,0.770,0.212,0.9862,Positive,en
1,13913,367568,2011-07-11,19835707,Mathias,Alina was a really good host. The flat is clea...,32.0,0,Alina wa a really good host The flat is clean ...,0.000,0.756,0.244,0.8122,Positive,en
2,13913,529579,2011-09-13,1110304,Kristin,Alina is an amazing host. She made me feel rig...,68.0,0,Alina is an amazing host She made me feel righ...,0.000,0.699,0.301,0.9722,Positive,en
3,13913,595481,2011-10-03,1216358,Camilla,"Alina's place is so nice, the room is big and ...",86.0,0,Alina s place is so nice the room is big and c...,0.000,0.730,0.270,0.9774,Positive,en
4,13913,612947,2011-10-09,490840,Jorik,"Nice location in Islington area, good for shor...",17.0,0,Nice location in Islington area good for short...,0.000,0.517,0.483,0.9027,Positive,en
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
961584,53622933,510698532655225551,2021-12-05,47886779,Shameel,Gregory is an absolutely amazing host! He went...,71.0,0,Gregory is an absolutely amazing host He went ...,0.000,0.688,0.312,0.9732,Positive,en
961585,53629457,509962566515134799,2021-12-04,322726852,Stella,Those considering the aptm as a last minute bo...,74.0,1,Those considering the aptm a a last minute boo...,0.027,0.819,0.154,0.8946,Positive,en
961586,53656459,511460888308184263,2021-12-06,3765545,Harsha,One of the worst places I have ever stayed... ...,41.0,0,One of the worst place I have ever stayed very...,0.265,0.735,0.000,-0.9048,Negative,en
961587,53657036,510753099078490860,2021-12-05,404879596,Matthew,An exceptional little apartment for a short st...,16.0,0,An exceptional little apartment for a short st...,0.000,1.000,0.000,0.0000,Neutral,en


In [85]:
# Keep only the reviews whose detected language is English
df_eng = df_with_sent.loc[df_with_sent['lang'].eq('en')]
df_eng

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments,comments_len,key_reviewer,comment_lemmatized,negative,neutral,positive,compound,sentiment,lang
0,13913,80770,2010-08-18,177109,Michael,My girlfriend and I hadn't known Alina before ...,149.0,0,My girlfriend and I hadn t known Alina before ...,0.019,0.770,0.212,0.9862,Positive,en
1,13913,367568,2011-07-11,19835707,Mathias,Alina was a really good host. The flat is clea...,32.0,0,Alina wa a really good host The flat is clean ...,0.000,0.756,0.244,0.8122,Positive,en
2,13913,529579,2011-09-13,1110304,Kristin,Alina is an amazing host. She made me feel rig...,68.0,0,Alina is an amazing host She made me feel righ...,0.000,0.699,0.301,0.9722,Positive,en
3,13913,595481,2011-10-03,1216358,Camilla,"Alina's place is so nice, the room is big and ...",86.0,0,Alina s place is so nice the room is big and c...,0.000,0.730,0.270,0.9774,Positive,en
4,13913,612947,2011-10-09,490840,Jorik,"Nice location in Islington area, good for shor...",17.0,0,Nice location in Islington area good for short...,0.000,0.517,0.483,0.9027,Positive,en
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
961584,53622933,510698532655225551,2021-12-05,47886779,Shameel,Gregory is an absolutely amazing host! He went...,71.0,0,Gregory is an absolutely amazing host He went ...,0.000,0.688,0.312,0.9732,Positive,en
961585,53629457,509962566515134799,2021-12-04,322726852,Stella,Those considering the aptm as a last minute bo...,74.0,1,Those considering the aptm a a last minute boo...,0.027,0.819,0.154,0.8946,Positive,en
961586,53656459,511460888308184263,2021-12-06,3765545,Harsha,One of the worst places I have ever stayed... ...,41.0,0,One of the worst place I have ever stayed very...,0.265,0.735,0.000,-0.9048,Negative,en
961587,53657036,510753099078490860,2021-12-05,404879596,Matthew,An exceptional little apartment for a short st...,16.0,0,An exceptional little apartment for a short st...,0.000,1.000,0.000,0.0000,Neutral,en


In [86]:
# Number of English reviews per sentiment label
df_eng['sentiment'].value_counts()

Positive    834955
Neutral      21397
Negative     11914
Name: sentiment, dtype: int64

In [87]:
# df_eng = df_sentiment.groupby('listing_id').mean()
# df_eng = df_sentiment.reset_index()
# df_eng.to_csv('nc_eng_with_sent.csv', index = False)