In [44]:
#Import Packages
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer

In [10]:
#Import CSV
listing = pd.read_csv("data/listings.csv")
listing_exp = pd.read_csv("data/listings_exp.csv")
#calendar = pd.read_csv("data/calendar.csv")
neighbour = pd.read_csv("data/neighbourhoods.csv")
review = pd.read_csv("data/reviews.csv")
review_exp = pd.read_csv("data/reviews_exp.csv")

In [13]:
#Split and establish new columns to store comments length
review_exp['comments_len'] = review_exp['comments'].str.split().str.len()

In [14]:
#Exclude comments in different language or those with less than 5 words
review_exp = review_exp.loc[review_exp['comments_len'] > 5]

In [15]:
#Remove duplicates, if any
review_exp = review_exp.drop_duplicates(subset = ['id', 'listing_id', 'reviewer_id', 'comments_len'])

In [16]:
#Find out average # of reviews done by reviewers
#Average seems to be 1.16, so that means those that post more than 2 are key reviewers
review_exp.groupby(by=['reviewer_id']).count().sort_values(by='listing_id', ascending=False).describe()

Unnamed: 0,listing_id,id,date,reviewer_name,comments,comments_len
count,832653.0,832653.0,832653.0,832653.0,832653.0,832653.0
mean,1.16369,1.16369,1.16369,1.16369,1.16369,1.16369
std,0.755315,0.755315,0.755315,0.755315,0.755315,0.755315
min,1.0,1.0,1.0,1.0,1.0,1.0
25%,1.0,1.0,1.0,1.0,1.0,1.0
50%,1.0,1.0,1.0,1.0,1.0,1.0
75%,1.0,1.0,1.0,1.0,1.0,1.0
max,92.0,92.0,92.0,92.0,92.0,92.0


In [17]:
#Group by reviewers and get those that have reviewed more than 2 times as an array
review_grouped = review_exp.groupby(by=['reviewer_id']).count().sort_values(by='listing_id', ascending=False)
key_reviewer = review_grouped[review_grouped['listing_id'] > 2].reset_index()['reviewer_id'].values

In [18]:
#Establish a new column with key reviewer value
review_exp['key_reviewer'] = np.where(review_exp['reviewer_id'].isin(key_reviewer), 1, 0)

In [19]:
review_exp

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments,comments_len,key_reviewer
0,13913,80770,2010-08-18,177109,Michael,My girlfriend and I hadn't known Alina before ...,149.0,0
1,13913,367568,2011-07-11,19835707,Mathias,Alina was a really good host. The flat is clea...,32.0,0
2,13913,529579,2011-09-13,1110304,Kristin,Alina is an amazing host. She made me feel rig...,68.0,0
3,13913,595481,2011-10-03,1216358,Camilla,"Alina's place is so nice, the room is big and ...",86.0,0
4,13913,612947,2011-10-09,490840,Jorik,"Nice location in Islington area, good for shor...",17.0,0
...,...,...,...,...,...,...,...,...
1042999,53622933,510698532655225551,2021-12-05,47886779,Shameel,Gregory is an absolutely amazing host! He went...,71.0,0
1043000,53629457,509962566515134799,2021-12-04,322726852,Stella,Those considering the aptm as a last minute bo...,74.0,1
1043001,53656459,511460888308184263,2021-12-06,3765545,Harsha,One of the worst places I have ever stayed... ...,41.0,0
1043002,53657036,510753099078490860,2021-12-05,404879596,Matthew,An exceptional little apartment for a short st...,16.0,0


In [29]:
#Create new dataframe for review cleaning
reviews_cleaned = review_exp[["comments"]]
reviews_cleaned

Unnamed: 0,comments
0,My girlfriend and I hadn't known Alina before ...
1,Alina was a really good host. The flat is clea...
2,Alina is an amazing host. She made me feel rig...
3,"Alina's place is so nice, the room is big and ..."
4,"Nice location in Islington area, good for shor..."
...,...
1042999,Gregory is an absolutely amazing host! He went...
1043000,Those considering the aptm as a last minute bo...
1043001,One of the worst places I have ever stayed... ...
1043002,An exceptional little apartment for a short st...


In [30]:
#Remove Punctuation

def remove_punct(text):
    no_punctuation = "".join([characters for characters in text if characters not in string.punctuation])
    return no_punctuation

reviews_cleaned["no_punctuation"] = reviews_cleaned["comments"].apply(lambda x: remove_punct(x))

reviews_cleaned

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  reviews_cleaned["no_punctuation"] = reviews_cleaned["comments"].apply(lambda x: remove_punct(x))


Unnamed: 0,comments,no_punctuation
0,My girlfriend and I hadn't known Alina before ...,My girlfriend and I hadnt known Alina before w...
1,Alina was a really good host. The flat is clea...,Alina was a really good host The flat is clean...
2,Alina is an amazing host. She made me feel rig...,Alina is an amazing host She made me feel righ...
3,"Alina's place is so nice, the room is big and ...",Alinas place is so nice the room is big and cl...
4,"Nice location in Islington area, good for shor...",Nice location in Islington area good for short...
...,...,...
1042999,Gregory is an absolutely amazing host! He went...,Gregory is an absolutely amazing host He went ...
1043000,Those considering the aptm as a last minute bo...,Those considering the aptm as a last minute bo...
1043001,One of the worst places I have ever stayed... ...,One of the worst places I have ever stayed ver...
1043002,An exceptional little apartment for a short st...,An exceptional little apartment for a short st...


In [34]:
#Tokenize

tokenizer = RegexpTokenizer(r'\w+')

reviews_cleaned["tokenized"] = reviews_cleaned["no_punctuation"].apply(lambda x: tokenizer.tokenize(x.lower()))

reviews_cleaned

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  reviews_cleaned["tokenized"] = reviews_cleaned["no_punctuation"].apply(lambda x: tokenizer.tokenize(x.lower()))


Unnamed: 0,comments,no_punctuation,tokenized
0,My girlfriend and I hadn't known Alina before ...,My girlfriend and I hadnt known Alina before w...,"[my, girlfriend, and, i, hadnt, known, alina, ..."
1,Alina was a really good host. The flat is clea...,Alina was a really good host The flat is clean...,"[alina, was, a, really, good, host, the, flat,..."
2,Alina is an amazing host. She made me feel rig...,Alina is an amazing host She made me feel righ...,"[alina, is, an, amazing, host, she, made, me, ..."
3,"Alina's place is so nice, the room is big and ...",Alinas place is so nice the room is big and cl...,"[alinas, place, is, so, nice, the, room, is, b..."
4,"Nice location in Islington area, good for shor...",Nice location in Islington area good for short...,"[nice, location, in, islington, area, good, fo..."
...,...,...,...
1042999,Gregory is an absolutely amazing host! He went...,Gregory is an absolutely amazing host He went ...,"[gregory, is, an, absolutely, amazing, host, h..."
1043000,Those considering the aptm as a last minute bo...,Those considering the aptm as a last minute bo...,"[those, considering, the, aptm, as, a, last, m..."
1043001,One of the worst places I have ever stayed... ...,One of the worst places I have ever stayed ver...,"[one, of, the, worst, places, i, have, ever, s..."
1043002,An exceptional little apartment for a short st...,An exceptional little apartment for a short st...,"[an, exceptional, little, apartment, for, a, s..."


In [43]:
#Remove stopwords

stopword_list = ["i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your", "yours", "yourself", "yourselves", "he", "him", "his", "himself", "she", "her", "hers", "herself", "it", "its", "itself", "they", "them", "their", "theirs", "themselves", "what", "which", "who", "whom", "this", "that", "these", "those", "am", "is", "are", "was", "were", "be", "been", "being", "have", "has", "had", "having", "do", "does", "did", "doing", "a", "an", "the", "and", "but", "if", "or", "because", "as", "until", "while", "of", "at", "by", "for", "with", "about", "against", "between", "into", "through", "during", "before", "after", "above", "below", "to", "from", "up", "down", "in", "out", "on", "off", "over", "under", "again", "further", "then", "once", "here", "there", "when", "where", "why", "how", "all", "any", "both", "each", "few", "more", "most", "other", "some", "such", "nor", "only", "own", "same", "so", "than", "too", "very", "can", "will", "just", "should", "now"]

def remove_stopwords(word_list):
    words = [word for word in word_list if word not in stopword_list]
    return words

reviews_cleaned["removed_stopwords"] = reviews_cleaned["tokenized"].apply(lambda x: remove_stopwords(x))

reviews_cleaned

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  reviews_cleaned["removed_stopwords"] = reviews_cleaned["tokenized"].apply(lambda x: remove_stopwords(x))


Unnamed: 0,comments,no_punctuation,tokenized,removed_stopwords
0,My girlfriend and I hadn't known Alina before ...,My girlfriend and I hadnt known Alina before w...,"[my, girlfriend, and, i, hadnt, known, alina, ...","[girlfriend, hadnt, known, alina, took, leap, ..."
1,Alina was a really good host. The flat is clea...,Alina was a really good host The flat is clean...,"[alina, was, a, really, good, host, the, flat,...","[alina, really, good, host, flat, clean, tidy,..."
2,Alina is an amazing host. She made me feel rig...,Alina is an amazing host She made me feel righ...,"[alina, is, an, amazing, host, she, made, me, ...","[alina, amazing, host, made, feel, right, home..."
3,"Alina's place is so nice, the room is big and ...",Alinas place is so nice the room is big and cl...,"[alinas, place, is, so, nice, the, room, is, b...","[alinas, place, nice, room, big, clean, bed, h..."
4,"Nice location in Islington area, good for shor...",Nice location in Islington area good for short...,"[nice, location, in, islington, area, good, fo...","[nice, location, islington, area, good, short,..."
...,...,...,...,...
1042999,Gregory is an absolutely amazing host! He went...,Gregory is an absolutely amazing host He went ...,"[gregory, is, an, absolutely, amazing, host, h...","[gregory, absolutely, amazing, host, went, way..."
1043000,Those considering the aptm as a last minute bo...,Those considering the aptm as a last minute bo...,"[those, considering, the, aptm, as, a, last, m...","[considering, aptm, last, minute, booking, pri..."
1043001,One of the worst places I have ever stayed... ...,One of the worst places I have ever stayed ver...,"[one, of, the, worst, places, i, have, ever, s...","[one, worst, places, ever, stayed, disappointe..."
1043002,An exceptional little apartment for a short st...,An exceptional little apartment for a short st...,"[an, exceptional, little, apartment, for, a, s...","[exceptional, little, apartment, short, stay, ..."


In [46]:
#Lemmatize words

lemmatizer = WordNetLemmatizer()

def lemmatize_words(word_list):
    lemmatized_words = [lemmatizer.lemmatize(word) for word in word_list]
    return lemmatized_words

reviews_cleaned["lemmatized"] = reviews_cleaned["removed_stopwords"].apply(lambda x: lemmatize_words(x))

reviews_cleaned

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  reviews_cleaned["lemmatized"] = reviews_cleaned["removed_stopwords"].apply(lambda x: lemmatize_words(x))


Unnamed: 0,comments,no_punctuation,tokenized,removed_stopwords,lemmatized
0,My girlfriend and I hadn't known Alina before ...,My girlfriend and I hadnt known Alina before w...,"[my, girlfriend, and, i, hadnt, known, alina, ...","[girlfriend, hadnt, known, alina, took, leap, ...","[girlfriend, hadnt, known, alina, took, leap, ..."
1,Alina was a really good host. The flat is clea...,Alina was a really good host The flat is clean...,"[alina, was, a, really, good, host, the, flat,...","[alina, really, good, host, flat, clean, tidy,...","[alina, really, good, host, flat, clean, tidy,..."
2,Alina is an amazing host. She made me feel rig...,Alina is an amazing host She made me feel righ...,"[alina, is, an, amazing, host, she, made, me, ...","[alina, amazing, host, made, feel, right, home...","[alina, amazing, host, made, feel, right, home..."
3,"Alina's place is so nice, the room is big and ...",Alinas place is so nice the room is big and cl...,"[alinas, place, is, so, nice, the, room, is, b...","[alinas, place, nice, room, big, clean, bed, h...","[alinas, place, nice, room, big, clean, bed, h..."
4,"Nice location in Islington area, good for shor...",Nice location in Islington area good for short...,"[nice, location, in, islington, area, good, fo...","[nice, location, islington, area, good, short,...","[nice, location, islington, area, good, short,..."
...,...,...,...,...,...
1042999,Gregory is an absolutely amazing host! He went...,Gregory is an absolutely amazing host He went ...,"[gregory, is, an, absolutely, amazing, host, h...","[gregory, absolutely, amazing, host, went, way...","[gregory, absolutely, amazing, host, went, way..."
1043000,Those considering the aptm as a last minute bo...,Those considering the aptm as a last minute bo...,"[those, considering, the, aptm, as, a, last, m...","[considering, aptm, last, minute, booking, pri...","[considering, aptm, last, minute, booking, pri..."
1043001,One of the worst places I have ever stayed... ...,One of the worst places I have ever stayed ver...,"[one, of, the, worst, places, i, have, ever, s...","[one, worst, places, ever, stayed, disappointe...","[one, worst, place, ever, stayed, disappointed..."
1043002,An exceptional little apartment for a short st...,An exceptional little apartment for a short st...,"[an, exceptional, little, apartment, for, a, s...","[exceptional, little, apartment, short, stay, ...","[exceptional, little, apartment, short, stay, ..."


In [47]:
#Join Final Cleaned Comments

reviews_cleaned["comments_cleaned"] = reviews_cleaned["lemmatized"].apply(lambda x: " ".join(x))

reviews_cleaned

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  reviews_cleaned["comments_cleaned"] = reviews_cleaned["lemmatized"].apply(lambda x: " ".join(x))


Unnamed: 0,comments,no_punctuation,tokenized,removed_stopwords,lemmatized,comments_cleaned
0,My girlfriend and I hadn't known Alina before ...,My girlfriend and I hadnt known Alina before w...,"[my, girlfriend, and, i, hadnt, known, alina, ...","[girlfriend, hadnt, known, alina, took, leap, ...","[girlfriend, hadnt, known, alina, took, leap, ...",girlfriend hadnt known alina took leap faith r...
1,Alina was a really good host. The flat is clea...,Alina was a really good host The flat is clean...,"[alina, was, a, really, good, host, the, flat,...","[alina, really, good, host, flat, clean, tidy,...","[alina, really, good, host, flat, clean, tidy,...",alina really good host flat clean tidy really ...
2,Alina is an amazing host. She made me feel rig...,Alina is an amazing host She made me feel righ...,"[alina, is, an, amazing, host, she, made, me, ...","[alina, amazing, host, made, feel, right, home...","[alina, amazing, host, made, feel, right, home...",alina amazing host made feel right home like h...
3,"Alina's place is so nice, the room is big and ...",Alinas place is so nice the room is big and cl...,"[alinas, place, is, so, nice, the, room, is, b...","[alinas, place, nice, room, big, clean, bed, h...","[alinas, place, nice, room, big, clean, bed, h...",alinas place nice room big clean bed huge alin...
4,"Nice location in Islington area, good for shor...",Nice location in Islington area good for short...,"[nice, location, in, islington, area, good, fo...","[nice, location, islington, area, good, short,...","[nice, location, islington, area, good, short,...",nice location islington area good short busine...
...,...,...,...,...,...,...
1042999,Gregory is an absolutely amazing host! He went...,Gregory is an absolutely amazing host He went ...,"[gregory, is, an, absolutely, amazing, host, h...","[gregory, absolutely, amazing, host, went, way...","[gregory, absolutely, amazing, host, went, way...",gregory absolutely amazing host went way show ...
1043000,Those considering the aptm as a last minute bo...,Those considering the aptm as a last minute bo...,"[those, considering, the, aptm, as, a, last, m...","[considering, aptm, last, minute, booking, pri...","[considering, aptm, last, minute, booking, pri...",considering aptm last minute booking price con...
1043001,One of the worst places I have ever stayed... ...,One of the worst places I have ever stayed ver...,"[one, of, the, worst, places, i, have, ever, s...","[one, worst, places, ever, stayed, disappointe...","[one, worst, place, ever, stayed, disappointed...",one worst place ever stayed disappointed no be...
1043002,An exceptional little apartment for a short st...,An exceptional little apartment for a short st...,"[an, exceptional, little, apartment, for, a, s...","[exceptional, little, apartment, short, stay, ...","[exceptional, little, apartment, short, stay, ...",exceptional little apartment short stay longer...


In [48]:
review_exp["comments_cleaned"] = reviews_cleaned["comments_cleaned"]

review_exp

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments,comments_len,key_reviewer,comments_cleaned
0,13913,80770,2010-08-18,177109,Michael,My girlfriend and I hadn't known Alina before ...,149.0,0,girlfriend hadnt known alina took leap faith r...
1,13913,367568,2011-07-11,19835707,Mathias,Alina was a really good host. The flat is clea...,32.0,0,alina really good host flat clean tidy really ...
2,13913,529579,2011-09-13,1110304,Kristin,Alina is an amazing host. She made me feel rig...,68.0,0,alina amazing host made feel right home like h...
3,13913,595481,2011-10-03,1216358,Camilla,"Alina's place is so nice, the room is big and ...",86.0,0,alinas place nice room big clean bed huge alin...
4,13913,612947,2011-10-09,490840,Jorik,"Nice location in Islington area, good for shor...",17.0,0,nice location islington area good short busine...
...,...,...,...,...,...,...,...,...,...
1042999,53622933,510698532655225551,2021-12-05,47886779,Shameel,Gregory is an absolutely amazing host! He went...,71.0,0,gregory absolutely amazing host went way show ...
1043000,53629457,509962566515134799,2021-12-04,322726852,Stella,Those considering the aptm as a last minute bo...,74.0,1,considering aptm last minute booking price con...
1043001,53656459,511460888308184263,2021-12-06,3765545,Harsha,One of the worst places I have ever stayed... ...,41.0,0,one worst place ever stayed disappointed no be...
1043002,53657036,510753099078490860,2021-12-05,404879596,Matthew,An exceptional little apartment for a short st...,16.0,0,exceptional little apartment short stay longer...


In [49]:
#review_exp.to_csv("reviews_cleaned.csv", index = False)