# Data Cleaning - Reviews

In [2]:
# imports required.
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.cluster import KMeans

%config InlineBackend.figure_format = 'svg'
%matplotlib inline

sns.set_theme(context="notebook", style="darkgrid", palette="dark", font_scale=1.5)

In [3]:
reviews_df = pd.read_csv("../../data/raw/reviews.csv")
reviews_df.shape

(1216212, 6)

In [4]:
reviews_df.dropna(inplace=True)
reviews_df = reviews_df[~reviews_df['comments'].str.contains('This is an automated posting')]
reviews_df.shape

(1203116, 6)

In [5]:
# Remove special characters and digits
reviews_df['comments'] = reviews_df['comments'].str.replace(r'[^a-zA-Z\s]', '').str.replace(r'\d+', '')
# Convert to lowercase
reviews_df['comments'] = reviews_df['comments'].str.lower()
reviews_df.shape

  reviews_df['comments'] = reviews_df['comments'].str.replace(r'[^a-zA-Z\s]', '').str.replace(r'\d+', '')


(1203116, 6)

In [6]:
reviews_df

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments
0,52228441,623723762668719111,2022-05-10,37052865,Kimberly,great location and the host was very responsiv...
1,52228441,505671819125096360,2021-11-28,70830110,Mahelet,duccio is a lovely and friendly host from arri...
2,52228441,466510411892882382,2021-10-05,83617224,Will,duccio is a good communicator he was very help...
3,52228441,604109461995958546,2022-04-13,2152541,Francesco,not entirely compliant to the picsbrgood locat...
4,605617198416835367,633128504578904919,2022-05-23,45418187,Waddah,great place and great host
...,...,...,...,...,...,...
1216207,14832630,184884203,2017-08-20,54407484,丽云,brbrclean and comfortable suitable for family ...
1216208,14832630,173531056,2017-07-23,130284505,Véronique,excellente situation au plein centre de londre...
1216209,14832630,153170957,2017-05-20,119296298,Nicole,todo perfecto ubicacin comodidad y rachel com...
1216210,14832630,145456398,2017-04-18,16394435,Christophe,tout dabord mme si nous navons pas eu le plais...


In [7]:
reviews_df.drop(columns=["date", "reviewer_name"], inplace=True)
reviews_df.shape

(1203116, 4)

In [8]:
reviews_df

Unnamed: 0,listing_id,id,reviewer_id,comments
0,52228441,623723762668719111,37052865,great location and the host was very responsiv...
1,52228441,505671819125096360,70830110,duccio is a lovely and friendly host from arri...
2,52228441,466510411892882382,83617224,duccio is a good communicator he was very help...
3,52228441,604109461995958546,2152541,not entirely compliant to the picsbrgood locat...
4,605617198416835367,633128504578904919,45418187,great place and great host
...,...,...,...,...
1216207,14832630,184884203,54407484,brbrclean and comfortable suitable for family ...
1216208,14832630,173531056,130284505,excellente situation au plein centre de londre...
1216209,14832630,153170957,119296298,todo perfecto ubicacin comodidad y rachel com...
1216210,14832630,145456398,16394435,tout dabord mme si nous navons pas eu le plais...


In [None]:

# Extract the comments and reviewer IDs into separate arrays
comments = reviews_df['comments'].values
reviewer_ids = reviews_df['reviewer_id'].values

# Use TfidfVectorizer to convert the comments to a TF-IDF matrix
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorizer.fit_transform(comments)

# Use KMeans to cluster the reviewers based on their comments
num_clusters = 5
km = KMeans(n_clusters=num_clusters)
km.fit(tfidf_matrix)

# Create a dictionary to map reviewer IDs to cluster labels
labels = km.labels_
cluster_map = {}
for i in range(len(reviewer_ids)):
    cluster_map[reviewer_ids[i]] = labels[i]
    

# Print the resulting cluster assignments for each reviewer ID
print(cluster_map)



# Save Reviews Data to csv.

In [None]:
# Save cleaned listings data frame to csv file.
reviews_df.to_csv('../../data/interim/cleaned-reviews.csv', index=False)
reviews_df.shape