In [1]:
import pandas as pd

# Display options: do not truncate columns, cell text or rows when rendering
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', None)

# Read Data
df = pd.read_csv('data/reviews.csv')

# Total Reviews (row count of the raw file, before any filtering)
print(f"> Total Reviews: {df.shape[0]:,}")

# Standardise headers: trim, lower-case, then turn spaces, hyphens, dots and
# parentheses into underscores in a single pass. regex=True is passed
# explicitly so the result does not depend on the pandas version's default
# handling of patterns such as '.', '(' and ')'.
df.columns = (df.columns
              .str.strip()
              .str.lower()
              .str.replace(r'[ \-.()]', '_', regex=True))

# Columns (Isolate Reviews)
df = df[['name', 'reviews_text']].copy()

# View
display(df.head(2))

> Total Reviews: 35,912


Unnamed: 0,name,reviews_text
0,Hotel Russo Palace,Pleasant 10 min walk along the sea front to the Water Bus. restaurants etc. Hotel was comfortable breakfast was good - quite a variety. Room aircon didn't work very well. Take mosquito repelant!
1,Hotel Russo Palace,Really lovely hotel. Stayed on the very top floor and were surprised by a Jacuzzi bath we didn't know we were getting! Staff were friendly and helpful and the included breakfast was great! Great location and great value for money. Didn't want to leave!


#### HOTELS

In [2]:
# Five hotels with the most reviews (value_counts sorts by count, descending)
df['name'].value_counts().iloc[:5]

name
The Alexandrian, Autograph Collection    1185
Howard Johnson Inn - Newburgh             714
Americas Best Value Inn                   567
Fiesta Inn and Suites                     546
Ip Casino Resort Spa                      392
Name: count, dtype: int64

#### FILTER: HOTEL

In [3]:
# Keep only the rows for the selected hotel, and only the review text column
df = df.loc[df['name'].eq('The Alexandrian, Autograph Collection'), ['reviews_text']].copy()

# Preview the first rows
display(df.head())

Unnamed: 0,reviews_text
4744,The hotel was great. Staff went above and beyond bringing hot tea to room at 11:30 pm at no charge!
4745,"A wonderful hotel - would definitely stay there whenever we are in Alexandria again. Wonderful staff. We remarked that the first room was very far from the elevator, and they immediately moved us to a much nicer/larger room across from the elevator without question. I hope I am able to stay at a Kimpton Hotel on future trips."
4746,"Tolles Zimmer, nettes Personal"
4747,Outstanding. Will definitely return.
4748,"O hotel �� bel��ssimo, assim como os quartos, os quais s��o enormes e bastante confort��veis.A localiza����o �� excelente; o hotel fica na rua principal de Alexandria.H�� transfer gratuito aeroporto-hotel e vice versa.S�� ha internet free no lobby do hotel e o acesso ao computador e impressora �� pago, o que acho um inconveniente.Tivemos alguns probleminhas com o servi��o do hotel, os quais demonstraram a falta de preparo dos funcion��rios.Nos disseram que uma encomenda da Amazon n��o tinha chegado, quando na verdade j�� tinha sido entregue ha mais de 5 dias.Uma meia sumiu do banheiro e n��o nos deram qualquer satisfa����o, nem se mostraram preocupados com o ocorrido. Fomos embora sem que nada fosse resolvido.Tivemos indisposi����o ap��s um jantar no restaurante do hotel.Por essas raz��es, a estadia acabou n��o sendo satisfat��ria."


#### MISSING DATA

In [4]:
# Number of missing values per column
display(df.isnull().sum())

reviews_text    0
dtype: int64

#### DUPLICATE DATA

In [5]:
# Count reviews whose text repeats an earlier row (first occurrence not counted)
duplicate_count = df['reviews_text'].duplicated().sum()
print(f"> Duplicates found: {duplicate_count}")

> Duplicates found: 21


In [6]:
# Visual inspection: every row that shares its text with another row,
# ordered by text so that repeated reviews sit next to each other
df.loc[df['reviews_text'].duplicated(keep=False)].sort_values('reviews_text').head()

Unnamed: 0,reviews_text
5518,Always a pleasure. Very convenient. Well maintained. Friendly
5517,Always a pleasure. Very convenient. Well maintained. Friendly
5187,Always nice. Staff is great.
5188,Always nice. Staff is great.
5587,Excellent


In [7]:
# Row count before de-duplication
print(f"> Original shape: {len(df):,}")

# Keep the first occurrence of each distinct review text
df = df.drop_duplicates(subset='reviews_text')

# Verify
print(f"> Cleaned shape: {len(df):,}")

> Original shape: 1,185
> Cleaned shape: 1,164


#### CLEAN REVIEW TEXT / STRING COLUMNS

In [8]:
# Standardise string values: strip surrounding whitespace and lower-case.
# Text columns are selected by both 'object' and 'string' dtype, because
# depending on the pandas version read_csv stores text as one or the other;
# selecting only 'string' silently skips object-dtype text columns.
str_cols = df.select_dtypes(include=['object', 'string']).columns
df[str_cols] = df[str_cols].apply(lambda col: col.str.strip().str.lower())

# View
display(df.head())

Unnamed: 0,reviews_text
4744,the hotel was great. staff went above and beyond bringing hot tea to room at 11:30 pm at no charge!
4745,"a wonderful hotel - would definitely stay there whenever we are in alexandria again. wonderful staff. we remarked that the first room was very far from the elevator, and they immediately moved us to a much nicer/larger room across from the elevator without question. i hope i am able to stay at a kimpton hotel on future trips."
4746,"tolles zimmer, nettes personal"
4747,outstanding. will definitely return.
4748,"o hotel �� bel��ssimo, assim como os quartos, os quais s��o enormes e bastante confort��veis.a localiza����o �� excelente; o hotel fica na rua principal de alexandria.h�� transfer gratuito aeroporto-hotel e vice versa.s�� ha internet free no lobby do hotel e o acesso ao computador e impressora �� pago, o que acho um inconveniente.tivemos alguns probleminhas com o servi��o do hotel, os quais demonstraram a falta de preparo dos funcion��rios.nos disseram que uma encomenda da amazon n��o tinha chegado, quando na verdade j�� tinha sido entregue ha mais de 5 dias.uma meia sumiu do banheiro e n��o nos deram qualquer satisfa����o, nem se mostraram preocupados com o ocorrido. fomos embora sem que nada fosse resolvido.tivemos indisposi����o ap��s um jantar no restaurante do hotel.por essas raz��es, a estadia acabou n��o sendo satisfat��ria."


#### LANGUAGE

In [9]:
# LANGDETECT
from langdetect import DetectorFactory, detect
from langdetect.lang_detect_exception import LangDetectException

# langdetect is non-deterministic by default; fixing the seed makes a
# Restart & Run All assign the same language to every review each time.
DetectorFactory.seed = 0

# Function
def isolate_language(text):
    """Return the language code that langdetect assigns to `text`.

    `text` is converted with str() before detection. When langdetect cannot
    detect a language it raises LangDetectException, and 'unknown' is
    returned instead. Other exceptions (including KeyboardInterrupt) are
    deliberately not swallowed.
    """
    try:
        return detect(str(text))
    except LangDetectException:
        return 'unknown'

# Apply
df['language'] = df['reviews_text'].apply(isolate_language)

# View
display(df.head())

Unnamed: 0,reviews_text,language
4744,the hotel was great. staff went above and beyond bringing hot tea to room at 11:30 pm at no charge!,en
4745,"a wonderful hotel - would definitely stay there whenever we are in alexandria again. wonderful staff. we remarked that the first room was very far from the elevator, and they immediately moved us to a much nicer/larger room across from the elevator without question. i hope i am able to stay at a kimpton hotel on future trips.",en
4746,"tolles zimmer, nettes personal",it
4747,outstanding. will definitely return.,en
4748,"o hotel �� bel��ssimo, assim como os quartos, os quais s��o enormes e bastante confort��veis.a localiza����o �� excelente; o hotel fica na rua principal de alexandria.h�� transfer gratuito aeroporto-hotel e vice versa.s�� ha internet free no lobby do hotel e o acesso ao computador e impressora �� pago, o que acho um inconveniente.tivemos alguns probleminhas com o servi��o do hotel, os quais demonstraram a falta de preparo dos funcion��rios.nos disseram que uma encomenda da amazon n��o tinha chegado, quando na verdade j�� tinha sido entregue ha mais de 5 dias.uma meia sumiu do banheiro e n��o nos deram qualquer satisfa����o, nem se mostraram preocupados com o ocorrido. fomos embora sem que nada fosse resolvido.tivemos indisposi����o ap��s um jantar no restaurante do hotel.por essas raz��es, a estadia acabou n��o sendo satisfat��ria.",pt


#### FILTER: LANGUAGE

In [10]:
# Keep the reviews detected as English ('en') and drop the helper
# 'language' column, leaving only the review text
df = df.loc[df['language'].eq('en'), ['reviews_text']].copy()

#### FILTER: REVIEW LENGTH

In [11]:
# Minimum number of characters a review must have to be kept
MIN_REVIEW_CHARS = 50

# Filter: keep reviews with at least MIN_REVIEW_CHARS characters, keep only
# the review text column, and renumber the rows from 0. The character length
# is computed once and used directly as the mask (no temporary column).
df = (df.loc[df['reviews_text'].str.len() >= MIN_REVIEW_CHARS, ['reviews_text']]
        .reset_index(drop=True))

#### SAVE

In [12]:
# Save for LangChain ingestion
# Written to the lower-case 'data/' directory that 'data/reviews.csv' was read
# from; 'Data/' is a different (possibly missing) directory on case-sensitive
# filesystems. index=False keeps the row index out of the file.
df.to_csv('data/cleaned_reviews.csv', index=False)