### Transform Clean Data into Sentiment Analysis

In [164]:
%reload_ext autoreload
%autoreload 2

In [165]:
import pandas as pd
import numpy as np

from src.paths import TRANSFORMED_DATA_DIR

In [166]:
# read in the clean data
data = pd.read_parquet(TRANSFORMED_DATA_DIR / 'reviews_clean.parquet')
data.head()

Unnamed: 0,verification_status,review_body,published_date,rating,aircraft,type_of_traveler,seat_type,recommended,year,month,day,cumulative_avg_rating
0,Not Verified,Very good flight following an equally good fli...,2025-01-20,9.0,A320,Solo Leisure,Business Class,yes,2025,1,20,4.66718
1,Not Verified,An hour's delay due to late arrival of the inc...,2025-01-19,7.0,A319,Family Leisure,Economy Class,yes,2025,1,19,4.666067
2,Trip Verified,I booked through BA because Loganair don’t hav...,2025-01-15,1.0,,Solo Leisure,Economy Class,no,2025,1,15,4.665468
3,Trip Verified,British airways lost bags in LHR then found th...,2025-01-09,1.0,,Family Leisure,Premium Economy,no,2025,1,9,4.66641
4,Trip Verified,The check in process and reward/loyalty progra...,2025-01-05,1.0,A320,Business,Economy Class,no,2025,1,5,4.667352


In [167]:
# we will be using review_body and recommended for our sentiment analysis
df = data[['review_body', 'recommended']]
df.head()

Unnamed: 0,review_body,recommended
0,Very good flight following an equally good fli...,yes
1,An hour's delay due to late arrival of the inc...,yes
2,I booked through BA because Loganair don’t hav...,no
3,British airways lost bags in LHR then found th...,no
4,The check in process and reward/loyalty progra...,no


In [168]:
data['review_body'][0]

'Very good flight following an equally good flight to Rome. Good food (especially on an evening flight) and aircraft early both ways. Just what short haul should be.'

In [169]:
# tail
df.tail()

Unnamed: 0,review_body,recommended
3887,Business LHR to BKK. 747-400. First try back w...,no
3890,LHR to HAM. Purser addresses all club passenge...,yes
3891,My son who had worked for British Airways urge...,no
3892,London City-New York JFK via Shannon on A318 b...,no
3893,SIN-LHR BA12 B747-436 First Class. Old aircraf...,no


In [170]:
df_copy = df.copy()

In [171]:
df_copy.replace({'yes': 1, 'no': 0}, inplace = True)
df_copy.head()

  df_copy.replace({'yes': 1, 'no': 0}, inplace = True)


Unnamed: 0,review_body,recommended
0,Very good flight following an equally good fli...,1
1,An hour's delay due to late arrival of the inc...,1
2,I booked through BA because Loganair don’t hav...,0
3,British airways lost bags in LHR then found th...,0
4,The check in process and reward/loyalty progra...,0


In [172]:
# libraries needed for our sentiment analysis
import string


In [173]:
# function to preprocess reviews
def preprocess_sentiments(reviews: pd.Series) -> pd.Series:
    cleaned = []
    for review in reviews:
        # replace the forward slash
        review = review.replace('/', ' ')
        # replace the hyphen
        review = review.replace('-', ' ')
        # replace the smart quotes
        review = review.replace('’', '')
        # remove the punctuation
        review = review.translate(str.maketrans('', '', string.punctuation))
        cleaned.append(review)
    
    cleaned_review = ''.join(cleaned)
    return cleaned_review

df_copy['cleaned'] = df['review_body'].apply(preprocess_sentiments)
df_copy.head()
        


Unnamed: 0,review_body,recommended,cleaned
0,Very good flight following an equally good fli...,1,Very good flight following an equally good fli...
1,An hour's delay due to late arrival of the inc...,1,An hours delay due to late arrival of the inco...
2,I booked through BA because Loganair don’t hav...,0,I booked through BA because Loganair dont have...
3,British airways lost bags in LHR then found th...,0,British airways lost bags in LHR then found th...
4,The check in process and reward/loyalty progra...,0,The check in process and reward loyalty progra...


In [174]:
df_copy.tail()

Unnamed: 0,review_body,recommended,cleaned
3887,Business LHR to BKK. 747-400. First try back w...,0,Business LHR to BKK 747 400 First try back wit...
3890,LHR to HAM. Purser addresses all club passenge...,1,LHR to HAM Purser addresses all club passenger...
3891,My son who had worked for British Airways urge...,0,My son who had worked for British Airways urge...
3892,London City-New York JFK via Shannon on A318 b...,0,London City New York JFK via Shannon on A318 b...
3893,SIN-LHR BA12 B747-436 First Class. Old aircraf...,0,SIN LHR BA12 B747 436 First Class Old aircraft...


In [177]:
df_copy.drop(columns = 'review_body', axis = 1, inplace = True)

In [178]:
# save the sentiment data to be used for modeling
df_copy.to_parquet(TRANSFORMED_DATA_DIR / 'cleaned_sentiments.parquet')