### Transform Clean Data into Sentiment Analysis

In [133]:
%reload_ext autoreload
%autoreload 2

In [134]:
import pandas as pd
import numpy as np

from src.paths import TRANSFORMED_DATA_DIR

In [135]:
# read in the clean data
data = pd.read_parquet(TRANSFORMED_DATA_DIR / 'reviews_clean.parquet')
data.head()

Unnamed: 0,verification_status,review_body,published_date,rating,aircraft,type_of_traveler,seat_type,recommended,year,month,day,cumulative_avg_rating
0,Not Verified,Very good flight following an equally good fli...,2025-01-20,9.0,A320,Solo Leisure,Business Class,yes,2025,1,20,4.66718
1,Not Verified,An hour's delay due to late arrival of the inc...,2025-01-19,7.0,A319,Family Leisure,Economy Class,yes,2025,1,19,4.666067
2,Trip Verified,I booked through BA because Loganair don’t hav...,2025-01-15,1.0,,Solo Leisure,Economy Class,no,2025,1,15,4.665468
3,Trip Verified,British airways lost bags in LHR then found th...,2025-01-09,1.0,,Family Leisure,Premium Economy,no,2025,1,9,4.66641
4,Trip Verified,The check in process and reward/loyalty progra...,2025-01-05,1.0,A320,Business,Economy Class,no,2025,1,5,4.667352


In [136]:
# we will be using review_body and recommended for our sentiment analysis
df = data[['review_body', 'recommended', 'published_date']]
df.head()

Unnamed: 0,review_body,recommended,published_date
0,Very good flight following an equally good fli...,yes,2025-01-20
1,An hour's delay due to late arrival of the inc...,yes,2025-01-19
2,I booked through BA because Loganair don’t hav...,no,2025-01-15
3,British airways lost bags in LHR then found th...,no,2025-01-09
4,The check in process and reward/loyalty progra...,no,2025-01-05


In [60]:
data['review_body'][0]

'Very good flight following an equally good flight to Rome. Good food (especially on an evening flight) and aircraft early both ways. Just what short haul should be.'

In [61]:
# tail
df.tail()

Unnamed: 0,review_body,recommended
3887,Business LHR to BKK. 747-400. First try back w...,no
3890,LHR to HAM. Purser addresses all club passenge...,yes
3891,My son who had worked for British Airways urge...,no
3892,London City-New York JFK via Shannon on A318 b...,no
3893,SIN-LHR BA12 B747-436 First Class. Old aircraf...,no


In [62]:
df[df['recommended'] == 'yes'][:10]

Unnamed: 0,review_body,recommended
0,Very good flight following an equally good fli...,yes
1,An hour's delay due to late arrival of the inc...,yes
11,Pretty good flight but still some small things...,yes
12,"Check in was fine, but no priority/fast track ...",yes
19,This flight was British Airways at its very be...,yes
26,Singapore to Heathrow with BA. Two choices on ...,yes
28,"The flight wasn’t that bad, although the Infli...",yes
31,I was very impressed with their efficient boar...,yes
38,British Airways charge you for the pleasure of...,yes
42,I recently flew from New York back to London w...,yes


In [63]:
df[df['recommended'] == 'no'][:10]

Unnamed: 0,review_body,recommended
2,I booked through BA because Loganair don’t hav...,no
3,British airways lost bags in LHR then found th...,no
4,The check in process and reward/loyalty progra...,no
5,"We flew in November 2023, but it took this lon...",no
6,I left for London from Johannesburg at 21:15 o...,no
7,After an excellent flight on a 777 CPT to LHR ...,no
8,On a recent flight from Cyprus BA621 on 23/11/...,no
9,Flight BA 0560 arrived in Rome on 11 December ...,no
10,This was the first time I flew British Airways...,no
13,British Airways is absolute rubbish. I had to ...,no


In [130]:
df_copy = df.copy()

In [65]:
# libraries needed for our sentiment analysis
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
import string

nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\ryans\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

The first step is to remove punctuation then we will tokenize the words.

In [113]:
# remove punctuation
df_copy['no_punt'] = df_copy['review_body'].apply(lambda x: [review.translate(str.maketrans('', '', string.punctuation)) for review in x])

In [114]:
df_copy.head()

Unnamed: 0,review_body,recommended,no_punt
0,Very good flight following an equally good fli...,yes,"[V, e, r, y, , g, o, o, d, , f, l, i, g, h, ..."
1,An hour's delay due to late arrival of the inc...,yes,"[A, n, , h, o, u, r, , s, , d, e, l, a, y, ..."
2,I booked through BA because Loganair don’t hav...,no,"[I, , b, o, o, k, e, d, , t, h, r, o, u, g, ..."
3,British airways lost bags in LHR then found th...,no,"[B, r, i, t, i, s, h, , a, i, r, w, a, y, s, ..."
4,The check in process and reward/loyalty progra...,no,"[T, h, e, , c, h, e, c, k, , i, n, , p, r, ..."


In [111]:
df_copy['no_punt'][4]

['T',
 'h',
 'e',
 ' ',
 'c',
 'h',
 'e',
 'c',
 'k',
 ' ',
 'i',
 'n',
 ' ',
 'p',
 'r',
 'o',
 'c',
 'e',
 's',
 's',
 ' ',
 'a',
 'n',
 'd',
 ' ',
 'r',
 'e',
 'w',
 'a',
 'r',
 'd',
 '',
 'l',
 'o',
 'y',
 'a',
 'l',
 't',
 'y',
 ' ',
 'p',
 'r',
 'o',
 'g',
 'r',
 'a',
 'm',
 ' ',
 'i',
 's',
 ' ',
 'a',
 ' ',
 'm',
 'e',
 's',
 's',
 '',
 ' ',
 'I',
 ' ',
 'a',
 'm',
 ' ',
 'n',
 'e',
 'v',
 'e',
 'r',
 ' ',
 'a',
 'b',
 'l',
 'e',
 ' ',
 't',
 'o',
 ' ',
 'g',
 'e',
 't',
 ' ',
 'p',
 'o',
 'i',
 'n',
 't',
 's',
 '',
 ' ',
 'e',
 'v',
 'e',
 'n',
 ' ',
 'w',
 'h',
 'e',
 'n',
 ' ',
 't',
 'r',
 'y',
 'i',
 'n',
 'g',
 ' ',
 'i',
 't',
 ' ',
 't',
 'h',
 'r',
 'o',
 'u',
 'g',
 'h',
 ' ',
 'a',
 ' ',
 'p',
 'a',
 'r',
 't',
 'n',
 'e',
 'r',
 ' ',
 'a',
 'i',
 'r',
 'l',
 'i',
 'n',
 'e',
 '',
 ' ',
 'U',
 'n',
 'f',
 'o',
 'r',
 't',
 'u',
 'n',
 'a',
 't',
 'e',
 'l',
 'y',
 ' ',
 'f',
 'o',
 'r',
 ' ',
 'w',
 'o',
 'r',
 'k',
 ' ',
 'I',
 ' ',
 'h',
 'a',
 'v',
 'e',
 ' ',
 '

In [115]:
# remove smart quotes
df_copy['no_punt'] = df_copy['review_body'].apply(lambda x: [review.replace('’', '') for review in x])

In [95]:
df_copy['no_punt'][2]

['I',
 ' ',
 'b',
 'o',
 'o',
 'k',
 'e',
 'd',
 ' ',
 't',
 'h',
 'r',
 'o',
 'u',
 'g',
 'h',
 ' ',
 'B',
 'A',
 ' ',
 'b',
 'e',
 'c',
 'a',
 'u',
 's',
 'e',
 ' ',
 'L',
 'o',
 'g',
 'a',
 'n',
 'a',
 'i',
 'r',
 ' ',
 'd',
 'o',
 'n',
 '',
 't',
 ' ',
 'h',
 'a',
 'v',
 'e',
 ' ',
 'r',
 'e',
 'p',
 'r',
 'e',
 's',
 'e',
 'n',
 't',
 'a',
 't',
 'i',
 'v',
 'e',
 's',
 ' ',
 'i',
 'n',
 ' ',
 'M',
 'a',
 'n',
 'c',
 'h',
 'e',
 's',
 't',
 'e',
 'r',
 ' ',
 'a',
 'i',
 'r',
 'p',
 'o',
 'r',
 't',
 '.',
 ' ',
 'L',
 'o',
 'g',
 'a',
 'n',
 'a',
 'i',
 'r',
 ' ',
 'c',
 'a',
 'n',
 'c',
 'e',
 'l',
 'l',
 'e',
 'd',
 ' ',
 't',
 'h',
 'e',
 ' ',
 'f',
 'l',
 'i',
 'g',
 'h',
 't',
 ' ',
 'b',
 'u',
 't',
 ' ',
 's',
 'o',
 'm',
 'e',
 'o',
 'n',
 'e',
 ' ',
 'a',
 't',
 ' ',
 'B',
 'A',
 ' ',
 'm',
 'a',
 'r',
 'k',
 'e',
 'd',
 ' ',
 't',
 'h',
 'e',
 ' ',
 'f',
 'l',
 'i',
 'g',
 'h',
 't',
 ' ',
 'a',
 's',
 ' ',
 'd',
 'e',
 'l',
 'a',
 'y',
 'e',
 'd',
 ' ',
 'b',
 'y',
 ' ',

In [116]:
# join back 
df_copy['no_punt'] = df_copy['no_punt'].apply(lambda x: ''.join(x))
df_copy.head()

Unnamed: 0,review_body,recommended,no_punt
0,Very good flight following an equally good fli...,yes,Very good flight following an equally good fli...
1,An hour's delay due to late arrival of the inc...,yes,An hour's delay due to late arrival of the inc...
2,I booked through BA because Loganair don’t hav...,no,I booked through BA because Loganair dont have...
3,British airways lost bags in LHR then found th...,no,British airways lost bags in LHR then found th...
4,The check in process and reward/loyalty progra...,no,The check in process and reward/loyalty progra...


In [97]:
df_copy['no_punt'][2]

'I booked through BA because Loganair dont have representatives in Manchester airport. Loganair cancelled the flight but someone at BA marked the flight as delayed by 52 minutes. Ive spent over 70 days chasing BAs complaints department who are at best totally ineffective. They wont put you through to the people dealing with the claim (apparently they dont have the facility to do so!), they wont phone Loganair, they wont look on Manchester airports historic flight website to confirm the cancellation, they wont put you through to a supervisor in fact they do very little but apologise for how little they do.'

In [131]:
def preprocess_reviews(reviews: pd.Series) -> pd.Series:
    cleaned = []
    for review in reviews:
        # replace the forward slash
        review = review.replace('/', ' ')
        # replace the hyphen
        review = review.replace('-', ' ')
        # replace the smart quotes
        review = review.replace('’', '')
        # remove the punctuation
        review = review.translate(str.maketrans('', '', string.punctuation))
        cleaned.append(review)
    
    cleaned_review = ''.join(cleaned)
    return cleaned_review

df_copy['cleaned'] = df['review_body'].apply(preprocess_reviews)
df_copy.head()
        


Unnamed: 0,review_body,recommended,cleaned
0,Very good flight following an equally good fli...,yes,Very good flight following an equally good fli...
1,An hour's delay due to late arrival of the inc...,yes,An hours delay due to late arrival of the inco...
2,I booked through BA because Loganair don’t hav...,no,I booked through BA because Loganair dont have...
3,British airways lost bags in LHR then found th...,no,British airways lost bags in LHR then found th...
4,The check in process and reward/loyalty progra...,no,The check in process and reward loyalty progra...


In [132]:
df_copy.tail()

Unnamed: 0,review_body,recommended,cleaned
3887,Business LHR to BKK. 747-400. First try back w...,no,Business LHR to BKK 747 400 First try back wit...
3890,LHR to HAM. Purser addresses all club passenge...,yes,LHR to HAM Purser addresses all club passenger...
3891,My son who had worked for British Airways urge...,no,My son who had worked for British Airways urge...
3892,London City-New York JFK via Shannon on A318 b...,no,London City New York JFK via Shannon on A318 b...
3893,SIN-LHR BA12 B747-436 First Class. Old aircraf...,no,SIN LHR BA12 B747 436 First Class Old aircraft...


Now we have our sentiments without punctuation and we can tokenize the words. This data does not seem too messy so we can use the default tokenizer from **CountVectorizer**. 

In [None]:
# split the data into training and testing


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# initialize vectorizer
tfidf_vectorizer = TfidfVectorizer()

