In [38]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import re

In [2]:
df = pd.read_csv("BA_reviews.csv", index_col=0)
df

Unnamed: 0,reviews,stars,date,country
0,Not Verified | The customer service is one of...,5.0,18th August 2024,United States
1,"Not Verified | Before my flight, I was forced ...",2.0,15th August 2024,United Kingdom
2,✅ Trip Verified | British Airways at its bes...,1.0,12th August 2024,United Kingdom
3,✅ Trip Verified | An excellent flight! Despite...,8.0,12th August 2024,Lebanon
4,✅ Trip Verified | I recently traveled with Bri...,8.0,11th August 2024,United States
...,...,...,...,...
3837,YYZ to LHR - July 2012 - I flew overnight in p...,9.0,29th August 2012,Canada
3838,LHR to HAM. Purser addresses all club passenge...,5.0,28th August 2012,United Kingdom
3839,My son who had worked for British Airways urge...,8.0,12th October 2011,United Kingdom
3840,London City-New York JFK via Shannon on A318 b...,2.0,11th October 2011,United States


In [3]:
df.head()

Unnamed: 0,reviews,stars,date,country
0,Not Verified | The customer service is one of...,5.0,18th August 2024,United States
1,"Not Verified | Before my flight, I was forced ...",2.0,15th August 2024,United Kingdom
2,✅ Trip Verified | British Airways at its bes...,1.0,12th August 2024,United Kingdom
3,✅ Trip Verified | An excellent flight! Despite...,8.0,12th August 2024,Lebanon
4,✅ Trip Verified | I recently traveled with Bri...,8.0,11th August 2024,United States


In [4]:
df.shape

(3842, 4)

In [5]:
df['verified'] = df.reviews.str.contains("Trip Verified")
df['verified']

0       False
1       False
2        True
3        True
4        True
        ...  
3837    False
3838    False
3839    False
3840    False
3841    False
Name: verified, Length: 3842, dtype: bool

In [6]:
df.shape

(3842, 5)

In [8]:
import nltk
nltk.download("stopwords")
nltk.download("wordnet")

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/rakibulhasan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/rakibulhasan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [9]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
lemma = WordNetLemmatizer()

reviews_data = df.reviews.str.strip("✅ Trip Verified |")

corpus = []

for rev in reviews_data:
    rev = re.sub("[^a-zA-Z]", " ", rev)
    rev = rev.lower()
    rev = rev.split()
    rev = [lemma.lemmatize(word) for word in rev if word not in set(stopwords.words("english"))]
    rev = " ".join(rev)
    corpus.append(rev)

In [10]:
df['corpus'] = corpus

In [11]:
df.head()

Unnamed: 0,reviews,stars,date,country,verified,corpus
0,Not Verified | The customer service is one of...,5.0,18th August 2024,United States,False,verified customer service one worst ever seen ...
1,"Not Verified | Before my flight, I was forced ...",2.0,15th August 2024,United Kingdom,False,verified flight forced ground staff check new ...
2,✅ Trip Verified | British Airways at its bes...,1.0,12th August 2024,United Kingdom,True,british airway best outstanding service flight...
3,✅ Trip Verified | An excellent flight! Despite...,8.0,12th August 2024,Lebanon,True,excellent flight despite hour flight configure...
4,✅ Trip Verified | I recently traveled with Bri...,8.0,11th August 2024,United States,True,recently traveled british airway mixed experie...


In [12]:
df.shape

(3842, 6)

In [13]:
df.dtypes

reviews      object
stars       float64
date         object
country      object
verified       bool
corpus       object
dtype: object

In [14]:
df.date = pd.to_datetime(df.date, format='mixed')

In [15]:
df.date.head()

0   2024-08-18
1   2024-08-15
2   2024-08-12
3   2024-08-12
4   2024-08-11
Name: date, dtype: datetime64[ns]

In [16]:
df.dtypes

reviews             object
stars              float64
date        datetime64[ns]
country             object
verified              bool
corpus              object
dtype: object

In [17]:
df.stars.unique()

array([ 5.,  2.,  1.,  8.,  3., 10.,  6.,  9.,  7.,  4., nan])

In [18]:
df.isnull().value_counts()

reviews  stars  date   country  verified  corpus
False    False  False  False    False     False     3834
         True   False  False    False     False        5
         False  False  True     False     False        3
Name: count, dtype: int64

In [19]:
df.stars.value_counts()

stars
1.0     912
2.0     437
3.0     423
8.0     377
10.0    332
7.0     316
9.0     315
5.0     276
4.0     251
6.0     198
Name: count, dtype: int64

In [20]:
df.isnull().sum()

reviews     0
stars       5
date        0
country     3
verified    0
corpus      0
dtype: int64

In [21]:
df.isnull().value_counts()

reviews  stars  date   country  verified  corpus
False    False  False  False    False     False     3834
         True   False  False    False     False        5
         False  False  True     False     False        3
Name: count, dtype: int64

In [22]:
df.drop(df[df.stars.isnull() == True].index, axis=0,inplace=True)
df.stars.isnull().sum()

np.int64(0)

In [23]:
df.stars.unique()

array([ 5.,  2.,  1.,  8.,  3., 10.,  6.,  9.,  7.,  4.])

In [24]:
df.isnull().sum()

reviews     0
stars       0
date        0
country     3
verified    0
corpus      0
dtype: int64

In [25]:
df.country.isnull().sum()

np.int64(3)

In [26]:
df.country.isnull().value_counts()

country
False    3834
True        3
Name: count, dtype: int64

In [27]:
df.drop(df[df.country.isnull() == True].index, axis=0, inplace=True)

In [28]:
df.isnull().sum()

reviews     0
stars       0
date        0
country     0
verified    0
corpus      0
dtype: int64

In [29]:
df.dtypes

reviews             object
stars              float64
date        datetime64[ns]
country             object
verified              bool
corpus              object
dtype: object

In [30]:
df.country.value_counts()

country
United Kingdom           2400
United States             439
Australia                 163
Canada                    123
Germany                    67
                         ... 
Costa Rica                  1
Cayman Islands              1
Saint Kitts and Nevis       1
Vietnam                     1
Oman                        1
Name: count, Length: 73, dtype: int64

In [31]:
df.shape

(3834, 6)

In [32]:
df.reset_index(drop=True, inplace=True)

In [33]:
df

Unnamed: 0,reviews,stars,date,country,verified,corpus
0,Not Verified | The customer service is one of...,5.0,2024-08-18,United States,False,verified customer service one worst ever seen ...
1,"Not Verified | Before my flight, I was forced ...",2.0,2024-08-15,United Kingdom,False,verified flight forced ground staff check new ...
2,✅ Trip Verified | British Airways at its bes...,1.0,2024-08-12,United Kingdom,True,british airway best outstanding service flight...
3,✅ Trip Verified | An excellent flight! Despite...,8.0,2024-08-12,Lebanon,True,excellent flight despite hour flight configure...
4,✅ Trip Verified | I recently traveled with Bri...,8.0,2024-08-11,United States,True,recently traveled british airway mixed experie...
...,...,...,...,...,...,...
3829,YYZ to LHR - July 2012 - I flew overnight in p...,9.0,2012-08-29,Canada,False,yyz lhr july flew overnight premium economy ch...
3830,LHR to HAM. Purser addresses all club passenge...,5.0,2012-08-28,United Kingdom,False,lhr ham purser address club passenger name boa...
3831,My son who had worked for British Airways urge...,8.0,2011-10-12,United Kingdom,False,son worked british airway urged fly british ai...
3832,London City-New York JFK via Shannon on A318 b...,2.0,2011-10-11,United States,False,london city new york jfk via shannon really ni...


In [34]:
df.to_csv("Cleaned_BA_reviews_1.csv")

In [35]:
df.shape

(3834, 6)

In [36]:
df.head()

Unnamed: 0,reviews,stars,date,country,verified,corpus
0,Not Verified | The customer service is one of...,5.0,2024-08-18,United States,False,verified customer service one worst ever seen ...
1,"Not Verified | Before my flight, I was forced ...",2.0,2024-08-15,United Kingdom,False,verified flight forced ground staff check new ...
2,✅ Trip Verified | British Airways at its bes...,1.0,2024-08-12,United Kingdom,True,british airway best outstanding service flight...
3,✅ Trip Verified | An excellent flight! Despite...,8.0,2024-08-12,Lebanon,True,excellent flight despite hour flight configure...
4,✅ Trip Verified | I recently traveled with Bri...,8.0,2024-08-11,United States,True,recently traveled british airway mixed experie...


In [37]:
df

Unnamed: 0,reviews,stars,date,country,verified,corpus
0,Not Verified | The customer service is one of...,5.0,2024-08-18,United States,False,verified customer service one worst ever seen ...
1,"Not Verified | Before my flight, I was forced ...",2.0,2024-08-15,United Kingdom,False,verified flight forced ground staff check new ...
2,✅ Trip Verified | British Airways at its bes...,1.0,2024-08-12,United Kingdom,True,british airway best outstanding service flight...
3,✅ Trip Verified | An excellent flight! Despite...,8.0,2024-08-12,Lebanon,True,excellent flight despite hour flight configure...
4,✅ Trip Verified | I recently traveled with Bri...,8.0,2024-08-11,United States,True,recently traveled british airway mixed experie...
...,...,...,...,...,...,...
3829,YYZ to LHR - July 2012 - I flew overnight in p...,9.0,2012-08-29,Canada,False,yyz lhr july flew overnight premium economy ch...
3830,LHR to HAM. Purser addresses all club passenge...,5.0,2012-08-28,United Kingdom,False,lhr ham purser address club passenger name boa...
3831,My son who had worked for British Airways urge...,8.0,2011-10-12,United Kingdom,False,son worked british airway urged fly british ai...
3832,London City-New York JFK via Shannon on A318 b...,2.0,2011-10-11,United States,False,london city new york jfk via shannon really ni...
