In [12]:
#imports

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

#regex
import re

In [13]:
#create a dataframe from csv file

cwd = os.getcwd()

df = pd.read_csv(cwd+"/BA_reviews.csv", index_col=0)

In [None]:
df.head()

Unnamed: 0,reviews,stars,date,country
0,Not Verified | Top Ten REASONS to not use Brit...,\n\t\t\t\t\t\t\t\t\t\t\t\t\t5,23rd May 2023,United States
1,Not Verified | Easy check in on the way to He...,1,23rd May 2023,Spain
2,✅ Trip Verified | Online check in worked fine...,10,23rd May 2023,Chile
3,✅ Trip Verified |. The BA first lounge at Term...,10,22nd May 2023,United Kingdom
4,Not Verified | Paid a quick visit to Nice yest...,2,22nd May 2023,United Kingdom


In [14]:
df['verified'] = df.reviews.str.contains("Trip Verified")


In [2]:
!pip install nltk

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [15]:
import nltk

In [19]:
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

**CLEANING REVIEWS**

In [20]:
#for lemmatization of words we will use nltk library
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
lemma = WordNetLemmatizer()


reviews_data = df.reviews.str.strip("✅ Trip Verified |")

#create an empty list to collect cleaned data corpus
corpus =[]

#loop through each review, remove punctuations, small case it, join it and add it to corpus
for rev in reviews_data:
    rev = re.sub('[^a-zA-Z]',' ', rev)
    rev = rev.lower()
    rev = rev.split()
    rev = [lemma.lemmatize(word) for word in rev if word not in set(stopwords.words("english"))]
    rev = " ".join(rev)
    corpus.append(rev)

In [21]:
# add the corpus to the original dataframe

df['corpus'] = corpus

In [22]:
df.head()

Unnamed: 0,reviews,stars,date,country,verified,corpus
0,Not Verified | Top Ten REASONS to not use Brit...,\n\t\t\t\t\t\t\t\t\t\t\t\t\t5,23rd May 2023,United States,False,verified top ten reason use british airway awa...
1,Not Verified | Easy check in on the way to He...,1,23rd May 2023,Spain,False,verified easy check way heathrow flight time i...
2,✅ Trip Verified | Online check in worked fine...,10,23rd May 2023,Chile,True,online check worked fine quick security check ...
3,✅ Trip Verified |. The BA first lounge at Term...,10,22nd May 2023,United Kingdom,True,ba first lounge terminal zoo pm dirty table us...
4,Not Verified | Paid a quick visit to Nice yest...,2,22nd May 2023,United Kingdom,False,verified paid quick visit nice yesterday heath...


**CLEANING FORMAT/DATE**

In [23]:
df.dtypes

reviews     object
stars       object
date        object
country     object
verified      bool
corpus      object
dtype: object

In [24]:
# convert the date to datetime format

df.date = pd.to_datetime(df.date)

In [25]:
df.date.head()

0   2023-05-23
1   2023-05-23
2   2023-05-23
3   2023-05-22
4   2023-05-22
Name: date, dtype: datetime64[ns]

**Cleaning Ratings with stars**

In [26]:
#check for unique values
df.stars.unique()

array(['\n\t\t\t\t\t\t\t\t\t\t\t\t\t5', '1', '10', '2', '4', '3', '5',
       '9', '7', '8', '6', 'None'], dtype=object)

In [27]:
# remove the \t and \n from the ratings
df.stars = df.stars.str.strip("\n\t\t\t\t\t\t\t\t\t\t\t\t\t")

In [38]:
df.stars.value_counts()

1     772
2     389
3     388
8     340
10    295
9     287
7     285
5     249
4     234
6     173
Name: stars, dtype: int64

**There are 4 rows having values "None" in the ratings. We will drop all these 4 rows**

In [29]:
# drop the rows where the value of ratings is None
df.drop(df[df.stars == "None"].index, axis=0, inplace=True)

In [30]:
#check the unique values again
df.stars.unique()

array(['5', '1', '10', '2', '4', '3', '9', '7', '8', '6'], dtype=object)

## **Check for null Values**

In [31]:
df.isnull().value_counts()

reviews  stars  date   country  verified  corpus
False    False  False  False    False     False     3412
                       True     False     False        2
dtype: int64

In [32]:
df.country.isnull().value_counts()

False    3412
True        2
Name: country, dtype: int64

In [33]:
#drop the rows using index where the country value is null
df.drop(df[df.country.isnull() == True].index, axis=0, inplace=True)

In [34]:
df.shape

(3412, 6)

In [35]:
#resetting the index
df.reset_index(drop=True)

Unnamed: 0,reviews,stars,date,country,verified,corpus
0,Not Verified | Top Ten REASONS to not use Brit...,5,2023-05-23,United States,False,verified top ten reason use british airway awa...
1,Not Verified | Easy check in on the way to He...,1,2023-05-23,Spain,False,verified easy check way heathrow flight time i...
2,✅ Trip Verified | Online check in worked fine...,10,2023-05-23,Chile,True,online check worked fine quick security check ...
3,✅ Trip Verified |. The BA first lounge at Term...,10,2023-05-22,United Kingdom,True,ba first lounge terminal zoo pm dirty table us...
4,Not Verified | Paid a quick visit to Nice yest...,2,2023-05-22,United Kingdom,False,verified paid quick visit nice yesterday heath...
...,...,...,...,...,...,...
3407,I flew AMS-LGW-MCO and JFK-LHR-AMS this month....,9,2014-07-01,Netherlands,False,flew am lgw mco jfk lhr am month overall accep...
3408,CPH-LHR 26th June. Club Europe. BA standards a...,6,2014-06-30,Denmark,False,cph lhr th june club europe ba standard slippi...
3409,London LHR to Hong Kong on 27 May. Was really ...,2,2014-06-30,United Kingdom,False,london lhr hong kong may really looking forwar...
3410,Flew LHR to Larnaca and return 22nd April and ...,3,2014-06-30,United Kingdom,False,flew lhr larnaca return nd april th april club...


In [37]:
# export the cleaned data

df.to_csv(cwd + "/cleaned-BA-reviews.csv")