# Data Cleaning


##### After extracting data from the website,  reviews section will need to be cleaned for punctuations, spellings and other characters.

In [1]:
#imports

import pandas as pd
import matplotlib.pyplot as plt
import os

#regex
import re

In [2]:
!pip3 install seaborn



In [13]:
!pip3 install nltk



In [20]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\KIIT\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [27]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\KIIT\AppData\Roaming\nltk_data...


True

In [28]:
import seaborn as sns

In [29]:
import os
cwd = os.getcwd()

df = pd.read_csv(cwd+"/BA_reviews.csv", index_col=0)


In [30]:
df.head()

Unnamed: 0,reviews,date,stars,country
0,✅ Trip Verified | Probably the worst business ...,2nd January 2023,\n\t\t\t\t\t\t\t\t\t\t\t\t\t5,United States
1,"✅ Trip Verified | Definitely not recommended, ...",2nd January 2023,1,United States
2,✅ Trip Verified | BA shuttle service across t...,2nd January 2023,2,United Kingdom
3,✅ Trip Verified | I must admit like many other...,1st January 2023,8,United Kingdom
4,Not Verified | When will BA update their Busi...,30th December 2022,6,United Kingdom


Creating a column which mentions if the user is verified or not.

In [31]:
df['verified'] = df.reviews.str.contains("Trip Verified")

In [32]:
df['verified']

0        True
1        True
2        True
3        True
4       False
        ...  
3446    False
3447    False
3448    False
3449    False
3450    False
Name: verified, Length: 3451, dtype: bool

# Cleaning Reviews

##### Extracting the column of reviews into a separate dataframe and clean it for semantic analysis

In [33]:
#for lemmatization of words we will use nltk library
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
lemma = WordNetLemmatizer()


reviews_data = df.reviews.str.strip("✅ Trip Verified |")

#create an empty list to collect cleaned data corpus
corpus =[]

#loop through each review, remove punctuations, small case it, join it and add it to corpus
for rev in reviews_data:
    rev = re.sub('[^a-zA-Z]',' ', rev)
    rev = rev.lower()
    rev = rev.split()
    rev = [lemma.lemmatize(word) for word in rev if word not in set(stopwords.words("english"))]
    rev = " ".join(rev)
    corpus.append(rev)

In [34]:
df['corpus'] = corpus

In [35]:
df.head()

Unnamed: 0,reviews,date,stars,country,verified,corpus
0,✅ Trip Verified | Probably the worst business ...,2nd January 2023,\n\t\t\t\t\t\t\t\t\t\t\t\t\t5,United States,True,probably worst business class experience ever ...
1,"✅ Trip Verified | Definitely not recommended, ...",2nd January 2023,1,United States,True,definitely recommended especially business cla...
2,✅ Trip Verified | BA shuttle service across t...,2nd January 2023,2,United Kingdom,True,ba shuttle service across uk still surprisingl...
3,✅ Trip Verified | I must admit like many other...,1st January 2023,8,United Kingdom,True,must admit like many others tend avoid ba long...
4,Not Verified | When will BA update their Busi...,30th December 2022,6,United Kingdom,False,verified ba update business class cabin across...


# Cleaning anf Format data

In [36]:
df.dtypes

reviews     object
date        object
stars       object
country     object
verified      bool
corpus      object
dtype: object

In [37]:
# convert the date to datetime format

df.date = pd.to_datetime(df.date)

In [38]:
df.date.head()

0   2023-01-02
1   2023-01-02
2   2023-01-02
3   2023-01-01
4   2022-12-30
Name: date, dtype: datetime64[ns]

# Cleaning ratings with stars

In [39]:
df.stars.unique()

array(['\n\t\t\t\t\t\t\t\t\t\t\t\t\t5', '1', '2', '8', '6', '4', '3', '5',
       '9', '7', '10', 'None'], dtype=object)

In [40]:
df.stars = df.stars.str.strip("\n\t\t\t\t\t\t\t\t\t\t\t\t\t")

In [41]:
df.stars.value_counts()

1       743
2       387
3       384
8       352
10      307
7       303
9       294
5       259
4       231
6       186
None      5
Name: stars, dtype: int64

In [42]:
df.drop(df[df.stars == "None"].index, axis=0, inplace=True)

In [43]:
df.stars.unique()

array(['5', '1', '2', '8', '6', '4', '3', '9', '7', '10'], dtype=object)

In [44]:
df.isnull().value_counts()


reviews  date   stars  country  verified  corpus
False    False  False  False    False     False     3444
                       True     False     False        2
dtype: int64

In [45]:
df.country.isnull().value_counts()


False    3444
True        2
Name: country, dtype: int64

In [47]:
df.drop(df[df.country.isnull() == True].index, axis=0, inplace=True)

In [48]:
df.shape

(3444, 6)

In [49]:
df.reset_index(drop=True)

Unnamed: 0,reviews,date,stars,country,verified,corpus
0,✅ Trip Verified | Probably the worst business ...,2023-01-02,5,United States,True,probably worst business class experience ever ...
1,"✅ Trip Verified | Definitely not recommended, ...",2023-01-02,1,United States,True,definitely recommended especially business cla...
2,✅ Trip Verified | BA shuttle service across t...,2023-01-02,2,United Kingdom,True,ba shuttle service across uk still surprisingl...
3,✅ Trip Verified | I must admit like many other...,2023-01-01,8,United Kingdom,True,must admit like many others tend avoid ba long...
4,Not Verified | When will BA update their Busi...,2022-12-30,6,United Kingdom,False,verified ba update business class cabin across...
...,...,...,...,...,...,...
3439,YYZ to LHR - July 2012 - I flew overnight in p...,2012-08-29,7,Canada,False,yyz lhr july flew overnight premium economy ch...
3440,LHR to HAM. Purser addresses all club passenge...,2012-08-28,1,United Kingdom,False,lhr ham purser address club passenger name boa...
3441,My son who had worked for British Airways urge...,2011-10-12,9,United Kingdom,False,son worked british airway urged fly british ai...
3442,London City-New York JFK via Shannon on A318 b...,2011-10-11,8,United States,False,london city new york jfk via shannon really ni...


In [50]:
df.to_csv(cwd + "/cleaned-BA-reviews.csv")