In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import numpy as np
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
lemma = WordNetLemmatizer()

#Take off display limitations on Pandas
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

dataset = pd.read_csv("customer reviews.csv")

In [2]:
dataset.head()

Unnamed: 0.1,Unnamed: 0,reviews,stars,date,country
0,0,✅ Trip Verified | Check in and security cleara...,5.0,18th September 2023,United Kingdom
1,1,Not Verified | British Airways has confirmed ...,6.0,16th September 2023,United Kingdom
2,2,✅ Trip Verified | Worst BA experience. I was s...,1.0,15th September 2023,South Africa
3,3,✅ Trip Verified | My daughter and I were deni...,2.0,13th September 2023,Canada
4,4,✅ Trip Verified | Despite boarding being the u...,1.0,11th September 2023,United Kingdom


In [3]:
dataset['verified'] = dataset.reviews.str.contains("Trip Verified")

In [4]:
dataset['verified']

0        True
1       False
2        True
3        True
4        True
5       False
6       False
7        True
8        True
9        True
10       True
11       True
12       True
13       True
14       True
15       True
16       True
17       True
18       True
19       True
20      False
21       True
22       True
23       True
24       True
25       True
26       True
27       True
28      False
29      False
30       True
31      False
32       True
33       True
34       True
35       True
36       True
37       True
38       True
39       True
40       True
41       True
42       True
43      False
44       True
45       True
46      False
47       True
48      False
49       True
50       True
51      False
52      False
53       True
54       True
55       True
56      False
57       True
58       True
59       True
60      False
61       True
62       True
63       True
64      False
65       True
66       True
67       True
68       True
69       True
70      False
71    

In [5]:
review_data = dataset.reviews.str.strip("✅ Trip Verified |")

corpus = []

for review in review_data:
    review = re.sub('[^a-zA-Z]',' ', review)
    review = review.lower()
    review = review.split()
    review = [lemma.lemmatize(word) for word in review if word not in set(stopwords.words("english"))]
    review = " ".join(review)
    corpus.append(review)

In [6]:
dataset['corpus'] = corpus

In [7]:
dataset.head()

Unnamed: 0.1,Unnamed: 0,reviews,stars,date,country,verified,corpus
0,0,✅ Trip Verified | Check in and security cleara...,5.0,18th September 2023,United Kingdom,True,check security clearance quick lhr wife used g...
1,1,Not Verified | British Airways has confirmed ...,6.0,16th September 2023,United Kingdom,False,verified british airway confirmed worst airlin...
2,2,✅ Trip Verified | Worst BA experience. I was s...,1.0,15th September 2023,South Africa,True,worst ba experience supposed fly italy septemb...
3,3,✅ Trip Verified | My daughter and I were deni...,2.0,13th September 2023,Canada,True,daughter denied boarding business class flight...
4,4,✅ Trip Verified | Despite boarding being the u...,1.0,11th September 2023,United Kingdom,True,despite boarding usual free lhr group called b...


In [8]:
dataset.dtypes

Unnamed: 0      int64
reviews        object
stars         float64
date           object
country        object
verified         bool
corpus         object
dtype: object

In [9]:
dataset.date.head()

0    18th September 2023
1    16th September 2023
2    15th September 2023
3    13th September 2023
4    11th September 2023
Name: date, dtype: object

In [10]:
dataset['date'] = dataset['date'].str.replace(r'\b(\d+)(st|nd|rd|th)\b', r'\1', regex=True)

In [11]:
dataset.head()

Unnamed: 0.1,Unnamed: 0,reviews,stars,date,country,verified,corpus
0,0,✅ Trip Verified | Check in and security cleara...,5.0,18 September 2023,United Kingdom,True,check security clearance quick lhr wife used g...
1,1,Not Verified | British Airways has confirmed ...,6.0,16 September 2023,United Kingdom,False,verified british airway confirmed worst airlin...
2,2,✅ Trip Verified | Worst BA experience. I was s...,1.0,15 September 2023,South Africa,True,worst ba experience supposed fly italy septemb...
3,3,✅ Trip Verified | My daughter and I were deni...,2.0,13 September 2023,Canada,True,daughter denied boarding business class flight...
4,4,✅ Trip Verified | Despite boarding being the u...,1.0,11 September 2023,United Kingdom,True,despite boarding usual free lhr group called b...


In [12]:
dataset['date'] = pd.to_datetime(dataset.date)

In [13]:
dataset.head()

Unnamed: 0.1,Unnamed: 0,reviews,stars,date,country,verified,corpus
0,0,✅ Trip Verified | Check in and security cleara...,5.0,2023-09-18,United Kingdom,True,check security clearance quick lhr wife used g...
1,1,Not Verified | British Airways has confirmed ...,6.0,2023-09-16,United Kingdom,False,verified british airway confirmed worst airlin...
2,2,✅ Trip Verified | Worst BA experience. I was s...,1.0,2023-09-15,South Africa,True,worst ba experience supposed fly italy septemb...
3,3,✅ Trip Verified | My daughter and I were deni...,2.0,2023-09-13,Canada,True,daughter denied boarding business class flight...
4,4,✅ Trip Verified | Despite boarding being the u...,1.0,2023-09-11,United Kingdom,True,despite boarding usual free lhr group called b...


In [14]:
dataset = dataset.reset_index(drop=True)

In [15]:
dataset.head()

Unnamed: 0.1,Unnamed: 0,reviews,stars,date,country,verified,corpus
0,0,✅ Trip Verified | Check in and security cleara...,5.0,2023-09-18,United Kingdom,True,check security clearance quick lhr wife used g...
1,1,Not Verified | British Airways has confirmed ...,6.0,2023-09-16,United Kingdom,False,verified british airway confirmed worst airlin...
2,2,✅ Trip Verified | Worst BA experience. I was s...,1.0,2023-09-15,South Africa,True,worst ba experience supposed fly italy septemb...
3,3,✅ Trip Verified | My daughter and I were deni...,2.0,2023-09-13,Canada,True,daughter denied boarding business class flight...
4,4,✅ Trip Verified | Despite boarding being the u...,1.0,2023-09-11,United Kingdom,True,despite boarding usual free lhr group called b...


In [16]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3500 entries, 0 to 3499
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   Unnamed: 0  3500 non-null   int64         
 1   reviews     3500 non-null   object        
 2   stars       3496 non-null   float64       
 3   date        3500 non-null   datetime64[ns]
 4   country     3498 non-null   object        
 5   verified    3500 non-null   bool          
 6   corpus      3500 non-null   object        
dtypes: bool(1), datetime64[ns](1), float64(1), int64(1), object(3)
memory usage: 167.6+ KB


In [17]:
#drop the rows using index where the country value is null
dataset.drop(dataset[dataset.stars.isnull() == True].index, axis=0, inplace=True)
dataset.drop(dataset[dataset.country.isnull() == True].index, axis=0, inplace=True)

In [18]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3494 entries, 0 to 3499
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   Unnamed: 0  3494 non-null   int64         
 1   reviews     3494 non-null   object        
 2   stars       3494 non-null   float64       
 3   date        3494 non-null   datetime64[ns]
 4   country     3494 non-null   object        
 5   verified    3494 non-null   bool          
 6   corpus      3494 non-null   object        
dtypes: bool(1), datetime64[ns](1), float64(1), int64(1), object(3)
memory usage: 194.5+ KB


In [20]:
dataset.reset_index(drop=True)

Unnamed: 0.1,Unnamed: 0,reviews,stars,date,country,verified,corpus
0,0,✅ Trip Verified | Check in and security cleara...,5.0,2023-09-18,United Kingdom,True,check security clearance quick lhr wife used g...
1,1,Not Verified | British Airways has confirmed ...,6.0,2023-09-16,United Kingdom,False,verified british airway confirmed worst airlin...
2,2,✅ Trip Verified | Worst BA experience. I was s...,1.0,2023-09-15,South Africa,True,worst ba experience supposed fly italy septemb...
3,3,✅ Trip Verified | My daughter and I were deni...,2.0,2023-09-13,Canada,True,daughter denied boarding business class flight...
4,4,✅ Trip Verified | Despite boarding being the u...,1.0,2023-09-11,United Kingdom,True,despite boarding usual free lhr group called b...
5,5,"Not Verified | Flight cancelled, no crew! 9th...",8.0,2023-09-10,United Kingdom,False,verified flight cancelled crew th september pe...
6,6,"Not Verified | The worst service ever, my bag...",1.0,2023-09-09,Kuwait,False,verified worst service ever baggage arrive tim...
7,7,✅ Trip Verified | 4/4 flights we booked this ...,1.0,2023-09-06,Germany,True,flight booked holiday delayed hour wifi availa...
8,8,✅ Trip Verified | British Airways has a total...,1.0,2023-09-04,United Kingdom,True,british airway total lack respect customer boo...
9,9,"✅ Trip Verified | London Heathrow to Keflavik,...",1.0,2023-09-04,Iceland,True,london heathrow keflavik iceland business clas...


In [21]:
dataset.to_csv('cleaned reviews.csv')