In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [3]:
df = pd.read_csv("reviews_data.csv")

In [4]:
df.head()

Unnamed: 0,Reviews,Stars,Date,Country
0,Not Verified | I did not actually get to fly w...,5.0,5th February 2024,United Kingdom
1,✅ Trip Verified | We had possibly the worse ch...,1.0,2nd February 2024,United Kingdom
2,✅ Trip Verified | I flew to LHR from ATH in C...,6.0,30th January 2024,Japan
3,✅ Trip Verified | I like the British Airways ...,9.0,29th January 2024,United Kingdom
4,✅ Trip Verified | I have come to boarding and...,8.0,28th January 2024,Ukraine


### 1.Reviews Column

In [5]:
# Print the first row of the "Reviews" column
print(df['Reviews'].iloc[0])

Not Verified | I did not actually get to fly with BA as they cancelled the flight with 3 days notice and refused to offer an acceptable alternative, resulting in 2 people losing their holiday (a river cruise). Since then, as appears to be their standard protocol, they have ignored every communication regarding their obligations for compensation and I have no option but to pursue them via official channels. 


Spliting the review column into two new columns:
* one for verification purposes boolean value (Trip Verified - True, Trip Not Verified )
* Second column consisting of just reviews in lower case.

In [6]:
# Check if the 'Reviews' column contains the text "Verified" and create a boolean column 'Verified'
df['Verified'] = df['Reviews'].str.contains('Trip Verified', case=False)

# Display the modified DataFrame
print(df.head())

                                             Reviews  Stars  \
0  Not Verified | I did not actually get to fly w...    5.0   
1  ✅ Trip Verified | We had possibly the worse ch...    1.0   
2  ✅ Trip Verified |  I flew to LHR from ATH in C...    6.0   
3  ✅ Trip Verified |  I like the British Airways ...    9.0   
4  ✅ Trip Verified |  I have come to boarding and...    8.0   

                Date         Country  Verified  
0  5th February 2024  United Kingdom     False  
1  2nd February 2024  United Kingdom      True  
2  30th January 2024           Japan      True  
3  29th January 2024  United Kingdom      True  
4  28th January 2024         Ukraine      True  


In [7]:
import nltk
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\grahu\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\grahu\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [8]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import re

# Create an instance of WordNetLemmatizer
lemma = WordNetLemmatizer() 

# Assuming 'Reviews' is the correct column name in your DataFrame
reviews_data = df['Reviews'].str.replace('✅ Trip Verified |', '')

# Create an empty list to collect cleaned data
processed_reviews = []

# Loop through each review, remove punctuations, convert to lowercase, lemmatize, and add to processed_reviews
for rev in reviews_data:
    rev = re.sub('[^a-zA-Z]', ' ', rev)
    rev = rev.lower()
    rev = rev.split()
    rev = [lemma.lemmatize(word) for word in rev if word not in set(stopwords.words("english"))]
    rev = " ".join(rev)
    processed_reviews.append(rev)

In [9]:
df['New_reviews'] = processed_reviews

In [10]:
df.head()

Unnamed: 0,Reviews,Stars,Date,Country,Verified,New_reviews
0,Not Verified | I did not actually get to fly w...,5.0,5th February 2024,United Kingdom,False,verified actually get fly ba cancelled flight ...
1,✅ Trip Verified | We had possibly the worse ch...,1.0,2nd February 2024,United Kingdom,True,possibly worse check experience ever ever took...
2,✅ Trip Verified | I flew to LHR from ATH in C...,6.0,30th January 2024,Japan,True,flew lhr ath club europe ba dec transfer jal f...
3,✅ Trip Verified | I like the British Airways ...,9.0,29th January 2024,United Kingdom,True,like british airway world traveller plus produ...
4,✅ Trip Verified | I have come to boarding and...,8.0,28th January 2024,Ukraine,True,come boarding cabin luggage taken plane full a...


### 2. Date Column

In [11]:
# convert the date to datetime format
from dateutil import parser
df['Date'] = df['Date'].apply(lambda x: parser.parse(x, fuzzy=True))

In [12]:
df.Date.head()

0   2024-02-05
1   2024-02-02
2   2024-01-30
3   2024-01-29
4   2024-01-28
Name: Date, dtype: datetime64[ns]

### 3. Stars Column

In [13]:
#check for unique values
df.Stars.unique()

array([ 5.,  1.,  6.,  9.,  8.,  3.,  2.,  4., 10.,  7., nan])

In [14]:
df.Stars.value_counts()

Stars
1.0     845
2.0     405
3.0     394
8.0     344
10.0    288
9.0     281
7.0     276
5.0     248
4.0     240
6.0     175
Name: count, dtype: int64

### Checking for missing values

In [15]:
# Check for missing values in the entire DataFrame
missing_values = df.isnull().sum()

# Print the count of missing values for each column
print("Missing Values:\n", missing_values)

Missing Values:
 Reviews        0
Stars          4
Date           0
Country        2
Verified       0
New_reviews    0
dtype: int64


In [18]:
# Drop missing values
# Drop rows with missing values in the specified columns
columns_to_check = ['Stars', 'Country']  # Add other columns if needed
df_cleaned = df.dropna(subset=columns_to_check)

#resetting the index
df_cleaned.reset_index(drop=True)

Unnamed: 0,Reviews,Stars,Date,Country,Verified,New_reviews
0,Not Verified | I did not actually get to fly w...,5.0,2024-02-05,United Kingdom,False,verified actually get fly ba cancelled flight ...
1,✅ Trip Verified | We had possibly the worse ch...,1.0,2024-02-02,United Kingdom,True,possibly worse check experience ever ever took...
2,✅ Trip Verified | I flew to LHR from ATH in C...,6.0,2024-01-30,Japan,True,flew lhr ath club europe ba dec transfer jal f...
3,✅ Trip Verified | I like the British Airways ...,9.0,2024-01-29,United Kingdom,True,like british airway world traveller plus produ...
4,✅ Trip Verified | I have come to boarding and...,8.0,2024-01-28,Ukraine,True,come boarding cabin luggage taken plane full a...
...,...,...,...,...,...,...
3489,LHR-PHL on a 787 Dreamliner seat 3K. I recomme...,1.0,2014-09-06,United Kingdom,False,lhr phl dreamliner seat k recommend seat k k g...
3490,Travelled with BA from Rome to LHR. Staff at R...,8.0,2014-09-06,United Kingdom,False,travelled ba rome lhr staff rome wonderful con...
3491,London to Budapest on Club World-now I don't e...,2.0,2014-09-06,Australia,False,london budapest club world even mention fact l...
3492,We flew Premium Economy on BA from Boston to L...,8.0,2014-09-06,United States,False,flew premium economy ba boston london london j...


In [19]:
df_cleaned.dtypes

Reviews                object
Stars                 float64
Date           datetime64[ns]
Country                object
Verified                 bool
New_reviews            object
dtype: object

In [20]:
df_cleaned.to_csv('cleaned_data.csv', index=False)