In [1]:
from bs4 import BeautifulSoup
import pandas as pd
import requests
import numpy as np

In [2]:
# add your user agent 
#HEADERS = ({'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36'})

#Web Page Url
URL = "https://www.airlinequality.com/airline-reviews/emirates/"
pages = 11
page_size = 1000

reviews = []

for i in range(1, pages):
    print(f"Scrapping page {i}")
    
    # Create URL to collect links from paginated data
    url = f"{URL}/page/{i}/?sortby=post_date%3ADesc&pagesize={page_size}"

    # Collect HTML data from the chosen page
    response = requests.get(url)
    
    # Parse the content
    content = response.content
    parsed_content = BeautifulSoup(content, 'html.parser')
    for para in parsed_content.find_all("div", {"class": "text_content"}):
        reviews.append(para.get_text())
    
    print(f"   ---> {len(reviews)} total reviews")

Scrapping page 1
   ---> 1000 total reviews
Scrapping page 2
   ---> 2000 total reviews
Scrapping page 3
   ---> 2262 total reviews
Scrapping page 4
   ---> 2262 total reviews
Scrapping page 5
   ---> 2262 total reviews
Scrapping page 6
   ---> 2262 total reviews
Scrapping page 7
   ---> 2262 total reviews
Scrapping page 8
   ---> 2262 total reviews
Scrapping page 9
   ---> 2262 total reviews
Scrapping page 10
   ---> 2262 total reviews


In [3]:
#Save scrapping result into pandas DataFrame
df = pd.DataFrame()
df['reviews'] = reviews
df.head()
#df.to_csv('emirates_airline_reviews.csv', header=True, index=False)

Unnamed: 0,reviews
0,✅ Trip Verified | I have travelled a lot with...
1,✅ Trip Verified | Fantastic service from host...
2,✅ Trip Verified | This must be the worst food...
3,Not Verified | I strongly recommend flying wi...
4,"Not Verified | Unfortunately, my experience on..."


**Understanding the Pattern**

After scrapping the data, analys them to gain insights

In [4]:
df

Unnamed: 0,reviews
0,✅ Trip Verified | I have travelled a lot with...
1,✅ Trip Verified | Fantastic service from host...
2,✅ Trip Verified | This must be the worst food...
3,Not Verified | I strongly recommend flying wi...
4,"Not Verified | Unfortunately, my experience on..."
...,...
2257,Cape Town to Bangkok-departed on time excellen...
2258,Flew 6 flights recently in business with Emira...
2259,The fleets are new comfy. Flew Narita-Dubai-Mu...
2260,NBO-SIN-NBO. First leg was alright but nothing...


There are three types od pattern should we cleaned

**First condition with `✅ Trip Verified` words in every begining reviews**

For example, 

`✅ Trip Verified | I remember that we were very satisfied with this airline until a few years ago. But now it has been 3 times that we are very dissatisfied with the quality of food and the number of times served on a long flight! Why should we feel hungry on such a long flight and when we asked for food, they only give a small packet of crackers and we have to wait for several hours until it is time to serve their disgusting food? It was not like that before, that's why we surprised!`

**Second condition with `Not Verified` words in every begining reviews**

For example,

`Not Verified | I was pleasantly surprised by level of service and experience on this flight. Everything went smooth, food was very tasty, great choices of dishes and beverages. Overall, staff was very helpful always trying to accommodate the customer needs`

**Third condition with directly going to review**

`Flew from Durban to London Gatwick via Dubai. The first leg of the flight was awful - seat backrest was broken staff could not fix it so they moved us to two different seats which were even worse with a solid lump at the base of the backrests that made it impossible to get comfortable in any position on this long flight. The second leg was great in a two-week old aircraft where we could finally sit in comfort. Unfortunately the Durban-Dubai experience has put me off Emirates.`

**DATA CLEANSING**

In this part, I'm gonna drop unimportant data and focus on important part of data. The goal is to prepare well-clean data frame and ready to visualize

In [5]:
def process_reviews(reviews):
    if reviews.startswith('✅ Trip Verified'):
        return reviews.split('|', 1)[1]
    elif reviews.startswith('Not Verified'):
        return reviews.split('|', 1)[1]
    else:
        return reviews

In [6]:
#Applied process_reviews() function pada kolom reviews DataFrama
df['reviews'] = df['reviews'].apply(process_reviews)

In [7]:
df

Unnamed: 0,reviews
0,I have travelled a lot with Emirates but thi...
1,Fantastic service from hostess Malek from Tu...
2,This must be the worst food I have ever tast...
3,I strongly recommend flying with a different...
4,"Unfortunately, my experience on Emirates airl..."
...,...
2257,Cape Town to Bangkok-departed on time excellen...
2258,Flew 6 flights recently in business with Emira...
2259,The fleets are new comfy. Flew Narita-Dubai-Mu...
2260,NBO-SIN-NBO. First leg was alright but nothing...


In [8]:
#Import re (Regular Expression)
import re
def clean(text):
    # Removes all special characters and numericals leaving the alphabets
    text = re.sub('[^A-Za-z]+', ' ', str(text))
    return text

#Cleaned reviews columns
df['Cleaned Reviews'] = df['reviews'].apply(clean)
df

Unnamed: 0,reviews,Cleaned Reviews
0,I have travelled a lot with Emirates but thi...,I have travelled a lot with Emirates but this...
1,Fantastic service from hostess Malek from Tu...,Fantastic service from hostess Malek from Tun...
2,This must be the worst food I have ever tast...,This must be the worst food I have ever taste...
3,I strongly recommend flying with a different...,I strongly recommend flying with a different ...
4,"Unfortunately, my experience on Emirates airl...",Unfortunately my experience on Emirates airli...
...,...,...
2257,Cape Town to Bangkok-departed on time excellen...,Cape Town to Bangkok departed on time excellen...
2258,Flew 6 flights recently in business with Emira...,Flew flights recently in business with Emirate...
2259,The fleets are new comfy. Flew Narita-Dubai-Mu...,The fleets are new comfy Flew Narita Dubai Mun...
2260,NBO-SIN-NBO. First leg was alright but nothing...,NBO SIN NBO First leg was alright but nothing ...


In [9]:
import nltk

"""This punkt tokenizer divides a text into a list of sentences by using an unsupervised algorithm to build a model for abbreviation words, 
collocations, and words that start sentences. """

nltk.download('punkt')
from nltk.tokenize import word_tokenize
from nltk import pos_tag
nltk.download('stopwords')
from nltk.corpus import stopwords
nltk.download('wordnet')
from nltk.corpus import wordnet

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User_Acer\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User_Acer\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\User_Acer\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [10]:
#The nltk.corpus package defines a collection of corpus reader classes, which can be used to access the contents of a diverse set of corpora.


nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger')

# POS tagger dictionary
pos_dict = {'J':wordnet.ADJ, 'V':wordnet.VERB, 'N':wordnet.NOUN, 'R':wordnet.ADV}
def token_stop_pos(text):
    tags = pos_tag(word_tokenize(text))
    #print(tags)
    newlist = []
    for word, tag in tags:
        if word.lower() not in set(stopwords.words('english')):
          newlist.append(tuple([word, pos_dict.get(tag[0])]))
          #print(tag[0])
          #print(pos_dict.get(tag[0]))
    return newlist 

df['POS tagged'] = df['Cleaned Reviews'].apply(token_stop_pos)

df.head()

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\User_Acer\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\User_Acer\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


Unnamed: 0,reviews,Cleaned Reviews,POS tagged
0,I have travelled a lot with Emirates but thi...,I have travelled a lot with Emirates but this...,"[(travelled, v), (lot, n), (Emirates, n), (tim..."
1,Fantastic service from hostess Malek from Tu...,Fantastic service from hostess Malek from Tun...,"[(Fantastic, a), (service, n), (hostess, a), (..."
2,This must be the worst food I have ever tast...,This must be the worst food I have ever taste...,"[(must, None), (worst, a), (food, n), (ever, r..."
3,I strongly recommend flying with a different...,I strongly recommend flying with a different ...,"[(strongly, r), (recommend, v), (flying, v), (..."
4,"Unfortunately, my experience on Emirates airl...",Unfortunately my experience on Emirates airli...,"[(Unfortunately, r), (experience, n), (Emirate..."


In [11]:
# Obtaining the stem words – Lemmatization

from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()
def lemmatize(pos_data):
    lemma_rew = " "
    for word, pos in pos_data:
     if not pos:
        lemma = word
        lemma_rew = lemma_rew + " " + lemma
     else:
        lemma = wordnet_lemmatizer.lemmatize(word, pos=pos)
        lemma_rew = lemma_rew + " " + lemma
    return lemma_rew

df['Lemma'] = df['POS tagged'].apply(lemmatize)
df.head()

Unnamed: 0,reviews,Cleaned Reviews,POS tagged,Lemma
0,I have travelled a lot with Emirates but thi...,I have travelled a lot with Emirates but this...,"[(travelled, v), (lot, n), (Emirates, n), (tim...",travel lot Emirates time kind crew Gabriel I...
1,Fantastic service from hostess Malek from Tu...,Fantastic service from hostess Malek from Tun...,"[(Fantastic, a), (service, n), (hostess, a), (...",Fantastic service hostess Malek Tunisia Frie...
2,This must be the worst food I have ever tast...,This must be the worst food I have ever taste...,"[(must, None), (worst, a), (food, n), (ever, r...",must bad food ever taste airline trip Apart ...
3,I strongly recommend flying with a different...,I strongly recommend flying with a different ...,"[(strongly, r), (recommend, v), (flying, v), (...",strongly recommend fly different airline inf...
4,"Unfortunately, my experience on Emirates airl...",Unfortunately my experience on Emirates airli...,"[(Unfortunately, r), (experience, n), (Emirate...",Unfortunately experience Emirates airline ec...
