## Scraping Data

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [2]:
# Scrape the newest British Airways reviews, `page_size` reviews per page.
base_url = "https://www.airlinequality.com/airline-reviews/british-airways"
pages = 10
page_size = 100
request_timeout = 30  # seconds; without a timeout a stalled request blocks forever

reviews = []

# One shared session keeps the connection alive across the page requests.
with requests.Session() as session:
    for i in range(1, pages + 1):
        print(f'Scraping page {i}')

        url = f'{base_url}/page/{i}/?sortby=post_date%3ADesc&pagesize={page_size}'

        response = session.get(url, timeout=request_timeout)
        # Fail loudly on a 4xx/5xx answer instead of silently parsing an
        # error page and ending up with zero reviews for that page.
        response.raise_for_status()

        content = response.text
        parsed_content = BeautifulSoup(content, 'lxml')
        # Each review body sits in a <div class="text_content"> element.
        for para in parsed_content.find_all("div", class_="text_content"):
            reviews.append(para.get_text())

Scraping page 1
Scraping page 2
Scraping page 3
Scraping page 4
Scraping page 5
Scraping page 6
Scraping page 7
Scraping page 8
Scraping page 9
Scraping page 10


In [3]:
# Wrap the scraped review strings in a single-column frame.
df = pd.DataFrame({"reviews": reviews})
df.head()

Unnamed: 0,reviews
0,✅ Trip Verified | Easy check in a T5. Galleri...
1,"Not Verified | Flight delayed by an hour, it ..."
2,Not Verified | The staff are very rude and not...
3,✅ Trip Verified | Good domestic flight operat...
4,Not Verified | Failed at all basic travel fund...


In [4]:
# Each scraped review looks like "<verification status> | <review text>".
# Split on the FIRST "|" only (n=1) so a pipe inside the review body does not
# truncate it, and keep the last piece so a review without a status prefix is
# preserved whole instead of becoming NaN (or raising KeyError when no row
# contains a "|", e.g. if this cell is run a second time).
df.reviews = df.reviews.str.split('|', n=1).str[-1]
df

Unnamed: 0,reviews
0,Easy check in a T5. Galleries south and Nort...
1,"Flight delayed by an hour, it happens, no bi..."
2,The staff are very rude and not trained prope...
3,Good domestic flight operated by BA Cityflye...
4,Failed at all basic travel fundamentals: 1) O...
...,...
995,Johannesburg to London. I tend to stay with ...
996,Singapore to London Heathrow. It's my first ...
997,London to Los Angeles. Booked this trip eigh...
998,Miami to London Heathrow. As with many other...


## Processing Data

### Cleaning text

In [5]:
import re

def clean(text):
    """Replace every run of non-letter characters in ``text`` with one space.

    Args:
        text: Value to clean; it is converted with ``str()`` first, so
            non-string input (e.g. ``None`` or NaN) is cleaned as its
            string representation.

    Returns:
        str: The text with only ASCII letters kept, each run of other
        characters collapsed into a single space.
    """
    # "A-Z", not "A-z": the ASCII range A-z also spans [ \ ] ^ _ and `,
    # so with "A-z" those punctuation characters are never removed.
    return re.sub(r'[^a-zA-Z]+', ' ', str(text))

# Run the cleaner over every review and store the result in a new column.
df['Cleaned Reviews'] = df['reviews'].map(clean)
df.head()

Unnamed: 0,reviews,Cleaned Reviews
0,Easy check in a T5. Galleries south and Nort...,Easy check in a T Galleries south and North l...
1,"Flight delayed by an hour, it happens, no bi...",Flight delayed by an hour it happens no biggi...
2,The staff are very rude and not trained prope...,The staff are very rude and not trained prope...
3,Good domestic flight operated by BA Cityflye...,Good domestic flight operated by BA Cityflyer...
4,Failed at all basic travel fundamentals: 1) O...,Failed at all basic travel fundamentals Our f...


In [6]:
import nltk
from nltk import pos_tag
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize

# Fetch the NLTK data packages this notebook relies on
# (nltk.download reports a package as up-to-date when it is already present).
for nltk_resource in ("punkt", "stopwords", "wordnet"):
    nltk.download(nltk_resource)

[nltk_data] Downloading package punkt to C:\Users\Raghav
[nltk_data]     Goel\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Raghav
[nltk_data]     Goel\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Raghav
[nltk_data]     Goel\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [15]:
# Additional NLTK data packages needed for the tagging step.
for nltk_resource in ('omw-1.4', 'averaged_perceptron_tagger'):
    nltk.download(nltk_resource)

# Maps the first character of a tag returned by pos_tag to a WordNet
# part-of-speech constant; any other first character has no entry.
pos_dict = {
    'J': wordnet.ADJ,
    'V': wordnet.VERB,
    'N': wordnet.NOUN,
    'R': wordnet.ADV,
}
def token_stop_pos(text):
    """Tokenize ``text``, drop English stop words and tag the remaining words.

    Args:
        text: The string to process.

    Returns:
        list: ``(word, pos)`` tuples in their original order, where ``pos``
        is the ``pos_dict`` entry for the first character of the word's
        tag, or ``None`` when that character has no entry in ``pos_dict``.
    """
    # Build the stop-word set once per call rather than once per token:
    # stopwords.words() returns a fresh list every time it is called.
    stop_words = set(stopwords.words('english'))
    # Tag before filtering so the tagger sees the full sentence context.
    tags = pos_tag(word_tokenize(text))
    return [
        (word, pos_dict.get(tag[0]))
        for word, tag in tags
        if word.lower() not in stop_words
    ]
    
# Tag every cleaned review and keep the (word, pos) lists in a new column.
df["POS Tagged"] = df['Cleaned Reviews'].map(token_stop_pos)
df.head()

[nltk_data] Downloading package omw-1.4 to C:\Users\Raghav
[nltk_data]     Goel\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Raghav Goel\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


Unnamed: 0,reviews,Cleaned Reviews,POS Tagged
0,Easy check in a T5. Galleries south and Nort...,Easy check in a T Galleries south and North l...,"[(Easy, a), (check, n), (Galleries, n), (south..."
1,"Flight delayed by an hour, it happens, no bi...",Flight delayed by an hour it happens no biggi...,"[(Flight, n), (delayed, v), (hour, n), (happen..."
2,The staff are very rude and not trained prope...,The staff are very rude and not trained prope...,"[(staff, n), (rude, a), (trained, v), (properl..."
3,Good domestic flight operated by BA Cityflye...,Good domestic flight operated by BA Cityflyer...,"[(Good, a), (domestic, a), (flight, n), (opera..."
4,Failed at all basic travel fundamentals: 1) O...,Failed at all basic travel fundamentals Our f...,"[(Failed, v), (basic, a), (travel, n), (fundam..."
