<a href="https://colab.research.google.com/github/saulolvieira/project_1_ws_ba/blob/main/ba_data_input_pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Importando as bibliotecas

In [None]:
import pandas as pd
import requests
from bs4 import BeautifulSoup


URL do site indicado: "https://www.airlinequality.com/airline-reviews/british-airways/page/1/?sortby=post_date%3ADesc&pagesize=100"

# Funções

In [None]:
# Função para buscar as informações básicas de cada comentário ( nota, comentário e data)
def get_basic(url):
  response = requests.get(url)
  soup = BeautifulSoup(response.content, 'html.parser')
  comments = soup.find_all('article', attrs={'itemprop': 'review'})

  # Buscando informações básicas
  data_basic = []
  for comment in comments:
      row = {}
      row['review_id'] = comment.find('div', class_='body')['id']
      row['date'] = comment.find('meta')['content']
      score_elem = comment.find('span', attrs={'itemprop': 'ratingValue'})
      if score_elem:
        row['score'] = score_elem.text
      else:
        row['score'] = 0
      row['resume'] = comment.find('h2', class_='text_header').text
      row['review'] = comment.find('div', class_='text_content').text
      data_basic.append(row)

  df_basic = pd.DataFrame(data_basic)
  return df_basic

In [None]:
# Função para buscar todas as informações analíticas de cada comentário de cada página (tabela depois do comentário)

def get_analytical(url):
  response = requests.get(url)
  soup = BeautifulSoup(response.content, 'html.parser')
  comments = soup.find_all('article', attrs={'itemprop': 'review'})

  # Buscando informações básicas


  # Encontrando todos os ids

  id_ = []
  for comment in comments:
    row = {}
    row['id'] = comment.find('div', class_='body')['id']
    id_.append(row)

  df_id = pd.DataFrame(id_)


  # Encontrar todos os parâmetros de todos os id's do dataframe:

  df_results = pd.DataFrame()

  for i in df_id['id']:
    # Conteúdo da página para cada id
    div = soup.find('div', {'class': 'body', 'id': i})
    div = BeautifulSoup(str(div), 'html.parser')

    # Listas vazias para adicionar os valores de cada coluna
    titles = []
    values = []
    review_id = i

    # encontre todas as tags <tr> de cada elemento da tabela
    rows = [element.find_all('tr') for element in div]
    # desempacote a lista de listas em uma única lista
    rows = [tr for sublist in rows for tr in sublist]

  # extrair títulos e valores da tabela
    for row in rows:
        title_element = row.find('td', {'class': 'review-rating-header'})
        value_element = row.find('td', {'class': 'review-value'})

        if title_element is not None and value_element is not None:
            title = title_element.text.strip()
            value = value_element.text.strip()
            titles.append(title)
            values.append(value)

        # caso exista a classe review-rating-stars, também é possível coletar as informações
        rating_stars = row.find('td', {'class': 'review-rating-stars'})
        if rating_stars is not None:
            title = title_element.text.strip()
            star_spans = rating_stars.find_all('span', {'class': 'star'})
            star_fill_spans = [span for span in star_spans if 'fill' in span['class']]
            value = str(len(star_fill_spans))
            titles.append(title)
            values.append(value)

    # criar um dicionário com as listas de títulos e valores
    data = {'review_id': review_id, 'titles': titles, 'values': values}
    df = pd.DataFrame(data)
    df_results = pd.concat([df_results, df])

  df_analytical = df_results.pivot(index='review_id', columns='titles', values='values')

  return df_analytical

# Iterações

In [None]:
#Buscando as infomações analíticas de todos os comentários da página

# define a quantidade de páginas a serem percorridas
num_pages = 36
all_results = []

# loop para chamar a função para cada página
for i in range(1, num_pages+1):
    url = f"https://www.airlinequality.com/airline-reviews/british-airways/page/{i}/?sortby=post_date%3ADesc&pagesize=100"
    scraps = get_analytical(url)
    all_results.append(scraps)

# concatena todos os dataframes em um único dataframe
df = pd.concat(all_results)

df = df.reset_index()
# imprime o dataframe final

df

titles,review_id,Aircraft,Cabin Staff Service,Date Flown,Food & Beverages,Ground Service,Inflight Entertainment,Recommended,Route,Seat Comfort,Seat Type,Type Of Traveller,Value For Money,Wifi & Connectivity
0,anchor840711,,2,March 2023,1,1,1,no,Zagreb to London Heathrow,3,Economy Class,Solo Leisure,2,1
1,anchor840775,A320,3,March 2023,2,5,,yes,Gatwick to Dubrovnik,1,Economy Class,Couple Leisure,5,
2,anchor840777,,1,March 2023,1,1,,no,London Heathrow to Amsterdam,2,Economy Class,Business,1,
3,anchor840997,,3,March 2023,3,1,,no,Heathrow to Belfast City,3,Economy Class,Couple Leisure,1,
4,anchor841156,,3,March 2023,,1,,no,Munich to London Heathrow,3,Economy Class,Solo Leisure,1,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3590,anchor244109,,3,,1,,4,no,,2,Economy Class,,3,
3591,anchor244110,,3,,3,,4,yes,,4,Business Class,,0,
3592,anchor244116,,3,,3,,4,yes,,4,Premium Economy,,4,
3593,anchor244120,,3,,2,,4,yes,,4,Business Class,,3,


In [None]:
#Buscando as infomações básicas de todos os comentários da página

num_pages = 36
all_results2 = []

# loop para chamar a função para cada página
for i in range(1, num_pages+1):
    url2 = f"https://www.airlinequality.com/airline-reviews/british-airways/page/{i}/?sortby=post_date%3ADesc&pagesize=100"
    scraps2 = get_basic(url2)
    all_results2.append(scraps2)

# concatena todos os dataframes em um único dataframe
df2 = pd.concat(all_results2)

df2 = df2.reset_index()
# imprime o dataframe final

df2

Unnamed: 0,index,review_id,date,score,resume,review
0,0,anchor861589,2023-07-06,4,"""short-changing passengers""",Not Verified | BA is not treating its premium ...
1,1,anchor861382,2023-07-05,1,"""Economy is absolutely awful""",✅ Trip Verified | 24 hours before our departu...
2,2,anchor861357,2023-07-05,1,"""Shocking customer service""",✅ Trip Verified | We arrived at Heathrow at 0...
3,3,anchor861255,2023-07-04,3,"""no representative to help""",✅ Trip Verified | Original flight was cancell...
4,4,anchor861048,2023-07-03,3,"""Boarding was chaotic""",Not Verified | Airport check in was functiona...
...,...,...,...,...,...,...
3590,90,anchor244110,2012-08-29,6,British Airways customer review,HKG-LHR in New Club World on Boeing 777-300 - ...
3591,91,anchor243823,2012-08-28,9,British Airways customer review,LHR to HAM. Purser addresses all club passenge...
3592,92,anchor243776,2011-10-12,5,British Airways customer review,My son who had worked for British Airways urge...
3593,93,anchor243824,2011-10-11,4,British Airways customer review,London City-New York JFK via Shannon on A318 b...


# Consolidação final

In [None]:
df_final = pd.merge(df2, df, on = 'review_id', how = 'inner')
df_final

Unnamed: 0,index,review_id,date,score,resume,review,Aircraft,Cabin Staff Service,Date Flown,Food & Beverages,Ground Service,Inflight Entertainment,Recommended,Route,Seat Comfort,Seat Type,Type Of Traveller,Value For Money,Wifi & Connectivity
0,0,anchor861589,2023-07-06,4,"""short-changing passengers""",Not Verified | BA is not treating its premium ...,Boeing 777 -200,4,June 2023,4,3,3,no,Kingston to London,5,Premium Economy,Family Leisure,3,
1,1,anchor861382,2023-07-05,1,"""Economy is absolutely awful""",✅ Trip Verified | 24 hours before our departu...,Boeing 777-200,3,June 2023,1,3,3,no,London Heathrow to Cape Town,1,Economy Class,Couple Leisure,2,
2,2,anchor861357,2023-07-05,1,"""Shocking customer service""",✅ Trip Verified | We arrived at Heathrow at 0...,,,July 2023,,1,,no,London Heathrow to Ibiza,,Economy Class,Couple Leisure,1,
3,3,anchor861255,2023-07-04,3,"""no representative to help""",✅ Trip Verified | Original flight was cancell...,A380,3,June 2023,3,1,3,no,Washington to London,3,Business Class,Business,1,
4,4,anchor861048,2023-07-03,3,"""Boarding was chaotic""",Not Verified | Airport check in was functiona...,A321,4,June 2023,1,2,,no,London Heathrow to Naples,1,Economy Class,Couple Leisure,2,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3593,90,anchor244110,2012-08-29,6,British Airways customer review,HKG-LHR in New Club World on Boeing 777-300 - ...,,3,,3,,4,yes,,4,Business Class,,0,
3594,91,anchor243823,2012-08-28,9,British Airways customer review,LHR to HAM. Purser addresses all club passenge...,,5,,4,,0,yes,,4,Business Class,,3,
3595,92,anchor243776,2011-10-12,5,British Airways customer review,My son who had worked for British Airways urge...,,,,,,,yes,,,Economy Class,,4,
3596,93,anchor243824,2011-10-11,4,British Airways customer review,London City-New York JFK via Shannon on A318 b...,,3,,5,,0,no,,1,Premium Economy,,1,


In [None]:
# Export step, currently disabled: uncomment the line below to write the merged
# dataset to a CSV file (index=False leaves the row index out of the file).
#df_final.to_csv('analytical_british_airways.csv', index=False)