# 可以自由挑時間的爬蟲

# Imports

In [1]:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
from datetime import datetime
import math
import os 

In [2]:
# settings
# Use the fully qualified option key: on newer pandas releases the bare
# 'precision' pattern also matches 'styler.format.precision', so the short
# form is rejected as ambiguous.
pd.set_option('display.precision', 0)

# Functions

In [3]:
def get_airlines() -> list:
    '''
    Collect the first-page review URL of every airline on the A-Z index page.

    Returns:
        list: one URL per airline link found in the first 'tabs-content'
        container. Each URL points at page 1 of that airline's reviews,
        sorted by post date descending with 100 reviews per page. The list
        is empty when the index page has no 'tabs-content' container.
    '''

    # index page that links to every airline's review page
    index_url = 'https://www.airlinequality.com/review-pages/a-z-airline-reviews/'
    response = requests.get(index_url)
    soup = BeautifulSoup(response.text, "html.parser")

    base_url = 'https://www.airlinequality.com/airline-reviews/'
    first_page_query = '/page/1/?sortby=post_date%3ADesc&pagesize=100'
    # compiled once; applied to the serialized <a> tag, hence the closing quote
    slug_pattern = re.compile(r'airline-reviews/(\S*)"')

    review_url_list = []

    tabs = soup.findAll('div', {'class': 'tabs-content'})
    if not tabs:
        return review_url_list

    for tab in tabs[0].findAll('div', {'class': 'content'}):
        for link in tab.find_all('a', href=True):
            match = slug_pattern.search(str(link))
            if match is None:
                # not a link to an airline review page; skip instead of crashing
                continue
            review_url_list.append(base_url + match.group(1) + first_page_query)

    return review_url_list

In [4]:
# number of review pages the airline has
def get_pages(url: str, page_size: int = 100) -> int:
    '''
    Work out how many result pages an airline's review listing spans.

    Args:
        url: review listing URL to fetch.
        page_size: number of reviews shown per page; must agree with the
            'pagesize' value used in the listing URLs (100 by default).

    Returns:
        int: total review count divided by page_size, rounded up. Returns 1
        when the page has no 'pagination-total' element or its text does not
        contain a review total.
    '''

    response = requests.get(url)
    soup = BeautifulSoup(response.text, "html.parser")

    total_div = soup.find('div', {'class': 'pagination-total'})
    if total_div is None:
        # no pagination summary: everything fits on a single page
        return 1

    # summary text looks like "1 to 100 of 1218 Reviews"
    match = re.search(r'\d+ to \d+ of (\d+) Reviews', total_div.text)
    if match is None:
        return 1

    return math.ceil(int(match.group(1)) / page_size)

In [5]:
def has_new_review(url: str, date: str) -> bool:
    '''
    Tell whether the first review date found on the page is on or after `date`.

    Args:
        url: review listing URL to fetch (callers pass the newest-first page).
        date: cut-off date as a 'YYYY-MM-DD' string.

    Returns:
        bool: True when the first 'datePublished' entry on the page is on or
        after `date`; False otherwise, including when the page has no such
        entry at all.
    '''

    response = requests.get(url)
    soup = BeautifulSoup(response.text, "html.parser")

    # first review date on the page
    newest_tag = soup.find('time', {'itemprop': 'datePublished'})
    if newest_tag is None:
        # page lists no reviews, so there is nothing new
        return False

    # drop the ordinal suffix ("1st" -> "1") so strptime can parse the day
    cleaned = re.sub(r'(\d)(st|nd|rd|th)', r'\1', newest_tag.text)
    newest = str(datetime.strptime(cleaned, '%d %B %Y'))

    # both sides start with 'YYYY-MM-DD', so string order equals date order
    return newest >= date

In [6]:
def get_review(url: str, airline: str) -> list:
    '''
    Scrape all reviews found on one listing page.

    Args:
        url: URL of the listing page to fetch.
        airline: airline identifier stored in every record.

    Returns:
        list: one dict per review with the keys 'airline', 'author',
        'datePublished' (datetime), 'reviewText', 'verified', plus one key
        per rating-table row. 'ratingValue' is present only when the review
        carries an overall rating. Reviews whose markup cannot be parsed are
        skipped. The list is empty when the page has no <article> element.
    '''

    response = requests.get(url)
    soup = BeautifulSoup(response.text, "html.parser")

    review_list = []

    # individual reviews are <article> tags nested inside the first <article>
    outer_articles = soup.findAll('article')
    if not outer_articles:
        return review_list

    for article in outer_articles[0].findAll('article'):
        record = {'airline': airline}
        try:
            # the overall rating is optional: leave the key out when absent
            rating_tag = article.find('span', {'itemprop': 'ratingValue'})
            if rating_tag is not None and rating_tag.text != 'None':
                record['ratingValue'] = int(rating_tag.text)

            record['author'] = article.find('span', {'itemprop': 'name'}).text.strip()

            raw_date = article.find('time', {'itemprop': 'datePublished'}).text
            # drop the ordinal suffix ("1st" -> "1") so strptime can parse it
            raw_date = re.sub(r'(\d)(st|nd|rd|th)', r'\1', raw_date)
            record['datePublished'] = datetime.strptime(raw_date, '%d %B %Y')

            content = article.find('div', {'class': 'text_content'}).text
            parts = content.split('|')
            if len(parts) > 1:
                # "<verification status> | <review text>"
                record['reviewText'] = parts[1].strip()
                record['verified'] = 'Not' not in parts[0]
            else:
                record['reviewText'] = parts[0].strip()
                record['verified'] = False

            # other details: one table row per attribute
            for row in article.find_all('tr'):
                cells = row.find_all('td')
                field = cells[0]['class'][1]
                value = cells[1].text
                if value.isnumeric():
                    # star rows: the score is the number of filled stars
                    record[field] = len(row.find_all('span', {'class': 'star fill'}))
                else:
                    record[field] = value.strip()
        except (AttributeError, IndexError, KeyError, TypeError, ValueError):
            # best effort: skip a review whose markup is not as expected,
            # without swallowing KeyboardInterrupt / SystemExit
            continue

        review_list.append(record)

    return review_list

In [16]:
# collect the reviews that are newer than the latest date already on file
def get_latest_reviews(url: str, airline: str, stop_date: str) -> list:
    '''
    Scrape an airline's reviews, newest first, until `stop_date` is reached.

    Args:
        url: first-page listing URL, used only to determine the page count.
        airline: airline identifier used to build each page URL and stored
            in every record.
        stop_date: date string starting with 'YYYY-MM-DD' (a trailing time
            part is ignored). Only reviews published strictly after this
            day are returned.

    Returns:
        list: review dicts in page order (same layout as get_review
        produces). Always a list, possibly empty.

    Note:
        Scraping stops at the first review dated on or before the stop day,
        so reviews sharing the stop day are never collected again. This
        avoids duplicating rows that are already on file.
    '''

    base_url = 'https://www.airlinequality.com/airline-reviews/'
    # keep only the date part so both sides compare as 'YYYY-MM-DD'
    stop_day = str(stop_date)[:10]

    review_list = []
    pages = get_pages(url)

    for page in range(1, pages + 1):

        page_url = base_url + airline + '/page/' + str(page) + '/?sortby=post_date%3ADesc&pagesize=100'
        response = requests.get(page_url)
        soup = BeautifulSoup(response.text, "html.parser")

        # individual reviews are <article> tags nested inside the first <article>
        outer_articles = soup.findAll('article')
        if not outer_articles:
            break

        for article in outer_articles[0].findAll('article'):
            try:
                raw_date = article.find('time', {'itemprop': 'datePublished'}).text
                # drop the ordinal suffix ("1st" -> "1") so strptime can parse it
                raw_date = re.sub(r'(\d)(st|nd|rd|th)', r'\1', raw_date)
                published = datetime.strptime(raw_date, '%d %B %Y')
            except (AttributeError, ValueError):
                # review without a readable date: skip it
                continue

            if published.strftime('%Y-%m-%d') <= stop_day:
                # listing is sorted newest first, so nothing newer follows
                return review_list

            record = {'airline': airline}
            try:
                # the overall rating is optional: leave the key out when absent
                rating_tag = article.find('span', {'itemprop': 'ratingValue'})
                if rating_tag is not None and rating_tag.text != 'None':
                    record['ratingValue'] = int(rating_tag.text)

                record['author'] = article.find('span', {'itemprop': 'name'}).text.strip()
                record['datePublished'] = published

                content = article.find('div', {'class': 'text_content'}).text
                parts = content.split('|')
                if len(parts) > 1:
                    # "<verification status> | <review text>"
                    record['reviewText'] = parts[1].strip()
                    record['verified'] = 'Not' not in parts[0]
                else:
                    record['reviewText'] = parts[0].strip()
                    record['verified'] = False

                # other details: one table row per attribute
                for row in article.find_all('tr'):
                    cells = row.find_all('td')
                    field = cells[0]['class'][1]
                    value = cells[1].text
                    if value.isnumeric():
                        # star rows: the score is the number of filled stars
                        record[field] = len(row.find_all('span', {'class': 'star fill'}))
                    else:
                        record[field] = value.strip()
            except (AttributeError, IndexError, KeyError, TypeError, ValueError):
                # best effort: skip a review whose markup is not as expected
                continue

            review_list.append(record)

    # every page was newer than the stop day
    return review_list

In [32]:
# expected date format: year-month-day, e.g. 2023-01-13

def crawl_by_date(date: str):
    '''
    Update the per-airline review CSV files under ./Reviews.

    Reads the airline URLs from the module-level `review_url_list`
    (as built by get_airlines). For each airline:

    * if ./Reviews/<airline>.csv exists and the site shows a review dated on
      or after `date`, the reviews newer than the first row's
      'datePublished' are fetched and written in front of the existing rows;
    * if the file cannot be read, or the incremental update fails, every
      page is crawled and the file is (re)written from scratch.

    Args:
        date: cut-off date as a 'YYYY-MM-DD' string.
    '''

    base_url = 'https://www.airlinequality.com/airline-reviews/'
    page_query = '/?sortby=post_date%3ADesc&pagesize=100'
    airline_pattern = re.compile(
        r'airline-reviews/(\S*)/page/1/\?sortby=post_date%3ADesc&pagesize=100')

    # make sure the output folder exists before any CSV is written
    os.makedirs('./Reviews', exist_ok=True)

    for idx, url in enumerate(review_url_list):

        if idx % 50 == 0:
            print(f'Now progress : {idx}')

        # get airline name
        match = airline_pattern.search(url)
        if match is None:
            print(f'Skipping unrecognised review URL: {url}')
            continue
        airline = match.group(1)
        csv_path = './Reviews/' + airline + '.csv'

        # a file already exists
        try:
            # read the saved reviews; the first row holds the newest saved date
            old_df = pd.read_csv(csv_path)
            df_date = old_df.datePublished[0]

            # check whether there are reviews on or after the given date
            if has_new_review(url, date):
                print(f'{airline} 有 {date} 之後的評論！ ')
                new_list = get_latest_reviews(url, airline, df_date)
                if new_list:
                    new_df = pd.DataFrame(new_list)
                    # store the new dates as plain 'YYYY-MM-DD' text rather
                    # than mixing Timestamp objects with the strings that
                    # were read back from the CSV
                    new_df['datePublished'] = pd.to_datetime(
                        new_df['datePublished']).dt.strftime('%Y-%m-%d')
                    new_df = pd.concat([new_df, old_df])
                    new_df.to_csv(csv_path, index=False)

        # no usable file (or the incremental update failed): crawl everything.
        # Catching Exception rather than using a bare except keeps this
        # fallback while letting KeyboardInterrupt stop the run.
        except Exception as exc:
            if not isinstance(exc, FileNotFoundError):
                print(f'{airline}: incremental update failed ({exc!r}); crawling all pages')

            total_review_list = []
            pages = get_pages(url)
            for page in range(1, pages + 1):
                tmp_url = base_url + airline + '/page/' + str(page) + page_query
                total_review_list += get_review(tmp_url, airline)

            # write to csv
            total_reviews_df = pd.DataFrame(total_review_list)
            total_reviews_df.to_csv(csv_path, index=False)

    
   

# Application

In [34]:
# Load the reviews previously saved for air-france from the Reviews folder
df = pd.read_csv('./Reviews/air-france.csv')

In [35]:
# Show the loaded frame (the notebook renders it as the table below)
df

Unnamed: 0,airline,ratingValue,author,datePublished,reviewText,verified,aircraft,type_of_traveller,cabin_flown,route,date_flown,seat_comfort,cabin_staff_service,food_and_beverages,ground_service,value_for_money,recommended,wifi_and_connectivity,inflight_entertainment
0,air-france,9,A Heale,2023-01-10,Flown with Air France many times over the last...,True,A320,Family Leisure,Economy Class,Edinburgh to Paris,November 2022,4,4,5,5,5,yes,,
1,air-france,10,N Yavonych,2023-01-10,HAM to CDG and return. Two short hops of 70 mi...,True,A220-300 / E190,Solo Leisure,Economy Class,Hamburg to Paris,December 2022,4,5,5,5,5,yes,5,
2,air-france,1,R Marler,2023-01-09,Customer service on the phone were both incomp...,True,,Couple Leisure,Economy Class,Paris to Los Angeles,January 2023,3,4,3,1,1,no,,
3,air-france,2,M Garela,2023-01-08,Our baggage did not make the destination airpo...,True,,Couple Leisure,Economy Class,Barcelona to Prague via Paris,January 2023,3,3,2,1,1,no,2,1
4,air-france,3,K Morton,2022-12-16,"I arrived to Palma de Mallorca, Spain from San...",True,,Solo Leisure,Economy Class,San Francisco to Palma de Mallorca via Paris,September 2022,3,3,4,3,3,no,,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1213,air-france,1,T Price,2013-06-19,BSL-CDG-ATH 28/5/13. Small ATR72 to CDG no han...,False,,,Economy Class,,,3,2,3,,1,no,,1
1214,air-france,2,H Dalton,2013-06-16,2 June 2013 Johannesburg to Paris CDG AF 995. ...,False,,,Business Class,,,4,1,2,,1,no,,2
1215,air-france,8,F Francis,2013-06-16,CDG-JFK. AF-006 2013-05-21. The airline offers...,False,,,Economy Class,,,3,3,5,,4,yes,,3
1216,air-france,8,Y Fall,2013-06-12,Seoul-Paris CDG 777-300ER. Friendly and attent...,False,,,Economy Class,,,3,4,4,,4,yes,,5


In [13]:
# Build the module-level list of airline review URLs that crawl_by_date iterates over
review_url_list = get_airlines()

In [31]:
# Run the crawl using 2023-01-14 as the cut-off date
crawl_by_date('2023-01-14')

Now progress : 0
Now progress : 10


KeyboardInterrupt: 