In [2]:
%reload_ext autoreload
%autoreload 2

In [2]:
import sys
from pathlib import Path
# Make the parent of the current working directory importable (the notebook
# later imports from the `src` package). Cells get re-run often, so only add
# the entry when it is not already present to avoid piling up duplicates.
project_root = Path('..').resolve()
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

In [3]:
import os
import requests
from bs4 import BeautifulSoup
import pandas as pd
import hashlib
from pathlib import Path
from src.paths import RAW_DATA_DIR

In [None]:
# get new data

# create the hash column that will be compared
def create_hash(review_body):
    """Return the hexadecimal MD5 digest of ``review_body``.

    The text is encoded with ``str.encode()``'s default encoding (UTF-8)
    before hashing, so equal strings always map to the same digest.
    """
    hasher = hashlib.md5()
    hasher.update(review_body.encode())
    return hasher.hexdigest()

# scrape the data just as we did before
def _parse_review(element):
    """Extract the fields of one review from its ``<article>`` element.

    Returns a dict with the review's fields, or ``None`` when the element has
    no ``div.text_content`` body: without a body there is nothing to hash, so
    the review could not be compared against previously seen ones.
    """
    body_element = element.find('div', class_='text_content')
    if body_element is None:
        return None

    review_data = {}

    # Define the verification status based on the presence of specific text
    element_text = element.get_text()
    if "✅ Trip Verified" in element_text:
        review_data['verification_status'] = "Trip Verified"
    elif "✅ Verified Review" in element_text:
        review_data['verification_status'] = "Review Verified"
    else:
        review_data['verification_status'] = "Not Verified"

    # Clean review body text: drop the verification prefix that is rendered
    # inside the same container as the review itself.
    review_body = body_element.get_text(strip=True)
    review_body = (
        review_body
        .replace("✅Trip Verified|", "")
        .replace("✅Verified Review|", "")
        .replace("Not Verified|", "")
        .strip()
    )
    review_data['review_body'] = review_body

    # Extract published date (None when the <time> tag or attribute is absent)
    time_element = element.find('time', itemprop='datePublished')
    review_data['published_date'] = (
        time_element.get('datetime') if time_element is not None else None
    )

    # The body is a plain string here, so hash it directly. The column is
    # named 'review_hash' because that is what check_for_new_reviews reads.
    review_data['review_hash'] = create_hash(review_body)

    # Extract rating, only when both parts of "value/best" are present
    rating_element = element.find('div', itemprop='reviewRating')
    if rating_element is not None:
        rating_value = rating_element.find('span', itemprop='ratingValue')
        best_rating = rating_element.find('span', itemprop='bestRating')
        if rating_value is not None and best_rating is not None:
            review_data['rating'] = (
                f"{rating_value.get_text(strip=True)}/{best_rating.get_text(strip=True)}"
            )

    # Extract additional data: every table row that has both a header cell and
    # a value cell becomes its own key/value pair.
    for row in element.find_all('tr'):
        header = row.find('td', class_='review-rating-header')
        value = row.find('td', class_='review-value')
        if header and value:
            review_data[header.get_text(strip=True)] = value.get_text(strip=True)

    return review_data


def scrape_new_reviews(base_url: str, start_page: int, end_page: int) -> pd.DataFrame:
    """Scrape the review pages ``start_page``..``end_page`` (inclusive).

    Each page is requested sorted by post date (descending) with a page size
    of 100, and every ``<article itemprop="review">`` on it is parsed.

    Args:
        base_url: Review listing URL without the ``/page/<n>/`` suffix.
        start_page: First page number to fetch.
        end_page: Last page number to fetch (inclusive).

    Returns:
        A DataFrame with one row per parsed review, including a
        ``review_hash`` column derived from the cleaned review body. The
        frame is empty (and has no columns) when nothing was parsed.

    Raises:
        requests.RequestException: If a page request times out or fails, or
            the server answers with an HTTP error status.
    """
    reviews = []

    for page_num in range(start_page, end_page + 1):
        url = f"{base_url}/page/{page_num}/?sortby=post_date%3ADesc&pagesize=100"
        # A timeout keeps a stalled connection from hanging the notebook; an
        # HTTP error must surface instead of being parsed as "no reviews".
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        for element in soup.find_all('article', itemprop='review'):
            review_data = _parse_review(element)
            if review_data is not None:
                reviews.append(review_data)

    return pd.DataFrame(reviews)

  

def check_for_new_reviews(url, existing_hashes, start_page=1, end_page=3):
    """Scrape recent reviews and persist those not seen before.

    Args:
        url: Review listing URL handed to ``scrape_new_reviews``.
        existing_hashes: Set of review hashes that are already known.
        start_page: First page to scrape (defaults to 1).
        end_page: Last page to scrape, inclusive (defaults to 3).

    Returns:
        A DataFrame holding only the unseen reviews, or ``None`` when there
        are none (a message is printed in either case).

    Side effects:
        When new reviews exist they are written to a Parquet file in
        ``RAW_DATA_DIR`` named after the first new review's published date,
        and ``data/existing_hashes.txt`` is rewritten with the union of the
        known and new hashes.
    """
    scraped_df = scrape_new_reviews(url, start_page, end_page)

    # An empty scrape result has no columns at all, so it must be handled
    # before the 'review_hash' column is accessed.
    if scraped_df.empty:
        print("No new reviews.")
        return None

    # Check for new reviews by comparing hashes
    new_reviews_df = scraped_df[~scraped_df['review_hash'].isin(existing_hashes)]

    if new_reviews_df.empty:
        print("No new reviews.")
        return None

    print("New reviews found.")
    # Save the new DataFrame as a Parquet file in the data folder. The date is
    # pulled into a variable first: reusing the same quote type inside an
    # f-string is a syntax error on Python versions before 3.12.
    first_published = new_reviews_df['published_date'].iloc[0]
    path = RAW_DATA_DIR / f'{first_published}.parquet'
    new_reviews_df.to_parquet(path, index=False)

    # Combine the new hashes with the existing ones and save
    combined_hashes = existing_hashes.union(set(new_reviews_df['review_hash']))
    hashes_path = Path('data/existing_hashes.txt')
    # The first run may happen before the folder exists.
    hashes_path.parent.mkdir(parents=True, exist_ok=True)
    with open(hashes_path, 'w') as file:
        # Sorted output keeps the file stable between runs.
        file.writelines(f"{hash_value}\n" for hash_value in sorted(combined_hashes))

    return new_reviews_df

# Review listing to monitor, plus the page bounds defined for this run
url = 'https://www.airlinequality.com/airline-reviews/british-airways'
start_page, end_page = 0, 1

# Start with no known hashes; replace with the saved ones when the file exists
existing_hashes = set()
if os.path.exists('data/existing_hashes.txt'):
    with open('data/existing_hashes.txt', 'r') as file:
        # One hash per line; strip the trailing newline from each
        existing_hashes = {line.strip() for line in file}

# Scrape and keep only the reviews whose hash has not been seen yet
new_reviews_df = check_for_new_reviews(url, existing_hashes)

