# In [None]:
import os
import requests
from bs4 import BeautifulSoup
import pandas as pd
import hashlib

def hash_review(review_body):
    """Return the hexadecimal MD5 digest of *review_body*.

    The text is encoded as UTF-8 before hashing. The digest is used as a
    deduplication key for reviews, not for any security purpose.
    """
    digest = hashlib.md5()
    digest.update(review_body.encode("utf-8"))
    return digest.hexdigest()

def scrape_review_details(url, timeout=30):
    """Download a review listing page and return its reviews as a DataFrame.

    Every ``<article itemprop="review">`` element on the page becomes one row.

    Args:
        url: Address of the review listing page to fetch.
        timeout: Seconds to wait for the server before giving up. Passed
            straight to ``requests.get``.

    Returns:
        A ``pandas.DataFrame`` with the columns ``verification_status``,
        ``review_body``, ``published_date`` and ``review_hash``, plus one
        column per rating row found in the review's table (named after the
        row's header text). When no usable review is found the frame is
        empty but still carries the four fixed columns.

    Raises:
        requests.RequestException: If the request times out, fails, or the
            server answers with an HTTP error status.
    """
    # Fixed columns every row carries; also used to give an empty result a
    # stable schema so callers can index 'review_hash' unconditionally.
    base_columns = ['verification_status', 'review_body', 'published_date', 'review_hash']

    # Prefixes the page puts in front of the review text.
    verified_marker = "✅Trip Verified|"
    unverified_marker = "Not Verified|"

    # Bound the request so a stalled server cannot hang the caller, and fail
    # loudly on HTTP errors instead of parsing an error page as "no reviews".
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    reviews = []

    for element in soup.find_all('article', itemprop='review'):
        body_tag = element.find('div', class_='text_content')
        time_tag = element.find('time', itemprop='datePublished')
        published_date = time_tag.get('datetime') if time_tag is not None else None

        # The hash needs both the body and the date; an article missing
        # either cannot be identified, so skip it rather than crash.
        if body_tag is None or published_date is None:
            continue

        raw_body = body_tag.get_text(strip=True)

        # The marker is spelled without spaces, i.e. in the form produced by
        # get_text(strip=True), so look for it in that text. The unstripped
        # article text is still checked so anything that matched before
        # continues to match.
        # NOTE(review): whether the unstripped text ever contains the marker
        # depends on the page markup -- confirm against a live page.
        is_verified = verified_marker in raw_body or verified_marker in element.get_text()

        review_body = raw_body.replace(verified_marker, "").replace(unverified_marker, "").strip()

        review_data = {
            'verification_status': "Verified" if is_verified else "Not Verified",
            'review_body': review_body,
            'published_date': published_date,
            # Unique identifier: same text published on the same date
            # hashes to the same value.
            'review_hash': hash_review(review_body + published_date),
        }

        # Each table row pairs a rating label with its value.
        for row in element.find_all('tr'):
            header = row.find('td', class_='review-rating-header')
            value = row.find('td', class_='review-value')
            if header and value:
                review_data[header.get_text(strip=True)] = value.get_text(strip=True)

        reviews.append(review_data)

    if not reviews:
        return pd.DataFrame(columns=base_columns)
    return pd.DataFrame(reviews)

def check_for_new_reviews(url, existing_hashes):
    """Scrape *url* and persist any reviews whose hash is not yet known.

    When new reviews are found they are written to
    ``data/new_reviews.parquet`` (replacing any previous file) and
    ``data/existing_hashes.txt`` is rewritten with the union of the known
    and the new hashes, one per line. The ``data`` directory is created if
    needed. The ``existing_hashes`` argument itself is not modified.

    Args:
        url: Review listing page, passed to ``scrape_review_details``.
        existing_hashes: Iterable of review hashes that were already seen.

    Returns:
        A ``pandas.DataFrame`` holding only the new reviews, or ``None``
        when there are none.
    """
    scraped_df = scrape_review_details(url)

    # A page without reviews can yield a frame with no columns at all;
    # treat that as "nothing new" instead of failing on the column lookup.
    if scraped_df.empty or 'review_hash' not in scraped_df.columns:
        print("No new reviews.")
        return None

    # Accept any iterable of hashes, not just a set.
    known_hashes = set(existing_hashes)

    # Check for new reviews by comparing hashes
    new_reviews_df = scraped_df[~scraped_df['review_hash'].isin(known_hashes)]

    if new_reviews_df.empty:
        print("No new reviews.")
        return None

    print("New reviews found.")

    # Save the new DataFrame as a Parquet file in the data folder.
    # exist_ok avoids the check-then-create race of os.path.exists.
    os.makedirs('data', exist_ok=True)
    new_reviews_df.to_parquet('data/new_reviews.parquet', index=False)

    # Combine the new hashes with the existing ones and save them; sorting
    # keeps the file contents stable between runs.
    combined_hashes = known_hashes.union(new_reviews_df['review_hash'])
    with open('data/existing_hashes.txt', 'w') as file:
        file.writelines(f"{hash_value}\n" for hash_value in sorted(combined_hashes))

    return new_reviews_df

# Review listing page that is polled for new reviews
url = 'https://www.airlinequality.com/airline-reviews/british-airways'

# Start with no known hashes and replace the set with the contents of the
# hash file when a previous run left one behind (one hash per line)
existing_hashes = set()
if os.path.exists('data/existing_hashes.txt'):
    with open('data/existing_hashes.txt', 'r') as file:
        existing_hashes = {line.strip() for line in file}

# Scrape the page and keep only the reviews whose hash has not been seen yet
new_reviews_df = check_for_new_reviews(url, existing_hashes)

