In [8]:
import pandas as pd
from pandas_gbq import read_gbq


# Define your BigQuery SQL query
query = """
    SELECT * FROM `web-scraping-2024.top_chart.top_chart` LIMIT 10
"""

# Load the data into a Pandas DataFrame
df = read_gbq(query, project_id='web-scraping-2024')
metadata = df['app_store_id'].apply(lambda x: f"https://apps.apple.com/US/app/id{x}")

Downloading: 100%|[32m██████████[0m|


In [9]:
import requests
from bs4 import BeautifulSoup
import time
from datetime import datetime
import pandas as pd
import re

# Function to fetch data from the app store page
def fetch_app_store_data(app_store_url):
    try:
        # Send a request to the app store URL
        response = requests.get(app_store_url, allow_redirects=True)

        # If redirection occurs, get the final URL
        scraping_url_redirect = response.url

        # Parse the HTML content using BeautifulSoup
        soup = BeautifulSoup(response.content, 'html.parser')

        # Extract the required details
        title_with_rating = soup.find('h1', class_='product-header__title app-header__title').text.strip() if soup.find('h1', class_='product-header__title app-header__title') else None

        # Remove age rating (like "12+") from title
        title = re.sub(r'\s*\d+\+\s*$', '', title_with_rating) if title_with_rating else None

        subtitle = soup.find('h2', class_='product-header__subtitle app-header__subtitle').text.strip() if soup.find('h2', class_='product-header__subtitle app-header__subtitle') else None
        avg_rating = soup.find('figcaption', class_='we-rating-count star-rating__count').text.strip().split(' • ')[0] if soup.find('figcaption', class_='we-rating-count star-rating__count') else None

        # Extract number of ratings and remove the word "Ratings"
        number_of_ratings = (
            soup.find('figcaption', class_='we-rating-count star-rating__count').text.strip().split(' • ')[1].replace(' Ratings', '')
            if soup.find('figcaption', class_='we-rating-count star-rating__count') else None
        )

        developer_name = soup.find('dd', class_='information-list__item__definition').text.strip() if soup.find('dd', class_='information-list__item__definition') else None
        developer_url = soup.find('h2', class_='product-header__identity app-header__identity').find('a')['href'] if soup.find('h2', class_='product-header__identity app-header__identity') else None

        # Extract size from the specific tag by finding the 'Size' <dt>
        size = None
        size_term = soup.find('dt', string='Size')  # Use 'string' instead of 'text'
        if size_term:
            size_definition = size_term.find_next_sibling('dd')  # Get the corresponding <dd>
            size = size_definition.text.strip() if size_definition else None

        return {
            "scraping_url": app_store_url,
            "scraping_timestamp": int(datetime.now().timestamp() * 1_000_000),
            "scraping_url_redirect": scraping_url_redirect,
            "title": title,
            "subtitle": subtitle,
            "avg_rating": float(avg_rating) if avg_rating else None,
            "number_of_ratings": number_of_ratings,  # Only the numeric part
            "developer_name": developer_name,
            "developer_url": developer_url,
            "size": size
        }
    except Exception as e:
        print(f"Error fetching data for {app_store_url}: {e}")
        return None

# Example usage:
# Assuming you have a DataFrame df with the app_store_url and other relevant columns.
scraped_data = []
for index, row in df.iterrows():
    app_store_url = f"https://apps.apple.com/us/app/id{row['app_store_id']}"  # Assuming you build the URL like this

    # Fetch app details
    app_data = fetch_app_store_data(app_store_url)

    if app_data:
        scraped_data.append(app_data)

    # Sleep to avoid overwhelming the server
    time.sleep(1)

# Create a DataFrame from the scraped data
df_scraped = pd.DataFrame(scraped_data)

# Ensure columns are in the specified order
df_scraped = df_scraped[[
    "scraping_url",
    "scraping_timestamp",
    "scraping_url_redirect",
    "title",
    "subtitle",
    "avg_rating",
    "number_of_ratings",
    "developer_name",
    "developer_url",
    "size"
]]


In [10]:
import requests
from bs4 import BeautifulSoup
import time
from datetime import datetime
import pandas as pd
import re
from google.cloud import storage
import io

# Initialize GCS client
client = storage.Client()

# Define your bucket name
bucket_name = 'app_metadata_top_chart'
bucket = client.bucket(bucket_name)

# Function to upload Parquet file to GCS
def upload_parquet_to_gcs(df, file_name, bucket):
    # Create an in-memory bytes buffer to save the parquet file
    buffer = io.BytesIO()

    # Write the DataFrame to Parquet format in memory
    df.to_parquet(buffer, index=False)

    # Move the buffer's position to the beginning
    buffer.seek(0)

    # Create a blob (file object) in the bucket
    blob = bucket.blob(file_name)

    # Upload the bytes from the buffer to GCS
    blob.upload_from_file(buffer, content_type='application/octet-stream')

    print(f"Uploaded {file_name} to GCS bucket {bucket_name}")

# Function to fetch data from the app store page
def fetch_app_store_data(app_store_url):
    try:
        # Send a request to the app store URL
        response = requests.get(app_store_url, allow_redirects=True)

        # If redirection occurs, get the final URL
        scraping_url_redirect = response.url

        # Parse the HTML content using BeautifulSoup
        soup = BeautifulSoup(response.content, 'html.parser')

        # Extract the required details
        title_with_rating = soup.find('h1', class_='product-header__title app-header__title').text.strip() if soup.find('h1', class_='product-header__title app-header__title') else None

        # Remove age rating (like "12+") from title
        title = re.sub(r'\s*\d+\+\s*$', '', title_with_rating) if title_with_rating else None

        subtitle = soup.find('h2', class_='product-header__subtitle app-header__subtitle').text.strip() if soup.find('h2', class_='product-header__subtitle app-header__subtitle') else None
        avg_rating = soup.find('figcaption', class_='we-rating-count star-rating__count').text.strip().split(' • ')[0] if soup.find('figcaption', class_='we-rating-count star-rating__count') else None

        # Extract number of ratings and remove the word "Ratings"
        number_of_ratings = (
            soup.find('figcaption', class_='we-rating-count star-rating__count').text.strip().split(' • ')[1].replace(' Ratings', '')
            if soup.find('figcaption', class_='we-rating-count star-rating__count') else None
        )

        developer_name = soup.find('dd', class_='information-list__item__definition').text.strip() if soup.find('dd', class_='information-list__item__definition') else None
        developer_url = soup.find('h2', class_='product-header__identity app-header__identity').find('a')['href'] if soup.find('h2', class_='product-header__identity app-header__identity') else None

        # Extract size from the specific tag by finding the 'Size' <dt>
        size = None
        size_term = soup.find('dt', string='Size')  # Use 'string' instead of 'text'
        if size_term:
            size_definition = size_term.find_next_sibling('dd')  # Get the corresponding <dd>
            size = size_definition.text.strip() if size_definition else None

        return {
            "scraping_url": app_store_url,
            "scraping_timestamp": int(datetime.now().timestamp() * 1_000_000),
            "scraping_url_redirect": scraping_url_redirect,
            "title": title,
            "subtitle": subtitle,
            "avg_rating": float(avg_rating) if avg_rating else None,
            "number_of_ratings": number_of_ratings,  # Only the numeric part
            "developer_name": developer_name,
            "developer_url": developer_url,
            "size": size
        }
    except Exception as e:
        print(f"Error fetching data for {app_store_url}: {e}")
        return None

# Example usage:
# Assuming you have a DataFrame df with the app_store_id and other relevant columns.
scraped_data = []
for index, row in df.iterrows():
    app_store_url = f"https://apps.apple.com/us/app/id{row['app_store_id']}"  # Assuming you build the URL like this

    # Fetch app details
    app_data = fetch_app_store_data(app_store_url)

    if app_data:
        # Append to the list
        scraped_data.append(app_data)

        # Create a DataFrame for the single app data
        df_app = pd.DataFrame([app_data])

        # Define the file path using app_store_id and scraping date
        app_id = row['app_store_id']
        scraping_date = datetime.now().strftime('%Y_%m_%d')
        file_name = f"{scraping_date}__{app_id}.parquet"

        # Upload the DataFrame to GCS
        upload_parquet_to_gcs(df_app, file_name, bucket)

    # Sleep to avoid overwhelming the server
    time.sleep(1)

# Optionally, create a DataFrame from all scraped data if needed
if scraped_data:
    df_scraped = pd.DataFrame(scraped_data)

    # Ensure columns are in the specified order
    df_scraped = df_scraped[[
        "scraping_url",
        "scraping_timestamp",
        "scraping_url_redirect",
        "title",
        "subtitle",
        "avg_rating",
        "number_of_ratings",
        "developer_name",
        "developer_url",
        "size"
    ]]



Uploaded 2024_09_25__6446901002.parquet to GCS bucket app_metadata_top_chart
Uploaded 2024_09_25__6448311069.parquet to GCS bucket app_metadata_top_chart
Uploaded 2024_09_25__1641486558.parquet to GCS bucket app_metadata_top_chart
Uploaded 2024_09_25__284815942.parquet to GCS bucket app_metadata_top_chart
Uploaded 2024_09_25__1542571008.parquet to GCS bucket app_metadata_top_chart
Uploaded 2024_09_25__835599320.parquet to GCS bucket app_metadata_top_chart
Uploaded 2024_09_25__310633997.parquet to GCS bucket app_metadata_top_chart
Uploaded 2024_09_25__1500855883.parquet to GCS bucket app_metadata_top_chart
Uploaded 2024_09_25__389801252.parquet to GCS bucket app_metadata_top_chart
Uploaded 2024_09_25__878577184.parquet to GCS bucket app_metadata_top_chart


In [11]:
from google.cloud import bigquery, storage

# Initialize clients
gcs_client = storage.Client()
bq_client = bigquery.Client()

# Define your parameters
project_id = "web-scraping-2024"
dataset_id = "app_metadata"
bucket_name = "app_metadata_top_chart"
app_metadata_dir_prefix = ""

def list_parquet_files(bucket_name, prefix):
    """List all parquet files in a GCS bucket with a given prefix."""
    bucket = gcs_client.bucket(bucket_name)
    blobs = bucket.list_blobs(prefix=prefix)
    return [blob.name for blob in blobs if blob.name.endswith('.parquet')]

# Load all parquet files
app_metadata_details_files = list_parquet_files(bucket_name, app_metadata_dir_prefix)

if not app_metadata_details_files:
    print("No files found.")
else:
    app_metadata_table_ref = bq_client.dataset(dataset_id).table("app_metadata")
    app_metadata_job_config = bigquery.LoadJobConfig(
        source_format=bigquery.SourceFormat.PARQUET,  # Parquet file format
        autodetect=True
    )

    for file_name in app_metadata_details_files:
        # Start the load job for each file
        load_job = bq_client.load_table_from_uri(
            f"gs://{bucket_name}/{file_name}",
            app_metadata_table_ref,
            job_config=app_metadata_job_config
        )

        # Wait for the job to complete
        load_job.result()

        # Check the result for each file
        app_metadata_details_table = bq_client.get_table(app_metadata_table_ref)
        print(f"Loaded {app_metadata_details_table.num_rows} rows from {file_name} into {dataset_id}:app_metadata.")



Loaded 1 rows from 2024_09_25__1500855883.parquet into app_metadata:app_metadata.
Loaded 2 rows from 2024_09_25__1542571008.parquet into app_metadata:app_metadata.
Loaded 3 rows from 2024_09_25__1641486558.parquet into app_metadata:app_metadata.
Loaded 4 rows from 2024_09_25__284815942.parquet into app_metadata:app_metadata.
Loaded 5 rows from 2024_09_25__310633997.parquet into app_metadata:app_metadata.
Loaded 6 rows from 2024_09_25__389801252.parquet into app_metadata:app_metadata.
Loaded 7 rows from 2024_09_25__6446901002.parquet into app_metadata:app_metadata.
Loaded 8 rows from 2024_09_25__6448311069.parquet into app_metadata:app_metadata.
Loaded 9 rows from 2024_09_25__835599320.parquet into app_metadata:app_metadata.
Loaded 10 rows from 2024_09_25__878577184.parquet into app_metadata:app_metadata.
