In [409]:
import requests
import pandas as pd
from datetime import datetime
import google.cloud.logging
import logging
from google.cloud.logging.handlers import CloudLoggingHandler

# Initialize Google Cloud Logging client
client = google.cloud.logging.Client()
client.setup_logging()

# Create a logger
logger = logging.getLogger("app_scraping_logger")
logger.setLevel(logging.INFO)

# Define API parameters
start_date = '2024-05-01'
end_date = start_date  # Ensure this matches the context date
types_list = ['free', 'paid', 'grossing']  # List of app types
country = 'us'
device = 'iphone'
limit = 500


# Mapping of category codes to names
category_mapping = {
    "0": "All",
    "6018": "Books",
    "6000": "Business",
    "6026": "Developer Tools",
    "6017": "Education",
    "6016": "Entertainment",
    "6015": "Finance",
    "6023": "Food & Drink",
    # "6014": "Games",
    "6027": "Graphics & Design",
    "6013": "Health & Fitness",
    "6012": "Lifestyle",
    "6021": "Magazines & Newspapers",
    "6020": "Medical",
    "6011": "Music",
    "6010": "Navigation",
    "6009": "News",
    "6008": "Photo & Video",
    "6007": "Productivity",
    "6006": "Reference",
    "6005": "Social Networking",
    "6024": "Shopping",
    "6004": "Sports",
    "6003": "Travel",
    "6002": "Utilities",
    "6001": "Weather",
    # "7001": "Game Action",
    # "7002": "Game Adventure",
    # "7003": "Game Casual",
    # "7004": "Game Board",
    # "7005": "Game Card",
    # "7006": "Game Casino",
    # "7009": "Game Family",
    # "7011": "Game Music",
    # "7012": "Game Puzzle",
    # "7013": "Game Racing",
    # "7014": "Game Role Playing",
    # "7015": "Game Simulation",
    # "7016": "Game Sports",
    # "7017": "Game Strategy",
    # "7018": "Game Trivia",
    # "7019": "Game Word",
}

# Function to fetch data from the API
def fetch_data(category_code, start_date, end_date, app_type, country, device, limit):
    # Construct the request URL
    url = (f"https://public-api.apptweak.com/api/public/store/charts/top-results/history?"
           f"categories={category_code}&start_date={start_date}&end_date={end_date}&types={app_type}"
           f"&country={country}&device={device}&limit={limit}&offset=0")  # Fixed offset to 0

    headers = {
        "accept": "application/json",
        "x-apptweak-key": api_key
    }

    logger.info(f"Request URL: {url}")  # Log the request URL

    response = requests.get(url, headers=headers)

    # Log response status and body for debugging
    logger.info(f"Response Status: {response.status_code}")
    if response.status_code != 200:
        logger.error(f"Error fetching data: {response.status_code}, {response.text}")
        return None

    return response.json()

# Collect results for each type in the types_list
all_results = []

# Iterate through each category in the category mapping
for category_code, category_name in category_mapping.items():
    for app_type in types_list:  # Iterate through each app type
        data = fetch_data(category_code, start_date, end_date, app_type, country, device, limit)
        if data:
            # Extract the cost-related fields from the metadata section of the response
            request_metadata = data.get('metadata', {}).get('request', {})
            request_cost = request_metadata.get('cost')
            request_max_credit_cost = request_metadata.get('max_credit_cost')

            # Extract the results
            results = data.get('result', {}).get(category_code, {}).get(app_type, [])
            if results:
                # Extract values from the results
                for app in results:
                    app_date = app['date']  # Extract the 'date' field from the response
                    # Enumerate through the values to assign ranks based on their order
                    for rank, value in enumerate(app['value'], start=1):
                        all_results.append({
                            "scraping_url": f"https://public-api.apptweak.com/api/public/store/charts/top-results/history?categories={category_code}&start_date={start_date}&end_date={end_date}&types={app_type}&country={country}&device={device}&limit={limit}&offset=0",
                            "scraping_timestamp": int(datetime.now().timestamp() * 1_000_000),  # Current timestamp
                            "category_code": category_code,
                            "category_name": category_name,  # Use the category name from mapping
                            "context_date": app_date,  # Use the 'date' field from the response as context_date
                            "type": app_type,  # Update type to the current app type
                            "country": country,
                            "device": device,
                            "rank": rank,  # Rank assigned based on the order of appearance
                            "app_store_id": value,  # Include value for clarity
                            "request_cost": request_cost,  # Add request cost
                            "request_max_credit_cost": request_max_credit_cost  # Add max credit cost
                        })
            else:
                logger.warning(f"No results found for category {category_name} and type {app_type}")

# Output the results
if all_results:
    # Create a DataFrame from the results
    df_results = pd.DataFrame(all_results)
    logger.info(f"Scraping completed successfully. Total records: {len(all_results)}")
else:
    logger.warning("No data to upload.")


INFO:app_scraping_logger:Request URL: https://public-api.apptweak.com/api/public/store/charts/top-results/history?categories=0&start_date=2024-05-01&end_date=2024-05-01&types=free&country=us&device=iphone&limit=500&offset=0
INFO:app_scraping_logger:Response Status: 200
INFO:app_scraping_logger:Request URL: https://public-api.apptweak.com/api/public/store/charts/top-results/history?categories=0&start_date=2024-05-01&end_date=2024-05-01&types=paid&country=us&device=iphone&limit=500&offset=0
INFO:app_scraping_logger:Response Status: 200
INFO:app_scraping_logger:Request URL: https://public-api.apptweak.com/api/public/store/charts/top-results/history?categories=0&start_date=2024-05-01&end_date=2024-05-01&types=grossing&country=us&device=iphone&limit=500&offset=0
INFO:app_scraping_logger:Response Status: 200
INFO:app_scraping_logger:Request URL: https://public-api.apptweak.com/api/public/store/charts/top-results/history?categories=6018&start_date=2024-05-01&end_date=2024-05-01&types=free&cou

In [410]:
df_results.head()

Unnamed: 0,scraping_url,scraping_timestamp,category_code,category_name,context_date,type,country,device,rank,app_store_id,request_cost,request_max_credit_cost
0,https://public-api.apptweak.com/api/public/sto...,1727351578366590,0,All,2024-05-01,free,us,iphone,1,1048524688,11,11
1,https://public-api.apptweak.com/api/public/sto...,1727351578366601,0,All,2024-05-01,free,us,iphone,2,482752836,11,11
2,https://public-api.apptweak.com/api/public/sto...,1727351578366606,0,All,2024-05-01,free,us,iphone,3,284815942,11,11
3,https://public-api.apptweak.com/api/public/sto...,1727351578366610,0,All,2024-05-01,free,us,iphone,4,6446901002,11,11
4,https://public-api.apptweak.com/api/public/sto...,1727351578366615,0,All,2024-05-01,free,us,iphone,5,1641486558,11,11


In [411]:
import logging
import pandas as pd
from google.cloud import storage
import google.cloud.logging
from google.cloud.logging.handlers import CloudLoggingHandler

# Setup Google Cloud Logging client and logger
client = google.cloud.logging.Client()
handler = CloudLoggingHandler(client)
logger = logging.getLogger('stackdriver_logger')
logger.setLevel(logging.INFO)
logger.addHandler(handler)

# Define your GCS bucket name
bucket_name = 'top_chart'

# Assume `start_date` and `end_date` come from your API request
# start_date = '2024-09-24'
# end_date = '2024-09-24'

# Define the file name based on start_date and end_date, using underscores
parquet_file_name = f'top_app_chart_{start_date.replace("-", "_")}_to_{end_date.replace("-", "_")}.parquet'

# Initialize the GCS client
storage_client = storage.Client()
bucket = storage_client.bucket(bucket_name)
blob = bucket.blob(parquet_file_name)

try:
    # Check if the DataFrame is not empty
    if not df_results.empty:
        # Check if the file already exists for the given date range
        if blob.exists():
            logger.info(f"Overwriting existing file: {parquet_file_name} for date range {start_date} to {end_date}")
        else:
            logger.info(f"Uploading new file: {parquet_file_name} for date range {start_date} to {end_date}")

        # Save the DataFrame to Parquet format in the GCS bucket
        df_results.to_parquet(f'gs://{bucket_name}/{parquet_file_name}', index=False)

        logger.info(f"DataFrame saved to gs://{bucket_name}/{parquet_file_name}")
    else:
        logger.warning("No data to upload.")
except Exception as e:
    logger.error(f"An error occurred: {e}")


INFO:stackdriver_logger:Uploading new file: top_app_chart_2024_05_01_to_2024_05_01.parquet for date range 2024-05-01 to 2024-05-01
INFO:stackdriver_logger:DataFrame saved to gs://top_chart/top_app_chart_2024_05_01_to_2024_05_01.parquet


### Load to BQ

In [414]:
import logging
from google.cloud import bigquery, storage
import google.cloud.logging
from google.cloud.logging.handlers import CloudLoggingHandler

# Initialize Stackdriver Logging client and logger
logging_client = google.cloud.logging.Client()
handler = CloudLoggingHandler(logging_client)
logger = logging.getLogger('stackdriver_logger')
logger.setLevel(logging.INFO)
logger.addHandler(handler)

# Initialize GCS and BigQuery clients
gcs_client = storage.Client()
bq_client = bigquery.Client()

# Define your parameters
project_id = "web-scraping-2024"
dataset_id = "top_chart"
bucket_name = "top_chart"
top_chart_dir_prefix = ""  # You can add a prefix if needed
specific_file_name = ".parquet"  # We're interested in all files with the .parquet extension

def list_parquet_files(bucket_name, prefix):
    """List all Parquet files in a GCS bucket and prefix."""
    try:
        bucket = gcs_client.bucket(bucket_name)
        blobs = bucket.list_blobs(prefix=prefix)
        return [blob.name for blob in blobs if blob.name.endswith('.parquet')]
    except Exception as e:
        logger.error(f"Error listing parquet files: {e}")
        return []

# Load all Parquet files
top_chart_details_files = list_parquet_files(bucket_name, top_chart_dir_prefix)

if not top_chart_details_files:
    logger.warning("No Parquet files found.")
else:
    top_chart_table_ref = bq_client.dataset(dataset_id).table("top_chart")
    top_chart_job_config = bigquery.LoadJobConfig(
        source_format=bigquery.SourceFormat.PARQUET,  # Parquet file format
        autodetect=True
    )

    for file in top_chart_details_files:
        try:
            # Start the load job for each Parquet file
            logger.info(f"Loading file: {file} into BigQuery")
            load_job = bq_client.load_table_from_uri(
                f"gs://{bucket_name}/{file}",
                top_chart_table_ref,
                job_config=top_chart_job_config
            )

            # Wait for the job to complete
            load_job.result()  # This blocks until the job finishes.

            # Log the result for each file
            logger.info(f"Successfully loaded file {file} into {dataset_id}:top_chart.")

        except Exception as e:
            logger.error(f"Error loading file {file}: {e}")

    # Check the result for the entire table
    try:
        apptweak_details_table = bq_client.get_table(top_chart_table_ref)
        logger.info(f"Loaded {apptweak_details_table.num_rows} rows into {dataset_id}:top_chart.")
    except Exception as e:
        logger.error(f"Error fetching table details: {e}")


INFO:stackdriver_logger:Loading file: top_app_chart_2024_05_01_to_2024_05_01.parquet into BigQuery
INFO:stackdriver_logger:Successfully loaded file top_app_chart_2024_05_01_to_2024_05_01.parquet into top_chart:top_chart.
INFO:stackdriver_logger:Loading file: top_app_chart_2024_05_02_to_2024_05_02.parquet into BigQuery
INFO:stackdriver_logger:Successfully loaded file top_app_chart_2024_05_02_to_2024_05_02.parquet into top_chart:top_chart.
INFO:stackdriver_logger:Loading file: top_app_chart_2024_05_03_to_2024_05_03.parquet into BigQuery
INFO:stackdriver_logger:Successfully loaded file top_app_chart_2024_05_03_to_2024_05_03.parquet into top_chart:top_chart.
INFO:stackdriver_logger:Loading file: top_app_chart_2024_05_04_to_2024_05_04.parquet into BigQuery
INFO:stackdriver_logger:Successfully loaded file top_app_chart_2024_05_04_to_2024_05_04.parquet into top_chart:top_chart.
INFO:stackdriver_logger:Loading file: top_app_chart_2024_05_05_to_2024_05_05.parquet into BigQuery
INFO:stackdriver_