In [31]:
import requests
import pandas as pd
from datetime import datetime

# Define API parameters
start_date = '2024-09-24'
end_date = '2024-09-24'  # Ensure this matches the context date
types_list = ['free', 'paid', 'grossing']  # List of app types
country = 'us'
device = 'iphone'
limit = 500
api_key = "5ehXl_TDwSwjWrFN4_yaoYu4SGg"

# Mapping of category codes to names
category_mapping = {
    "0": "All",
    "6018": "Books",
    "6000": "Business",
    "6026": "Developer Tools",
    "6017": "Education",
    "6016": "Entertainment",
    "6015": "Finance",
    "6023": "Food & Drink",
    "6014": "Games",
    "6027": "Graphics & Design",
    "6013": "Health & Fitness",
    "6012": "Lifestyle",
    "6021": "Magazines & Newspapers",
    "6020": "Medical",
    "6011": "Music",
    "6010": "Navigation",
    "6009": "News",
    "6008": "Photo & Video",
    "6007": "Productivity",
    "6006": "Reference",
    "6005": "Social Networking",
    "6024": "Shopping",
    "6004": "Sports",
    "6003": "Travel",
    "6002": "Utilities",
    "6001": "Weather",
    "7001": "Game Action",
    "7002": "Game Adventure",
    "7003": "Game Casual",
    "7004": "Game Board",
    "7005": "Game Card",
    "7006": "Game Casino",
    "7009": "Game Family",
    "7011": "Game Music",
    "7012": "Game Puzzle",
    "7013": "Game Racing",
    "7014": "Game Role Playing",
    "7015": "Game Simulation",
    "7016": "Game Sports",
    "7017": "Game Strategy",
    "7018": "Game Trivia",
    "7019": "Game Word",
}

# Function to fetch data from the API
def fetch_data(category_code, start_date, end_date, app_type, country, device, limit):
    # Construct the request URL
    url = (f"https://public-api.apptweak.com/api/public/store/charts/top-results/history?"
           f"categories={category_code}&start_date={start_date}&end_date={end_date}&types={app_type}"
           f"&country={country}&device={device}&limit={limit}&offset=0")  # Fixed offset to 0

    headers = {
        "accept": "application/json",
        "x-apptweak-key": api_key
    }

    print(f"Request URL: {url}")  # Debugging line
    response = requests.get(url, headers=headers)

    # Print response status and body for debugging
    print(f"Response Status: {response.status_code}")  # Debugging line
    print(f"Response Body: {response.text}")  # Debugging line

    if response.status_code == 200:
        data = response.json()
        # Extract the results from the JSON response
        return data.get('result', {}).get(category_code, {}).get(app_type, [])
    else:
        print(f"Error fetching data: {response.status_code}")
        return []

# Collect results for each type in the types_list
all_results = []

# Iterate through each category in the category mapping
for category_code, category_name in category_mapping.items():
    for app_type in types_list:  # Iterate through each app type
        results = fetch_data(category_code, start_date, end_date, app_type, country, device, limit)
        if results:
            # Extract values from the results
            for app in results:
                app_date = app['date']  # Extract the 'date' field from the response
                # Enumerate through the values to assign ranks based on their order
                for rank, value in enumerate(app['value'], start=1):
                    all_results.append({
                        "scraping_url": f"https://public-api.apptweak.com/api/public/store/charts/top-results/history?categories={category_code}&start_date={start_date}&end_date={end_date}&types={app_type}&country={country}&device={device}&limit={limit}&offset=0",
                        "scraping_timestamp": datetime.now().isoformat(),  # Current timestamp
                        "category_code": category_code,
                        "category_name": category_name,  # Use the category name from mapping
                        "context_date": app_date,  # Use the 'date' field from the response as context_date
                        "type": app_type,  # Update type to the current app type
                        "country": country,
                        "device": device,
                        "rank": rank,  # Rank assigned based on the order of appearance
                        "app_store_id": value  # Include value for clarity
                    })

# Output the results
if all_results:
    # Create a DataFrame from the results
    df_results = pd.DataFrame(all_results)
else:
    print("No data to upload.")


Request URL: https://public-api.apptweak.com/api/public/store/charts/top-results/history?categories=0&start_date=2024-09-24&end_date=2024-09-24&types=free&country=us&device=iphone&limit=500&offset=0
Response Status: 200
Response Body: {"result":{"0":{"free":[{"date":"2024-09-24","value":[6446901002,6448311069,1641486558,284815942,1542571008,835599320,310633997,1500855883,389801252,878577184,422689480,585027354,6474455074,544007664,1498607143,6464002625,284882215,309735670,922103212,317469184,686449807,711923939,983156458,1064216828,297606951,719972451,1438166219,1534704608,1193350206,324684580,338137227,384830320,874139669,447188370,1666653815,500003565,368677368,985746746,363590051,535886823,1223471316,1096918571,429047995,283646709,1113153706,6446788829,1362013798,1446075923,482066631,842842640,333903271,407558537,570060128,951937596,351727428,507874739,454638411,897446215,288429040,962194608,546505307,1508186374,401626263,376510438,1260755201,6449206831,545519333,1636235979,53016816

In [21]:
df_results.head()

Unnamed: 0,scraping_url,scraping_timestamp,category_code,category_name,context_date,type,country,device,rank,app_store_id
0,https://public-api.apptweak.com/api/public/sto...,2024-09-24T14:54:15.392718,0,All,2021-12-15,free,us,iphone,1,835599320
1,https://public-api.apptweak.com/api/public/sto...,2024-09-24T14:54:15.392736,0,All,2021-12-15,free,us,iphone,2,544007664
2,https://public-api.apptweak.com/api/public/sto...,2024-09-24T14:54:15.392739,0,All,2021-12-15,free,us,iphone,3,389801252
3,https://public-api.apptweak.com/api/public/sto...,2024-09-24T14:54:15.392743,0,All,2021-12-15,free,us,iphone,4,297606951
4,https://public-api.apptweak.com/api/public/sto...,2024-09-24T14:54:15.392746,0,All,2021-12-15,free,us,iphone,5,284882215


In [32]:
import pandas as pd
from google.cloud import storage

# Define your GCS bucket name
bucket_name = 'top_chart'

# Assume `start_date` and `end_date` come from your API request
start_date = '2024-09-24'
end_date = '2024-09-24'

# Define the file name based on start_date and end_date, using underscores
parquet_file_name = f'top_app_chart_{start_date.replace("-", "_")}_to_{end_date.replace("-", "_")}.parquet'

# Initialize the GCS client
storage_client = storage.Client()
bucket = storage_client.bucket(bucket_name)
blob = bucket.blob(parquet_file_name)

# Check if the DataFrame is not empty
if not df_results.empty:
    # Check if the file already exists for the given date range
    if blob.exists():
        print(f"Overwriting existing file: {parquet_file_name} for date range {start_date} to {end_date}")
    else:
        print(f"Uploading new file: {parquet_file_name} for date range {start_date} to {end_date}")

    # Save the DataFrame to Parquet format in the GCS bucket
    df_results.to_parquet(f'gs://{bucket_name}/{parquet_file_name}', index=False)

    print(f"DataFrame saved to gs://{bucket_name}/{parquet_file_name}")
else:
    print("No data to upload.")


Overwriting existing file: top_app_chart_2024_09_24_to_2024_09_24.parquet for date range 2024-09-24 to 2024-09-24
DataFrame saved to gs://top_chart/top_app_chart_2024_09_24_to_2024_09_24.parquet


In [38]:
from google.cloud import bigquery, storage

# Initialize clients
gcs_client = storage.Client()
bq_client = bigquery.Client()

# Define your parameters
project_id = "web-scraping-2024"
dataset_id = "top_chart"
bucket_name = "top_chart"
top_chart_dir_prefix = ""
specific_file_name = "top_app_chart_2021_12_15_to_2021_12_15.parquet"

def list_parquet_files(bucket_name, prefix, specific_file):
    """List specific parquet file in a GCS bucket and prefix."""
    bucket = gcs_client.bucket(bucket_name)
    blobs = bucket.list_blobs(prefix=prefix)
    return [blob.name for blob in blobs if blob.name.endswith('.parquet') and specific_file in blob.name]

# Load the specific file
top_chart_details_files = list_parquet_files(bucket_name, top_chart_dir_prefix, specific_file_name)

if not top_chart_details_files:
    print("No files found.")
else:
    top_chart_table_ref = bq_client.dataset(dataset_id).table("top_chart")
    top_chart_job_config = bigquery.LoadJobConfig(
        source_format=bigquery.SourceFormat.PARQUET,  # Parquet file format
        autodetect=True
    )

    # Start the load job for the specific file
    load_job = bq_client.load_table_from_uri(
        f"gs://{bucket_name}/{top_chart_details_files[0]}",
        top_chart_table_ref,
        job_config=top_chart_job_config
    )

    # Wait for the job to complete
    load_job.result()

    # Check the result
    apptweak_details_table = bq_client.get_table(top_chart_table_ref)
    print(f"Loaded {apptweak_details_table.num_rows} rows into {dataset_id}:top_chart.")


Loaded 283399 rows into top_chart:top_chart.
