## **Loading Data to BigQuery**


<table align="left">
  <td style="text-align: center">
    <a href="https://colab.research.google.com/github/smvinodkumar910/market-mirror/blob/dev/load_data.ipynb">
      <img width="32px" src="https://www.gstatic.com/pantheon/images/bigquery/welcome_page/colab-logo.svg" alt="Google Colaboratory logo"><br> Run in Colab
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/colab/import/https:%2F%2Fraw.githubusercontent.com%2Fsmvinodkumar910%2Fmarket-mirror%2Fdev%2Fload_data.ipynb">
      <img width="32px" src="https://lh3.googleusercontent.com/JmcxdQi-qOpctIvWKgPtrzZdJJK-J3sWE1RsfjZNwshCFgE_9fULcNpuXYTilIR2hjwN" alt="Google Cloud Colab Enterprise logo"><br> Run in Colab Enterprise
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/smvinodkumar910/market-mirror/dev/load_data.ipynb">
      <img src="https://lh3.googleusercontent.com/UiNooY4LUgW_oTvpsNhPpQzsstV5W8F7rYgxgGBD85cWJoLmrOzhVs_ksK_vgx40SHs7jCqkTkCk=e14-rj-sc0xffffff-h130-w32" alt="Vertex AI logo"><br> Open in Vertex AI Workbench
    </a>
  </td>    
  <td style="text-align: center">
    <a href="https://github.com/smvinodkumar910/market-mirror/blob/dev/load_data.ipynb">
      <img width="32px" src="https://www.svgrepo.com/download/217753/github.svg" alt="GitHub logo"><br> View on GitHub
    </a>
  </td>
</table>

### Authenticate your notebook environment (Colab only)

If you are running this notebook on Google Colab, run the following cell to authenticate your environment.


In [1]:
import sys

# This cell is a no-op unless the google.colab module has already been loaded
# into the running interpreter (the notebook's "Colab only" step).
if "google.colab" in sys.modules:
    from google.colab import auth
    from google.colab import output

    # Authenticate the notebook user for the Google Cloud calls made below.
    auth.authenticate_user()
    # Support for third party widgets
    output.enable_custom_widget_manager()



### Setting-up Environment

In [2]:
import os

PROJECT_ID = "market-mirror-dev"  # @param {type: "string", placeholder: "[your-project-id]", isTemplate: true}
BUCKET_NAME = "marke-mirror-dev-data"  # @param {type: "string", placeholder: "[your-bucket-name]", isTemplate: true}
LOCATION = "us-central1"  # @param {type: "string", placeholder: "[your-region]", isTemplate: true}

# When a form field is blank or still holds its template placeholder, fall back
# to the corresponding environment variable.
if not PROJECT_ID or PROJECT_ID == "[your-project-id]":
    # Deliberately not wrapped in str(): a missing variable would otherwise
    # become the literal string "None" and be used as the project id.
    PROJECT_ID = os.environ.get("GOOGLE_CLOUD_PROJECT")
    if not PROJECT_ID:
        raise ValueError(
            "PROJECT_ID is not set: fill in the form field or export GOOGLE_CLOUD_PROJECT."
        )

if not LOCATION or LOCATION == "[your-region]":
    # Region has a safe default, so no error is needed here.
    LOCATION = os.environ.get("GOOGLE_CLOUD_REGION", "us-central1")


In [3]:
# BigQuery dataset ids for the three layers used by this notebook. All three are
# created in the "Prepare BigQuery Datasets" cell; only the bronze dataset is
# written to by the load steps further down.
BQ_BRONZE_DATASET = "APP_MARKET_BRONZE" # @param {type: "string", placeholder: "[bronze-dataset]", isTemplate: true}
BQ_SILVER_DATASET = "APP_MARKET_SILVER" # @param {type: "string", placeholder: "[silver-dataset]", isTemplate: true}
BQ_GOLD_DATASET = "APP_MARKET_GOLD" # @param {type: "string", placeholder: "[gold-dataset]", isTemplate: true}

#### Prepare GCS

In [4]:
from google.cloud import storage
from google.cloud.exceptions import NotFound

gcs_client = storage.Client(project=PROJECT_ID)

# Reuse the bucket when it is already there; otherwise create it in this project.
try:
    databucket = gcs_client.get_bucket(BUCKET_NAME)
except NotFound:
    databucket = gcs_client.create_bucket(BUCKET_NAME, project=PROJECT_ID)

# Reached only when one of the two calls above succeeded, so a usable bucket
# handle exists. The upload cells check this flag before writing.
bucket_exists = True

#### Prepare BigQuery Datasets

In [5]:
from google.cloud import bigquery

bq_client = bigquery.Client(project=PROJECT_ID)

# Create the bronze and silver datasets. exists_ok=True is passed so that a
# dataset that is already present is not reported as an error.
for dataset_id in (BQ_BRONZE_DATASET, BQ_SILVER_DATASET):
    bq_client.create_dataset(dataset_id, exists_ok=True)

# Kept as the cell's final expression so the notebook still echoes the gold
# dataset handle.
bq_client.create_dataset(BQ_GOLD_DATASET, exists_ok=True)

Dataset(DatasetReference('market-mirror-dev', 'APP_MARKET_GOLD'))

### Data Load Steps

#### Data Definitions

We are going to use 4 Kaggle Datasets for this project.

1. https://www.kaggle.com/datasets/pratyushpuri/multilingual-mobile-app-reviews-dataset-2025
2. https://www.kaggle.com/datasets/lava18/google-play-store-apps
3. https://www.kaggle.com/datasets/ramamet4/app-store-apple-data-set-10k-apps
4. https://www.kaggle.com/datasets/quadeer15sh/windows-store-top-apps-games

#### Download Data from Kaggle

In [6]:
import kagglehub
import os, glob

# Kaggle dataset pages whose files are uploaded under the "product_dataset" prefix.
product_datasets_list = [
    'https://www.kaggle.com/datasets/maryamsayagh1/google-play-store-apps',
    'https://www.kaggle.com/datasets/ramamet4/app-store-apple-data-set-10k-apps',
    'https://www.kaggle.com/datasets/quadeer15sh/windows-store-top-apps-games',
]

# Kaggle dataset pages whose files are uploaded under the "review_dataset" prefix.
reviews_datasets_list = [
    'https://www.kaggle.com/datasets/lava18/google-play-store-apps',
    'https://www.kaggle.com/datasets/marianna13/google-play-reviews',
]

In [7]:
# Download each review dataset and remember which local files belong to it.
review_local_paths = []
for dataset_url in reviews_datasets_list:
    # Strip the site prefix so only the "<owner>/<dataset>" part is passed on.
    dataset_handle = dataset_url.replace('https://www.kaggle.com/datasets/', '')
    download_dir = kagglehub.dataset_download(dataset_handle)
    review_local_paths.append(
        {
            # Last URL segment, used later as the folder name in the bucket.
            'dataset_name': dataset_url.split('/')[-1],
            # Every top-level entry of the download directory.
            'path': glob.glob(pathname=os.path.join(download_dir, '*')),
        }
    )

In [8]:
# Download each product dataset and remember which local files belong to it.
product_local_paths = []
for dataset_url in product_datasets_list:
    # Strip the site prefix so only the "<owner>/<dataset>" part is passed on.
    dataset_handle = dataset_url.replace('https://www.kaggle.com/datasets/', '')
    download_dir = kagglehub.dataset_download(dataset_handle)
    product_local_paths.append(
        {
            # Last URL segment, used later as the folder name in the bucket.
            'dataset_name': dataset_url.split('/')[-1],
            # Every top-level entry of the download directory.
            'path': glob.glob(pathname=os.path.join(download_dir, '*')),
        }
    )

#### Upload Data to GCS

In [9]:
# Upload the review dataset files to GCS and collect their gs:// URIs.
review_gcs_files = []
if bucket_exists:
    for dataset_files in review_local_paths:
        dataset_name = dataset_files.get('dataset_name')
        for path in dataset_files.get('path'):
            # The glob('*') listing may include sub-directories, which
            # upload_from_filename cannot upload; skip anything that is not a file.
            if not os.path.isfile(path):
                continue
            # basename works with the local OS separator, unlike split('/').
            file_name = os.path.basename(path)
            # Object names always use '/', so join explicitly rather than with
            # os.path.join, which would use the local OS separator.
            destination_blob_name = '/'.join(('review_dataset', dataset_name, file_name))
            destination_blob = databucket.blob(destination_blob_name)
            destination_blob.upload_from_filename(path)
            gcs_uri = f"gs://{BUCKET_NAME}/{destination_blob_name}"
            review_gcs_files.append(gcs_uri)
            print(f"File {path} uploaded to {gcs_uri}.")

File /root/.cache/kagglehub/datasets/lava18/google-play-store-apps/versions/6/googleplaystore_user_reviews.csv uploaded to gs://marke-mirror-dev-data/review_dataset/google-play-store-apps/googleplaystore_user_reviews.csv.
File /root/.cache/kagglehub/datasets/lava18/google-play-store-apps/versions/6/googleplaystore.csv uploaded to gs://marke-mirror-dev-data/review_dataset/google-play-store-apps/googleplaystore.csv.
File /root/.cache/kagglehub/datasets/lava18/google-play-store-apps/versions/6/license.txt uploaded to gs://marke-mirror-dev-data/review_dataset/google-play-store-apps/license.txt.
File /root/.cache/kagglehub/datasets/marianna13/google-play-reviews/versions/1/google_play_reviews.csv uploaded to gs://marke-mirror-dev-data/review_dataset/google-play-reviews/google_play_reviews.csv.


In [10]:
# Upload the product dataset files to GCS and collect their gs:// URIs.
product_gcs_files = []
if bucket_exists:
    for dataset_files in product_local_paths:
        dataset_name = dataset_files.get('dataset_name')
        for path in dataset_files.get('path'):
            # The glob('*') listing may include sub-directories, which
            # upload_from_filename cannot upload; skip anything that is not a file.
            if not os.path.isfile(path):
                continue
            # basename works with the local OS separator, unlike split('/').
            file_name = os.path.basename(path)
            # Object names always use '/', so join explicitly rather than with
            # os.path.join, which would use the local OS separator.
            destination_blob_name = '/'.join(('product_dataset', dataset_name, file_name))
            destination_blob = databucket.blob(destination_blob_name)
            destination_blob.upload_from_filename(path)
            gcs_uri = f"gs://{BUCKET_NAME}/{destination_blob_name}"
            product_gcs_files.append(gcs_uri)
            print(f"File {path} uploaded to {gcs_uri}.")

File /root/.cache/kagglehub/datasets/maryamsayagh1/google-play-store-apps/versions/1/cleanapp.csv uploaded to gs://marke-mirror-dev-data/product_dataset/google-play-store-apps/cleanapp.csv.
File /root/.cache/kagglehub/datasets/ramamet4/app-store-apple-data-set-10k-apps/versions/7/AppleStore.csv uploaded to gs://marke-mirror-dev-data/product_dataset/app-store-apple-data-set-10k-apps/AppleStore.csv.
File /root/.cache/kagglehub/datasets/ramamet4/app-store-apple-data-set-10k-apps/versions/7/appleStore_description.csv uploaded to gs://marke-mirror-dev-data/product_dataset/app-store-apple-data-set-10k-apps/appleStore_description.csv.
File /root/.cache/kagglehub/datasets/quadeer15sh/windows-store-top-apps-games/versions/1/windows_store.csv uploaded to gs://marke-mirror-dev-data/product_dataset/windows-store-top-apps-games/windows_store.csv.


#### Write Data to BigQuery Bronze Layer

In [11]:
import bigframes.pandas as bpd

# Point BigQuery DataFrames at this notebook's project and at the bronze dataset.
bigframes_bq_options = bpd.options.bigquery
bigframes_bq_options.project = PROJECT_ID
bigframes_bq_options.dataset = BQ_BRONZE_DATASET

In [12]:
# Keep only CSV uploads whose file stem contains "review". The stem (file name
# up to its first dot) doubles as the target table name.
review_gcs_files_filtered = []
for gcs_uri in review_gcs_files:
    file_stem = gcs_uri.split('/')[-1].split('.')[0]
    if gcs_uri.endswith('.csv') and 'review' in file_stem:
        review_gcs_files_filtered.append({'file_name': file_stem, 'gcs_path': gcs_uri})

In [13]:
# Display the filtered review files (table name and GCS path) before loading them.
review_gcs_files_filtered

[{'file_name': 'googleplaystore_user_reviews',
  'gcs_path': 'gs://marke-mirror-dev-data/review_dataset/google-play-store-apps/googleplaystore_user_reviews.csv'},
 {'file_name': 'googleplaystore',
  'gcs_path': 'gs://marke-mirror-dev-data/review_dataset/google-play-store-apps/googleplaystore.csv'},
 {'file_name': 'google_play_reviews',
  'gcs_path': 'gs://marke-mirror-dev-data/review_dataset/google-play-reviews/google_play_reviews.csv'}]

In [14]:
# Read each filtered review CSV from GCS and write it to a table of the same
# name in the bronze dataset, replacing any table that is already there.
for review_file in review_gcs_files_filtered:
    review_df = bpd.read_csv(review_file.get('gcs_path'))
    bronze_table_id = f'{BQ_BRONZE_DATASET}.{review_file.get("file_name")}'
    review_df.to_gbq(bronze_table_id, if_exists='replace')


  _global_session = bigframes.session.connect(


In [None]:
# Display the gs:// URIs collected by the product upload cell.
product_gcs_files

['gs://marke-mirror-dev-data/product_dataset/google-play-store-apps/cleanapp.csv',
 'gs://marke-mirror-dev-data/product_dataset/app-store-apple-data-set-10k-apps/AppleStore.csv',
 'gs://marke-mirror-dev-data/product_dataset/app-store-apple-data-set-10k-apps/appleStore_description.csv',
 'gs://marke-mirror-dev-data/product_dataset/windows-store-top-apps-games/windows_store.csv']

In [None]:
# Keep only the CSV uploads. The file stem (file name up to its first dot)
# doubles as the target table name.
product_gcs_files_filtered = []
for gcs_uri in product_gcs_files:
    if gcs_uri.endswith('.csv'):
        file_stem = gcs_uri.split('/')[-1].split('.')[0]
        product_gcs_files_filtered.append({'file_name': file_stem, 'gcs_path': gcs_uri})

In [None]:
from google.cloud.dataproc_spark_connect import DataprocSparkSession
from google.cloud.dataproc_v1 import Session


# Obtain a Spark session from the default builder; no extra configuration is
# applied here.
spark_session_builder = DataprocSparkSession.builder
spark = spark_session_builder.getOrCreate()




██████████████████████████████████████████████████████████▋                     







In [None]:
# Load each product CSV with Spark and overwrite the matching bronze table.
for product_file in product_gcs_files_filtered:
    table_name = product_file.get('file_name')
    print(table_name)

    # multiLine plus a double-quote escape character: presumably needed for
    # quoted fields that span lines or contain quotes - TODO confirm against the data.
    csv_reader = (
        spark.read
        .option("multiLine", "true")
        .option("quote", "\"")
        .option("escape", '"')
    )
    product_df = csv_reader.csv(
        product_file.get('gcs_path'),
        inferSchema=True,
        header=True,
    )

    # Map every column to a name with spaces and dots replaced by underscores
    # (presumably because the BigQuery sink rejects them - TODO confirm).
    sanitized_columns = {
        column: column.replace(' ', '_').replace('.', '_')
        for column in product_df.columns
    }
    product_df = product_df.withColumnsRenamed(sanitized_columns)

    bronze_table_id = f'{PROJECT_ID}.{BQ_BRONZE_DATASET}.{table_name}'
    product_df.write.mode("overwrite").format('bigquery').save(bronze_table_id)


cleanapp


AppleStore


appleStore_description


windows_store
