## Process Data

### Setting Up the Environment

In [None]:
import os

# Notebook-level settings. The `# @param` markers stay on the assignment lines unchanged.
PROJECT_ID = "market-mirror-dev"  # @param {type: "string", placeholder: "[your-project-id]", isTemplate: true}
# NOTE(review): "marke-mirror-dev-data" looks like a typo for "market-mirror-dev-data" —
# confirm the real bucket name before changing it (the value is not used in the visible cells).
BUCKET_NAME = "marke-mirror-dev-data"  # @param {type: "string", placeholder: "[your-bucket-name]", isTemplate: true}
LOCATION = "US"  # @param {type: "string", placeholder: "[your-region]", isTemplate: true}

# Fall back to the environment when the form value is empty or still the placeholder.
if not PROJECT_ID or PROJECT_ID == "[your-project-id]":
    PROJECT_ID = os.environ.get("GOOGLE_CLOUD_PROJECT", "")
    if not PROJECT_ID:
        # Wrapping a missing variable in str() would yield the literal project id "None"
        # and only fail much later with a confusing BigQuery error.
        raise ValueError(
            "PROJECT_ID is not set: fill in the form field or export GOOGLE_CLOUD_PROJECT."
        )

if not LOCATION or LOCATION == "[your-region]":
    LOCATION = os.environ.get("GOOGLE_CLOUD_REGION", "US")


In [None]:
# Publish the resolved project and region through the process environment.
os.environ.update(
    {
        'GOOGLE_CLOUD_PROJECT': PROJECT_ID,
        'GOOGLE_CLOUD_REGION': LOCATION,
    }
)

In [None]:
# BigQuery dataset names, one per layer (bronze / silver / gold).
# Only BQ_SILVER_DATASET is referenced by the cells visible in this notebook section.
BQ_BRONZE_DATASET = "APP_MARKET_BRONZE" # @param {type: "string", placeholder: "[bronze-dataset]", isTemplate: true}
BQ_SILVER_DATASET = "APP_MARKET_SILVER" # @param {type: "string", placeholder: "[silver-dataset]", isTemplate: true}
BQ_GOLD_DATASET = "APP_MARKET_GOLD" # @param {type: "string", placeholder: "[gold-dataset]", isTemplate: true}

In [None]:
# BigQuery DataFrames: pandas-style API (bpd), BigQuery helpers (bbq) and the LLM wrappers.
# NOTE(review): `bbq` is not referenced in the cells visible here — confirm it is still needed.
import bigframes.pandas as bpd
import bigframes.bigquery as bbq
from bigframes.ml import llm

# Point the bigframes session at the project and location configured above.
bpd.options.bigquery.project = PROJECT_ID
bpd.options.bigquery.location = LOCATION

In [None]:
# Load the silver-layer app reviews table as a BigQuery DataFrame.
review_df = bpd.read_gbq(
    f"{PROJECT_ID}.{BQ_SILVER_DATASET}.T_APP_REVIEWS"
)

In [None]:
# Column dtypes and non-null counts of the raw reviews table.
review_df.info()

<class 'bigframes.dataframe.DataFrame'>
Index: 69183 entries, 0 to 69182
Data columns (total 6 columns):
  #  Column       Non-Null Count    Dtype
---  -----------  ----------------  -------
  0  id           4888 non-null     Int64
  1  app_name     69183 non-null    string
  2  app_genre    3018 non-null     string
  3  review_text  42315 non-null    string
  4  rating       4888 non-null     Int64
  5  sentiment    37432 non-null    string
dtypes: Int64(2), string(4)
memory usage: 3874248 bytes


In [None]:
# Null count per column before any cleaning.
review_df.isna().sum()

id             64295
app_name           0
app_genre      66165
review_text    26868
rating         64295
sentiment      31751
dtype: Int64

In [None]:
# Keep only the rows that actually carry review text.
review_df_removed_null_review = review_df[review_df['review_text'].notna()]

In [None]:
# Null count per column after dropping rows without review text.
review_df_removed_null_review.isna().sum()

id             37427
app_name           0
app_genre      39297
review_text        0
rating         37427
sentiment       4888
dtype: Int64

In [None]:
# Title -> genre lookup from the Google app details table, with exact duplicate
# (title, genre) pairs removed.
google_apps_df = bpd.read_gbq(
    f"{PROJECT_ID}.{BQ_SILVER_DATASET}.T_GOOGLE_APP_DETAILS",
    columns=["title", "genre"],
).drop_duplicates()

In [None]:
# Build a case- and whitespace-insensitive join key on both sides.
review_df_removed_null_review['join'] = review_df_removed_null_review.app_name.str.strip().str.lower()
google_apps_df['join'] = google_apps_df.title.str.strip().str.lower()

# `google_apps_df` is only unique on (title, genre). Titles that differ just by case or
# whitespace, or that carry more than one genre, share a join key and would duplicate
# review rows in the left join. Keep one lookup row per key so the merge preserves the
# number of reviews. Where a key maps to several genres, the first one is kept.
google_apps_lookup = google_apps_df.drop_duplicates(subset=['join'])

merged_review_df = bpd.merge(
    review_df_removed_null_review,
    google_apps_lookup,
    on='join',
    how='left',
)

In [None]:
# Where the review has no genre, take the one found in the app details lookup.
merged_review_df['app_genre'] = merged_review_df['app_genre'].fillna(
    value=merged_review_df['genre']
)

In [None]:
# Null count per column after the genre back-fill from the app details lookup.
merged_review_df.isna().sum()

id             37581
app_name           0
app_genre      38535
review_text        0
rating         37581
sentiment       4891
join               0
title          41406
genre          41406
dtype: Int64

In [None]:
# Start the candidate genre list from the distinct genres in the app details table.
genre_list = google_apps_df['genre'].drop_duplicates().to_list()

In [None]:
# Add the distinct non-null genres already present on the reviews.
genre_list += merged_review_df['app_genre'].dropna().drop_duplicates().to_list()

In [None]:
# De-duplicate the combined genre list.
# Sorting keeps the order stable across kernel restarts (plain set iteration order is
# not), so the LLM prompt built from this list is reproducible. Non-string entries
# such as nulls are dropped because they would break the later ','.join.
genre_list = sorted({genre for genre in genre_list if isinstance(genre, str) and genre})

In [None]:
# Number of distinct candidate genres.
len(genre_list)

49

In [None]:
# Comma-separated genre list, embedded in the LLM prompt further down.
genre_list_concat = str.join(',', genre_list)

In [None]:
# Distinct app names that still have no genre after the lookup join.
unique_apps_without_genre = (
    merged_review_df[merged_review_df['app_genre'].isna()]
    ['app_name']
    .drop_duplicates()
    .to_frame(name='app_name')
)

In [None]:
# Per-app instruction text: a fixed preamble, the app name, and a fixed closing line.
unique_apps_without_genre['prompt'] = (
    "predict which genre the given app belongs to.\napp_name = "
    + unique_apps_without_genre['app_name']
    + "\nReply with just the genre predicted for the given app name."
)

In [None]:
# (rows, columns) of the frame holding the apps that still need a genre.
unique_apps_without_genre.shape

(1395, 2)

In [None]:
# Gemini text model reached through the 'us.vertex-remote-models' BigQuery connection.
# NOTE(review): bigframes warns that 'gemini-2.0-flash' is not in its list of known
# model names (e.g. 'gemini-2.0-flash-001') — consider pinning an explicit version.
model = llm.GeminiTextGenerator(
    connection_name='us.vertex-remote-models',
    model_name='gemini-2.0-flash',
)

of the following models: gemini-1.5-pro-preview-0514,
gemini-1.5-flash-preview-0514, gemini-1.5-pro-001, gemini-1.5-pro-002,
gemini-1.5-flash-001, gemini-1.5-flash-002, gemini-2.0-flash-exp,
gemini-2.0-flash-001, gemini-2.0-flash-lite-001. However, model names
can change, and the supported models may be outdated. You should use
this model name only if you are sure that it is supported in BigQuery.


In [None]:
# Ask Gemini for a genre for every app that still has none.
# When `prompt=` is given, the model input is built from the listed items only; the
# existing 'prompt' column of X is not sent. The per-app prompt series therefore has
# to be passed explicitly, otherwise the model only sees the genre list and never
# the app name it is meant to classify.
llm_response = model.predict(
    X=unique_apps_without_genre,
    prompt=[
        "the genre can be any one from the following list - genre_list:" + genre_list_concat,
        unique_apps_without_genre['prompt'],
    ],
    ground_with_google_search=False,
    output_schema={'app': 'string', 'app_genre': 'string'},
)


GeminiTextGenerator for Multimodal prompts. GeminiTextGenerator is
known to support the following models for Multimodal prompts:
gemini-1.5-pro-001, gemini-1.5-pro-002, gemini-1.5-flash-001,
gemini-1.5-flash-002, gemini-2.0-flash-exp, gemini-2.0-flash-001,
gemini-2.0-flash-lite-001. If you proceed with 'gemini-2.0-flash', it
might not work as expected or could lead to errors with multimodal
inputs.


`db_dtypes` is a preview feature and subject to change.


In [None]:
# Peek at the first model responses to sanity-check the predicted genres.
llm_response.head(5)

Unnamed: 0,app,app_genre,full_response,status,app_name,prompt
0,,Puzzle,"{""candidates"":[{""avg_logprobs"":-0.072641881612...",,BEST CAR SOUNDS,{'input_0': 'the genre can be any one from the...
1,,Strategy,"{""candidates"":[{""avg_logprobs"":-0.078300393544...",,BeyondMenu Food Delivery,{'input_0': 'the genre can be any one from the...
2,,Strategy,"{""candidates"":[{""avg_logprobs"":-0.126589692555...",,591房屋交易-租屋、中古屋、新建案、實價登錄、別墅透天、公寓套房、捷運、買房賣房行情、房價...,{'input_0': 'the genre can be any one from the...
4,,Strategy,"{""candidates"":[{""avg_logprobs"":-0.103402623763...",,Calorie Counter & Diet Tracker,{'input_0': 'the genre can be any one from the...
5,,Puzzle,"{""candidates"":[{""avg_logprobs"":-0.055890564735...",,Foursquare Swarm: Check In,{'input_0': 'the genre can be any one from the...


In [None]:
# Keep just the app name and the predicted genre, renamed so it does not clash with
# the existing 'app_genre' column in the merge that follows.
llm_response_subset = (
    llm_response[['app_name', 'app_genre']]
    .rename(columns={'app_genre': 'app_genre_llm'})
)

In [None]:
# Attach the LLM-predicted genre to every review of the same app.
merged_review_df_with_genre = merged_review_df.merge(
    llm_response_subset,
    on='app_name',
    how='left',
)


In [None]:
# Fill the remaining genre gaps with the LLM prediction.
merged_review_df_with_genre['app_genre'] = merged_review_df_with_genre['app_genre'].fillna(
    value=merged_review_df_with_genre['app_genre_llm']
)

In [None]:
# Null count per column after the LLM back-fill of app_genre.
merged_review_df_with_genre.isna().sum()

id               37581
app_name             0
app_genre            0
review_text          0
rating           37581
sentiment         4891
join                 0
title            41406
genre            41406
app_genre_llm     3850
dtype: Int64

In [None]:
# Persist the cleaned reviews without the helper columns used for the joins.
# `if_exists='replace'` keeps this cell re-runnable: without it the write is rejected
# once the destination table exists. It also matches the later write to the same table.
merged_review_df_with_genre.drop(columns=['join', 'title', 'genre', 'app_genre_llm']).to_gbq(
    destination_table=f"{PROJECT_ID}.{BQ_SILVER_DATASET}.T_APP_REVIEWS_CLEANED",
    if_exists='replace',
)

'market-mirror-dev.APP_MARKET_SILVER.T_APP_REVIEWS_CLEANED'

In [None]:
## TODO / notes:
## - use GenAI to predict app_genre
## - add an additional column with the language of the review_text using GenAI
## - update the sentiment analysis for the records where sentiment is null


In [None]:
# Drop the helper columns introduced by the lookup join and the LLM merge.
merged_review_df_with_genre = merged_review_df_with_genre.drop(
    columns=['join', 'title', 'genre', 'app_genre_llm'],
)

In [None]:
# Null count per column once the helper columns are gone.
merged_review_df_with_genre.isna().sum()

id             37581
app_name           0
app_genre          0
review_text        0
rating         37581
sentiment       4891
dtype: Int64

In [None]:
# (rows, columns) of the cleaned frame.
merged_review_df_with_genre.shape

(42472, 6)

In [None]:
# Turn the current index into a regular column; it becomes the new 'id' in the next step.
merged_review_df_with_genre = merged_review_df_with_genre.reset_index(drop=False)

In [None]:
# Swap the original 'id' column for the one produced by the index reset.
merged_review_df_with_genre = (
    merged_review_df_with_genre
    .drop(columns=['id'])
    .rename(columns={'index': 'id'})
)

In [None]:
# Overwrite the cleaned reviews table with the re-keyed frame.
merged_review_df_with_genre.to_gbq(
    destination_table=f"{PROJECT_ID}.{BQ_SILVER_DATASET}.T_APP_REVIEWS_CLEANED",
    if_exists='replace',
)

'market-mirror-dev.APP_MARKET_SILVER.T_APP_REVIEWS_CLEANED'

In [None]:
# Distribution of the existing sentiment labels (rows with a null sentiment are not listed).
merged_review_df_with_genre.sentiment.value_counts()

sentiment
Positive    24081
Negative     8342
Neutral      5158
Name: count, dtype: Int64

In [None]:
%%bigquery
# Step 1 (commented out): generate a sentiment with AI.GENERATE for every cleaned review
# whose sentiment is null and store the results in T_APP_REVIEWS_SENTIMENTS.
# NOTE(review): because this step is commented out, the UPDATE at the bottom relies on
# T_APP_REVIEWS_SENTIMENTS already existing from an earlier run.
# NOTE(review): the prompt text below opens a '[' before "Positive" that is never closed
# and has no separator between "Positive" and "Negative" — confirm the intended wording.
# NOTE(review): project and dataset names are hard-coded in this cell rather than taken
# from PROJECT_ID / BQ_SILVER_DATASET.
# create or replace table `market-mirror-dev.APP_MARKET_SILVER.T_APP_REVIEWS_SENTIMENTS`
# AS
# select id, app_name, review_text,
# AI.GENERATE(prompt=>concat('check the sentiment of the user review for the app and give me answer in just one word on whether the review is [Positive   Negative or Neutral app_name: ', app_name, '\r\n review_text : ', review_text ),
# connection_id => 'us.vertex-remote-models',
# endpoint => 'gemini-2.5-flash'
# ).result
#  from `market-mirror-dev.APP_MARKET_SILVER.T_APP_REVIEWS_CLEANED`
# where sentiment is null --and id in (27394, 27294, 41092)
# ;


# Step 2: copy the generated sentiment onto the cleaned reviews, matching on id and
# touching only the rows whose sentiment is still null.
# NOTE(review): the generated text is written as-is; presumably it is exactly
# Positive / Negative / Neutral — verify, or normalise it before the update.
update `market-mirror-dev.APP_MARKET_SILVER.T_APP_REVIEWS_CLEANED` tgt
set tgt.sentiment = src.result
from `market-mirror-dev.APP_MARKET_SILVER.T_APP_REVIEWS_SENTIMENTS` src
where tgt.id = src.id
and tgt.sentiment is null;
