## Process Data

### Setting-up Environment

In [1]:
import os

PROJECT_ID = "market-mirror-dev"  # @param {type: "string", placeholder: "[your-project-id]", isTemplate: true}
BUCKET_NAME = "marke-mirror-dev-data"  # @param {type: "string", placeholder: "[your-bucket-name]", isTemplate: true}
LOCATION = "US"  # @param {type: "string", placeholder: "[your-region]", isTemplate: true}
if not PROJECT_ID or PROJECT_ID == "[your-project-id]":
    PROJECT_ID = str(os.environ.get("GOOGLE_CLOUD_PROJECT"))

if not LOCATION or LOCATION == "[your-region]":
    LOCATION = os.environ.get("GOOGLE_CLOUD_REGION", "US")


In [2]:
os.environ['GOOGLE_CLOUD_PROJECT'] = PROJECT_ID
os.environ['GOOGLE_CLOUD_REGION'] = LOCATION

In [3]:
BQ_BRONZE_DATASET = "APP_MARKET_BRONZE" # @param {type: "string", placeholder: "[bronze-dataset]", isTemplate: true}
BQ_SILVER_DATASET = "APP_MARKET_SILVER" # @param {type: "string", placeholder: "[silver-dataset]", isTemplate: true}
BQ_GOLD_DATASET = "APP_MARKET_GOLD" # @param {type: "string", placeholder: "[gold-dataset]", isTemplate: true}

In [4]:
import bigframes.pandas as bpd
import bigframes.bigquery as bbq
from bigframes.ml import llm

bpd.options.bigquery.project = PROJECT_ID
bpd.options.bigquery.location = LOCATION

In [5]:
review_df = bpd.read_gbq(f'{PROJECT_ID}.{BQ_SILVER_DATASET}.T_APP_REVIEWS')

In [6]:
review_df.info()

<class 'bigframes.dataframe.DataFrame'>
Index: 69183 entries, 0 to 69182
Data columns (total 6 columns):
  #  Column       Non-Null Count    Dtype
---  -----------  ----------------  -------
  0  id           4888 non-null     Int64
  1  app_name     69183 non-null    string
  2  app_genre    3018 non-null     string
  3  review_text  42315 non-null    string
  4  rating       4888 non-null     Int64
  5  sentiment    37432 non-null    string
dtypes: Int64(2), string(4)
memory usage: 3874248 bytes


In [7]:
review_df.isna().sum()

id             64295
app_name           0
app_genre      66165
review_text    26868
rating         64295
sentiment      31751
dtype: Int64

In [8]:
review_df_removed_null_review = review_df[~review_df['review_text'].isna()]

In [9]:
review_df_removed_null_review.isna().sum()

id             37427
app_name           0
app_genre      39297
review_text        0
rating         37427
sentiment       4888
dtype: Int64

In [10]:
google_apps_df = bpd.read_gbq(f'{PROJECT_ID}.{BQ_SILVER_DATASET}.T_GOOGLE_APP_DETAILS',
                              columns=['title','genre']).drop_duplicates()

In [11]:
review_df_removed_null_review['join']= review_df_removed_null_review.app_name.str.strip().str.lower()
google_apps_df['join']= google_apps_df.title.str.strip().str.lower()
merged_review_df = bpd.merge(review_df_removed_null_review, google_apps_df, on='join', how='left')

In [12]:
merged_review_df['app_genre'] = merged_review_df['app_genre'].fillna(merged_review_df['genre'])

In [13]:
merged_review_df.isna().sum()

id             37581
app_name           0
app_genre      38535
review_text        0
rating         37581
sentiment       4891
join               0
title          41406
genre          41406
dtype: Int64

In [14]:
genre_list = google_apps_df.genre.drop_duplicates().to_list()

In [15]:
genre_list.extend(merged_review_df[~merged_review_df.app_genre.isna()].app_genre.drop_duplicates().to_list())

In [16]:
genre_list = list(set(genre_list))

In [17]:
len(genre_list)

49

In [18]:
genre_list_concat = ','.join(genre_list)

In [19]:
unique_apps_without_genre = merged_review_df[merged_review_df.app_genre.isna()].app_name.drop_duplicates().to_frame(name='app_name')

In [31]:
unique_apps_without_genre['prompt'] ="""predict which genre the given app belongs to.Pick the most possible genre.
app_name : """+ unique_apps_without_genre.app_name + """
Reply with just the genre predicted for the given app name. genre can be any one from the following list - genre_list:""" + genre_list_concat

In [22]:
unique_apps_without_genre[0:5]

Unnamed: 0,app_name,prompt
0,BEST CAR SOUNDS,predict which genre the given app belongs to. ...
1,BeyondMenu Food Delivery,predict which genre the given app belongs to. ...
2,591房屋交易-租屋、中古屋、新建案、實價登錄、別墅透天、公寓套房、捷運、買房賣房行情、房價...,predict which genre the given app belongs to. ...
4,Calorie Counter & Diet Tracker,predict which genre the given app belongs to. ...
5,Foursquare Swarm: Check In,predict which genre the given app belongs to. ...


In [24]:
model = llm.GeminiTextGenerator(model_name='gemini-2.0-flash-001', connection_name='us.vertex-remote-models')

In [32]:
llm_response = model.predict(X=unique_apps_without_genre,
              output_schema={'app': 'string', 'app_genre': 'string'})


`db_dtypes` is a preview feature and subject to change.


In [33]:
llm_response.head()

Unnamed: 0,app,app_genre,full_response,status,app_name,prompt
0,BEST CAR SOUNDS,Auto & Vehicles,"{""candidates"":[{""avg_logprobs"":-0.005561906557...",,BEST CAR SOUNDS,predict which genre the given app belongs to.P...
1,BeyondMenu Food Delivery,Food & Drink,"{""candidates"":[{""avg_logprobs"":-0.000159307383...",,BeyondMenu Food Delivery,predict which genre the given app belongs to.P...
2,591房屋交易-租屋、中古屋、新建案、實價登錄、別墅透天、公寓套房、捷運、買房賣房行情、房價...,House & Home,"{""candidates"":[{""avg_logprobs"":-1.609029255412...",,591房屋交易-租屋、中古屋、新建案、實價登錄、別墅透天、公寓套房、捷運、買房賣房行情、房價...,predict which genre the given app belongs to.P...
4,,Health & Fitness,"{""candidates"":[{""avg_logprobs"":-0.011345310012...",,Calorie Counter & Diet Tracker,predict which genre the given app belongs to.P...
5,Foursquare Swarm: Check In,Social,"{""candidates"":[{""avg_logprobs"":-0.005781240761...",,Foursquare Swarm: Check In,predict which genre the given app belongs to.P...


In [34]:
llm_response_subset = llm_response[['app_name','app_genre']]
llm_response_subset = llm_response_subset.rename(columns={'app_genre':'app_genre_llm'})

In [48]:
merged_review_df_with_genre = bpd.merge(merged_review_df,llm_response_subset,on='app_name',how='left')


In [49]:
merged_review_df_with_genre['app_genre'] = merged_review_df_with_genre['app_genre'].fillna(merged_review_df_with_genre['app_genre_llm'])

In [50]:
merged_review_df_with_genre.isna().sum()

id               37581
app_name             0
app_genre            0
review_text          0
rating           37581
sentiment         4891
join                 0
title            41406
genre            41406
app_genre_llm     3850
dtype: Int64

In [51]:
merged_review_df_with_genre = merged_review_df_with_genre.drop(columns=['rating','join','title','genre','app_genre_llm'])



In [52]:
merged_review_df_with_genre.to_gbq(destination_table=f"{PROJECT_ID}.{BQ_SILVER_DATASET}.T_APP_REVIEWS_CLEANED",if_exists='replace')

'market-mirror-dev.APP_MARKET_SILVER.T_APP_REVIEWS_CLEANED'

In [41]:
## use genAI to predict app_genre
## add additional column on language of the review_text with GenAI
## update sentiment analysis for the records with null


In [53]:
merged_review_df_with_genre.isna().sum()

id             37581
app_name           0
app_genre          0
review_text        0
sentiment       4891
dtype: Int64

In [54]:
merged_review_df_with_genre = merged_review_df_with_genre.reset_index()
merged_review_df_with_genre = merged_review_df_with_genre.drop(columns=['id']).rename(columns={'index':'id'})

In [55]:
merged_review_df_with_genre.to_gbq(destination_table=f"{PROJECT_ID}.{BQ_SILVER_DATASET}.T_APP_REVIEWS_CLEANED", if_exists='replace')

'market-mirror-dev.APP_MARKET_SILVER.T_APP_REVIEWS_CLEANED'

In [47]:
merged_review_df_with_genre.sentiment.value_counts()

sentiment
Positive    24081
Negative     8342
Neutral      5158
Name: count, dtype: Int64

In [56]:
%%bigquery
create or replace table `market-mirror-dev.APP_MARKET_SILVER.T_APP_REVIEWS_SENTIMENTS`
AS
select id, app_name, review_text,
AI.GENERATE(prompt=>concat('check the sentiment of the user review for the app and give me answer in just one word on whether the review is [Positive   Negative or Neutral app_name: ', app_name, '\r\n review_text : ', review_text ),
connection_id => 'us.vertex-remote-models',
endpoint => 'gemini-2.5-flash'
).result
 from `market-mirror-dev.APP_MARKET_SILVER.T_APP_REVIEWS_CLEANED`
where sentiment is null --and id in (27394, 27294, 41092)
;



Query is running:   0%|          |

In [57]:
%%bigquery
update `market-mirror-dev.APP_MARKET_SILVER.T_APP_REVIEWS_CLEANED` tgt
set tgt.sentiment = src.result
from `market-mirror-dev.APP_MARKET_SILVER.T_APP_REVIEWS_SENTIMENTS` src
where tgt.id = src.id
and tgt.sentiment is null;


Query is running:   0%|          |

In [58]:
review_df = bpd.read_gbq(f"{PROJECT_ID}.{BQ_SILVER_DATASET}.T_APP_REVIEWS_CLEANED")

In [59]:
review_df.isna().sum()

id             0
app_name       0
app_genre      0
review_text    0
sentiment      0
dtype: Int64

In [60]:
%%bigquery
update `market-mirror-dev.APP_MARKET_SILVER.T_APP_REVIEWS_CLEANED` set
sentiment='Negative' where sentiment='[Negative]';

Query is running:   0%|          |