In [1]:
### run when it starts
pip install -q transformers

Note: you may need to restart the kernel to use updated packages.


In [8]:
from transformers import pipeline
import pandas as pd
import sqlalchemy
from dotenv import dotenv_values
from dotenv import load_dotenv
import sql_functions as sf
from sql_functions import get_sql_config, get_data, get_dataframe, get_engine

In [None]:
### need tensorflow > check if its latest
pip install --upgrade tensorflow

In [4]:
import tensorflow as tf

# Smoke test for the TensorFlow install: sum a 1000x1000 tensor of random
# normal samples and print the resulting scalar tensor. The value itself is
# random; the point is only that the computation runs.
random_matrix_sum = tf.reduce_sum(tf.random.normal([1000, 1000]))
print(random_matrix_sum)

tf.Tensor(-699.3589, shape=(), dtype=float32)


### 1. Sentiment Analysis

We tested several models for sentiment analysis of the Yelp reviews; in the end, we decided to move forward with the pipeline below.

In [6]:
# Build the sentiment pipeline from the selected Hugging Face model.
# Model card: https://huggingface.co/mrcaelumn/yelp_restaurant_review_sentiment_analysis
# No explicit task is passed, so transformers resolves it from the model itself.
sentiment_pipeline = pipeline(
    model="mrcaelumn/yelp_restaurant_review_sentiment_analysis",
)

In [None]:
# Query the original review data from the server.
review_pa = sf.get_dataframe('SELECT * FROM hh_analytics_23_3.yelp_review_pa')

# Columns of the result frame: the review's identifiers, its text, and the raw
# pipeline output for that text.
result_columns = ['business_id', 'review_id', 'text', 'result']

# Accumulate one record per successfully scored review in a plain list and
# build the DataFrame once at the end. Concatenating a one-row frame on every
# iteration re-copies the whole result each time, which is quadratic in the
# number of reviews.
sentiment_records = []

# Score each review individually so that one failing review cannot abort the run.
# The fields are unpacked before the try block so `text` is always bound when
# the error message below is formatted.
for business_id, review_id, text in zip(
    review_pa['business_id'], review_pa['review_id'], review_pa['text']
):
    try:
        # truncation=True is forwarded to the tokenizer so that reviews longer
        # than the model's maximum input length are cut to fit instead of
        # raising. If the tokenizer defines no maximum length, no truncation
        # happens and the handler below still skips the offending review.
        result = sentiment_pipeline(text, truncation=True)
    except Exception as e:
        # Best effort: report the review that failed and move on to the next one.
        print(f"Error processing text: {text}. Error: {e}")
        continue

    sentiment_records.append({
        'business_id': business_id,
        'review_id': review_id,
        'text': text,
        'result': result,
    })

# Passing `columns` keeps the expected schema even when no review was scored.
sentiment_result = pd.DataFrame(sentiment_records, columns=result_columns)

# Write the result to a local CSV file.
sentiment_result.to_csv('review_pa_sentiment.csv', index=False)
