# Installation and Setup

In [1]:
%%capture
# Install the transformers library and the TensorFlow Model Garden package
# (tf-models-official); %%capture hides the pip output of this cell.
!pip install transformers
!pip install -q tf-models-official
# A dependency of the preprocessing for BERT inputs
!pip install -q -U tensorflow-text

In [2]:
## for deep learning
from tensorflow.keras import models, layers, preprocessing as kprocessing
from tensorflow.keras import backend as K
## for bert language model
import transformers

import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
from official.nlp import optimization  # to create AdamW optimizer

import matplotlib.pyplot as plt

# Set the TensorFlow logger level to 'ERROR' to cut down on log noise in cell output.
tf.get_logger().setLevel('ERROR')

In [3]:
## for data
import json
import pandas as pd
import numpy as np
## for plotting
import matplotlib.pyplot as plt
import seaborn as sns
## for processing
import re
import nltk
import os
import shutil
import time

# Read Data



In [4]:
## Google Colab - Google Drive connection instructions

# This step only needs to be done the first time you read something from the shared project folder.
# Navigate to "Shared with me" on your Google Drive home screen.
# Right-click "NLP-Group1-FinalProj" (the primary folder for this project) and select "Add shortcut to Drive".
# This lets the code below reach the project folder through your own Drive.


# Mount your personal Google Drive into the Colab notebook (force_remount re-mounts if already mounted)
from google.colab import drive
drive.mount('/content/drive',force_remount=True)

Mounted at /content/drive


In [5]:
# Load the processed grocery reviews (gzip-compressed CSV) from the shared Drive folder.
# NOTE(review): no row limit is passed to read_csv, so the whole file is read —
# any 500,000-review cap is not applied in this cell.
groc_data_part = (
    pd.read_csv(
        'drive/My Drive/NLP-Group1-FinalProj/Data/Processed/groc_amz_data_processed.csv.gz',
        sep=',',
        compression="gzip",
    )
    # Discard the first column by position (presumably a saved index column — confirm).
    .pipe(lambda reviews: reviews.drop(columns=reviews.columns[0]))
)

# Load Model and Prep

In [7]:
# Path of the saved model on Google Drive.
# The path is relative, while Drive is mounted at '/content/drive', so it only
# resolves when the working directory is '/content'.
saved_model_path = 'drive/My Drive/NLP-Group1-FinalProj/Models/Bert_IMDB_tuned_model2'

In [8]:
# Load the trained model from the SavedModel directory at saved_model_path
bert_cm2_classifier = tf.saved_model.load(saved_model_path)

In [9]:
# Function to wrap the steps in predicting and producing a dataframe output

def predict_review_sentiment(rev_list):
  """Score a batch of review texts with the loaded BERT classifier.

  Args:
    rev_list: review texts as a pandas Series, list, tuple or 1-D array of
      strings. Any existing Series index is discarded.

  Returns:
    A DataFrame with a fresh 0..n-1 index and two columns:
    'review_text' (the input texts) and 'review_sentiment_score'
    (sigmoid of the classifier output, one value per review).

  Raises:
    ValueError: if the classifier does not return exactly one score per review.
  """
  # Normalise the input to a Series with a clean positional index, so plain
  # lists work and sliced Series line up with the scores in the concat below.
  rev_series = pd.Series(rev_list, name='review_text').reset_index(drop=True)

  # Nothing to score: return an empty frame with the usual columns instead of
  # sending an empty tensor to the model.
  if rev_series.empty:
    return pd.DataFrame({'review_text': rev_series,
                         'review_sentiment_score': pd.Series(dtype=float)})

  # Run the model on input and produce a tensor of sentiment scores
  rev_sentiment_scores = tf.sigmoid(bert_cm2_classifier(tf.constant(rev_series.tolist())))

  # Flatten to exactly one score per review. Unlike squeeze(), this keeps a
  # 1-D array for a single-review batch and raises if the counts disagree.
  flat_scores = rev_sentiment_scores.numpy().reshape(len(rev_series))
  rev_sent_series = pd.Series(flat_scores, name='review_sentiment_score')

  # Compile them to output a dataframe
  pred_output = pd.concat([rev_series,rev_sent_series],axis=1)

  return pred_output

# Prediction

## Sample check

In [None]:
def print_my_examples(inputs, results):
  """Print each input text next to its score, one per line, then a blank line.

  Args:
    inputs: iterable of review texts (list or pandas Series).
    results: iterable of per-review results whose first element is the score
      (e.g. a 2-D tensor or a list of one-element lists).

  Returns:
    None; the lines are written to stdout only.
  """
  # Pair texts and scores by position. Indexing inputs[i] would be a label
  # lookup on a pandas Series and fail for any index that does not start at 0.
  # Pairing stops at the shorter of the two iterables.
  result_for_printing = [
      f'input: {review:<30} : score: {score[0]:.6f}'
      for review, score in zip(inputs, results)
  ]
  print(*result_for_printing, sep='\n')
  print()

In [None]:
# Score the trial reviews with the loaded model; sigmoid is applied to the model output.
# NOTE(review): groc_data_trial is only assigned in a later cell ("Trial Data"),
# so this cell fails on a fresh top-to-bottom run.
reloaded_results = tf.sigmoid(bert_cm2_classifier(tf.constant(groc_data_trial.review_text_processed)))

print('Results from the saved model:')
# print_my_examples has no return statement, so `output` is always None.
output = print_my_examples(groc_data_trial.review_text_processed, reloaded_results)

Results from the saved model:
input: no sugar, gmo garbage, fillers come store bought extracts. this stuff amazing. i use everything baking cooking even suggested coffee saying lot i normally care flavored coffee! you cannot go wrong this. i've ordered merchant before, customer satisfaction priority service quick, shipped right tracking even! i'll buying gls goods again! i use vanilla! : score: 0.998977
input: this absolute, undisputed favorite tea right now. i love darjeeling, i'm wildly fond lighter, first flush ones delicate. this darjeeling, especially steeped while, good tannic bite. it's bright warm time, pretty much explodes classic 'darjeeling' flavor. it's even remotely delicate, neither hard-edged. it's sort like good-looking men bespoke suits -- strong refined. i use boiling water, steep 4-5 minutes, large mug use one splenda tiny splash milk. then get way, tea, i take world! : score: 0.999460
input: i ordered spongbob slippers i got john cena happy ... son looking forward s

## Trial Data

In [None]:
# Take the first 10 rows as a small trial set.
# NOTE(review): `groc_data` is not assigned in the visible cells (the load cell
# creates `groc_data_part`) — confirm where it is defined.
groc_data_trial = groc_data.head(10)

In [None]:
# Score the trial reviews; the returned DataFrame is shown as the cell output.
predict_review_sentiment(groc_data_trial.review_text_processed)

Unnamed: 0,review_text,review_sentiment_score
0,"no sugar, gmo garbage, fillers come store boug...",0.998977
1,"this absolute, undisputed favorite tea right n...",0.99946
2,i ordered spongbob slippers i got john cena ha...,0.00422
3,the cart fine works purpose i bought it. (farm...,0.75719
4,this product archer farms best drink mix ever....,0.999249
5,"don't buy item - rip price. my bad, mistake. p...",0.00156
6,my wife picked sale. i usually drink crystal l...,0.031676
7,i bought sale (2 $4confusion local supermarket...,0.166192
8,i martini local distillery used bit wormwood b...,0.993452
9,"i bought part xmas gift, i needed look nice. t...",0.267561


## On entire data

In [None]:
# Score every review in batches, checkpointing each batch to a rolling CSV so
# an interrupted run keeps what was already scored.
# NOTE(review): `rolling_file_name` and `groc_data` are not assigned in the
# visible cells — they must be defined before this cell is run.
column_names = ["review_text","review_sentiment_score"]
batch_size = 100  # number of reviews sent to the model per call

review_texts = groc_data.review_text_processed
scored_batches = []

try:
  # Stepping with range() covers every row, including a final partial batch
  # (a loop bounded by int(len / batch_size) would silently drop the remainder).
  for lb in range(0, len(review_texts), batch_size):
    ub = min(lb + batch_size, len(review_texts))

    batch_sentiment = predict_review_sentiment(review_texts.iloc[lb:ub])

    # Label rows with their overall position so the appended CSV carries the
    # same running index a single accumulated frame would have.
    batch_sentiment.index = range(lb, lb + len(batch_sentiment))

    # Checkpoint: start the file (with header) on the first batch, then append.
    # This avoids rewriting the whole accumulated frame after every batch.
    first_batch = lb == 0
    batch_sentiment.to_csv(rolling_file_name,
                           mode='w' if first_batch else 'a',
                           header=first_batch)
    scored_batches.append(batch_sentiment)

    print(lb,ub)

finally:
  # Build the in-memory result once, even if a batch raised part-way through.
  # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
  if scored_batches:
    groc_data_sentiment = pd.concat(scored_batches, ignore_index=True)
  else:
    groc_data_sentiment = pd.DataFrame(columns = column_names)

# Recompile the whole dataset

In [90]:
# CSV files to recombine (presumably the rolling outputs of the batch-scoring
# runs — confirm). Only file names are given, so they are resolved against the
# current working directory when read.
name_list = ['groc_data_CM2_output_running_20211205-03.44.csv',
'groc_data_CM2_output_running_20211205-03.48.csv',
'groc_data_CM2_output_running_20211205-04.19.csv',
'groc_data_CM2_output_running_20211205-05.01.csv',
'groc_data_CM2_output_running_20211205-05.48.csv',
'groc_data_CM2_output_running_20211205-06.12.csv',
'groc_data_CM2_output_running_20211205-06.28.csv',
'groc_data_CM2_output_running_20211205-07.49.csv',
'groc_data_CM2_output_running_20211205-08.37.csv',
'groc_data_CM2_output_running_20211205-08.38.csv']

In [None]:
# Read every file in name_list into its own DataFrame.
# NOTE(review): with index_col=None, an index column saved in a file comes back
# as an ordinary column; and if the files overlap, rows will be duplicated in
# the combined frame — confirm both are intended.
li = [pd.read_csv(filename, index_col=None, header=0) for filename in name_list]

# Stack the loaded frames into one DataFrame with a fresh index. The frames in
# `li` must be passed here: pd.concat raises a TypeError on file-name strings.
frame = pd.concat(li, axis=0, ignore_index=True)