# Installation and Setup

In [22]:
%%capture
# Install the Hugging Face transformers library
!pip install transformers
!pip install -q tf-models-official
# A dependency of the preprocessing for BERT inputs
!pip install -q -U tensorflow-text

In [23]:
## for deep learning
from tensorflow.keras import models, layers, preprocessing as kprocessing
from tensorflow.keras import backend as K
## for bert language model
import transformers

import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
from official.nlp import optimization  # to create AdamW optimizer

import matplotlib.pyplot as plt

tf.get_logger().setLevel('ERROR')

In [24]:
## for data
import json
import pandas as pd
import numpy as np
## for plotting
import matplotlib.pyplot as plt
import seaborn as sns
## for processing
import re
import nltk
import os
import shutil
import time

# Read Data



In [25]:
## Google Colab- Google Drive connections instructions 

# This step is only needed the first time you read something from the shared project folder.
# Navigate to "Shared with me" on your Google Drive home screen.
# Right-click "NLP-Group1-FinalProj" (the primary folder for this project) and select "Add shortcut to Drive".
# This lets the code below find the project folder through your own Drive.


# Mount your personal Google Drive into the Colab runtime at /content/drive
from google.colab import drive
# force_remount=True is passed on every run (presumably so the mount is redone even if Drive is already mounted — TODO confirm)
drive.mount('/content/drive',force_remount=True)

Mounted at /content/drive


In [26]:
# Load the processed grocery-review dataset (gzip-compressed CSV) from the shared Drive folder.
# NOTE(review): this cell was described as reading only 500,000 reviews, but no `nrows`
# limit is passed, so the whole file is loaded — confirm whether the file itself was
# already truncated upstream.
# The path is relative, so it resolves only when the working directory is /content
# (Drive is mounted at /content/drive).
groc_data_part = pd.read_csv('drive/My Drive/NLP-Group1-FinalProj/Data/Processed/groc_amz_data_processed.csv.gz', sep=',', compression="gzip")
# Drop the first column (presumably the index column written when the CSV was saved — TODO confirm)
groc_data_part = groc_data_part.drop(columns=groc_data_part.columns[0])

In [31]:
# Raw review text for row label 2002 (single label-based lookup instead of chained indexing)
groc_data_part.loc[2002, 'review_text']


'I just love this product-- Cafix Coffee Substitute Crystals . It dissolves completely , has a great taste and is caffeine free. What more could one want?'

In [32]:
# Same review after preprocessing (single label-based lookup instead of chained indexing)
groc_data_part.loc[2002, 'review_text_processed']

'i love product-- cafix coffee substitute crystals . it dissolves completely , great taste caffeine free. what could one want?'

In [33]:
# Same review from the stemmed column (single label-based lookup instead of chained indexing)
groc_data_part.loc[2002, 'review_text_processed_stem']

'i love product-- cafix coffe substitut crystal . it dissolv complet , great tast caffein free. what could one want?'

In [34]:
# Same review from the lemmatized column (single label-based lookup instead of chained indexing)
groc_data_part.loc[2002, 'review_text_processed_lemm']

'i love product-- cafix coffee substitute crystal . it dissolve completely , great taste caffeine free. what could one want?'

In [28]:
# Preview the raw text next to its three processed variants for the first 50 rows
groc_data_part.head(50)[[
    'review_text',
    'review_text_processed',
    'review_text_processed_stem',
    'review_text_processed_lemm',
]]

Unnamed: 0,review_text,review_text_processed,review_text_processed_stem,review_text_processed_lemm
0,"No sugar, no GMO garbage, no fillers that come...","no sugar, gmo garbage, fillers come store boug...","no sugar, gmo garbage, filler come store bough...","no sugar, gmo garbage, filler come store bough..."
1,"This is my absolute, undisputed favorite tea r...","this absolute, undisputed favorite tea right n...","thi absolute, undisput favorit tea right now. ...","this absolute, undisputed favorite tea right n..."
2,I ordered spongbob slippers and I got John Cen...,i ordered spongbob slippers i got john cena ha...,i order spongbob slipper i got john cena happi...,i ordered spongbob slipper i got john cena hap...
3,The cart is fine and works for the purpose for...,the cart fine works purpose i bought it. (farm...,the cart fine work purpos i bought it. (farmer...,the cart fine work purpose i bought it. (farme...
4,This product by Archer Farms is the best drink...,this product archer farms best drink mix ever....,thi product archer farm best drink mix ever. j...,this product archer farm best drink mix ever. ...
5,Don't buy this item - rip off at this price. ...,"don't buy item - rip price. my bad, mistake. p...","don't buy item - rip price. my bad, mistake. p...","don't buy item - rip price. my bad, mistake. p..."
6,My wife picked some of this up on sale. I usu...,my wife picked sale. i usually drink crystal l...,my wife pick sale. i usual drink crystal light...,my wife picked sale. i usually drink crystal l...
7,I bought these on sale (2 for $4) at my local ...,i bought sale (2 $4confusion local supermarket...,"i bought sale (2 $4confus local supermarket, $...",i bought sale (2 $4confusion local supermarket...
8,I had a martini at a local distillery that use...,i martini local distillery used bit wormwood b...,i martini local distilleri use bit wormwood bi...,i martini local distillery used bit wormwood b...
9,"I bought these to be part of an Xmas gift, so ...","i bought part xmas gift, i needed look nice. t...","i bought part xma gift, i need look nice. they...","i bought part xmas gift, i needed look nice. t..."


# Load Model and Prep

In [7]:
# Path for the saved model in GDrive.
# The path is relative, so it resolves only when the working directory is /content
# (Drive is mounted at /content/drive).
saved_model_path = 'drive/My Drive/NLP-Group1-FinalProj/Models/Bert_IMDB_tuned_model2'

In [8]:
# Load the trained model from the SavedModel directory; the returned object is called
# directly on a string tensor by the cells below.
bert_cm2_classifier = tf.saved_model.load(saved_model_path)

In [9]:
# Function to wrap the steps in predicting and producing a dataframe output

def predict_review_sentiment(rev_list):
  """Score review texts with the loaded BERT classifier.

  Args:
    rev_list: Review texts as a pandas Series (any index), list, tuple or
      array of strings.

  Returns:
    DataFrame with one row per input review, re-indexed from 0, with columns
    'review_text' (the input text) and 'review_sentiment_score' (sigmoid of
    the classifier output). Empty input yields an empty DataFrame with the
    same two columns.
  """
  # Normalise the input to a Series with a fresh 0..n-1 index. This accepts
  # plain lists (which have no reset_index) and keeps the text rows aligned
  # with the score rows when the caller passes a slice of a larger frame.
  rev_series = pd.Series(rev_list, name='review_text').reset_index(drop=True)

  # Nothing to score: skip the model call and return the expected schema.
  if rev_series.empty:
    return pd.DataFrame({'review_text': rev_series,
                         'review_sentiment_score': pd.Series(dtype='float32')})

  # Run the model on the input and squash its output into (0, 1).
  rev_sentiment_scores = tf.sigmoid(bert_cm2_classifier(tf.constant(rev_series.tolist())))

  # Flatten to 1-D. reshape(-1) is used instead of squeeze() because squeeze()
  # collapses a single-review batch all the way down to a 0-d array.
  rev_sent_series = pd.Series(rev_sentiment_scores.numpy().reshape(-1),
                              name='review_sentiment_score')

  # Compile them to output a dataframe
  pred_output = pd.concat([rev_series, rev_sent_series], axis=1)

  return pred_output

# Prediction

## Sample check

In [None]:
def print_my_examples(inputs, results):
  """Print each input text next to its sentiment score, one pair per line.

  Args:
    inputs: Iterable of review strings (list, tuple, or pandas Series with
      any index).
    results: Iterable of per-review score rows aligned with `inputs`; the
      first element of each row is formatted as the score.

  Returns:
    None. The lines are written to stdout, followed by a blank line.
  """
  # Pair items by position with zip rather than `inputs[i]`: integer indexing
  # on a pandas Series is label-based and raises KeyError when the index does
  # not start at 0 (e.g. a slice of a larger frame).
  result_for_printing = [
      f'input: {text:<30} : score: {row[0]:.6f}'
      for text, row in zip(inputs, results)
  ]
  print(*result_for_printing, sep='\n')
  print()

In [None]:
# Score the trial reviews with the reloaded model and print each review next to its score.
# NOTE: `groc_data_trial` is created in the "Trial Data" section further down, so that
# cell must be run before this one.
reloaded_results = tf.sigmoid(bert_cm2_classifier(tf.constant(groc_data_trial.review_text_processed)))

print('Results from the saved model:')
# print_my_examples only prints and returns None, so its result is not bound to a name.
print_my_examples(groc_data_trial.review_text_processed, reloaded_results)

Results from the saved model:
input: no sugar, gmo garbage, fillers come store bought extracts. this stuff amazing. i use everything baking cooking even suggested coffee saying lot i normally care flavored coffee! you cannot go wrong this. i've ordered merchant before, customer satisfaction priority service quick, shipped right tracking even! i'll buying gls goods again! i use vanilla! : score: 0.998977
input: this absolute, undisputed favorite tea right now. i love darjeeling, i'm wildly fond lighter, first flush ones delicate. this darjeeling, especially steeped while, good tannic bite. it's bright warm time, pretty much explodes classic 'darjeeling' flavor. it's even remotely delicate, neither hard-edged. it's sort like good-looking men bespoke suits -- strong refined. i use boiling water, steep 4-5 minutes, large mug use one splenda tiny splash milk. then get way, tea, i take world! : score: 0.999460
input: i ordered spongbob slippers i got john cena happy ... son looking forward s

## Trial Data

In [None]:
# First 10 reviews as a small trial set. The loaded frame is named `groc_data_part`;
# `groc_data` is never defined in this notebook, so the original line raises NameError
# on a fresh kernel.
groc_data_trial = groc_data_part.head(10)

In [None]:
# Score the processed text of the trial reviews; the returned DataFrame is shown as the cell output.
predict_review_sentiment(groc_data_trial.review_text_processed)

Unnamed: 0,review_text,review_sentiment_score
0,"no sugar, gmo garbage, fillers come store boug...",0.998977
1,"this absolute, undisputed favorite tea right n...",0.99946
2,i ordered spongbob slippers i got john cena ha...,0.00422
3,the cart fine works purpose i bought it. (farm...,0.75719
4,this product archer farms best drink mix ever....,0.999249
5,"don't buy item - rip price. my bad, mistake. p...",0.00156
6,my wife picked sale. i usually drink crystal l...,0.031676
7,i bought sale (2 $4confusion local supermarket...,0.166192
8,i martini local distillery used bit wormwood b...,0.993452
9,"i bought part xmas gift, i needed look nice. t...",0.267561


## On entire data

In [20]:
# Three short hand-written reviews for a quick sanity check of the classifier
rev_list = [
    'best product ever',
    'not the best',
    'not worth the money',
]

In [15]:
# Five longer, realistic reviews used as ad-hoc classifier inputs.
# NOTE(review): this cell has execution count 15 while the 3-item `rev_list` cell before it
# has count 20. On a top-to-bottom run this assignment overrides the short list, so the
# scoring cell that follows would receive these five reviews instead.
rev_list = ['This is my absolute, undisputed favorite tea right now. I love Darjeeling, but Im not wildly fond of the lighter, first flush ones for being too delicate. This Darjeeling, especially when steeped a while, has a good tannic bite. Its bright and warm all at the same time, and pretty much explodes with that classic Darjeeling flavor. Its not even remotely delicate, but neither is it hard-edged. Its sort of like good-looking men in bespoke suits -- strong but refined. I use boiling water, steep for 4-5 minutes, and with a large mug use one Splenda and just a tiny splash of milk. Then get out of my way, because with this tea, I can take on the world!',
            'And theyre pretty nice! One set looks like ice, and you get a set of four thrice. Kind of overpriced.',
            'The cart is fine and works for the purpose for which I bought it. (Farmerss Markets, etc) but it stinks like hell.  Even after having it in the open air for sometime, it still smells.  I made the mistake of putting it in my car and now I cant get the smell out.Other than that its fine for the price.',
            'Not the best chai taste. Pretty grainy. Is OK, at best. Have had better tasting ones.',
            'For six months, I have been struggling with inexplicable symptoms of what I thought was tendonitis.  Turns out that it was very likely gout caused by the heavy metals contamination in all of the GOL Raw Meal and Raw Protein that I was consuming.  I lost months of productivity, spent a lot of time and money attempting to solve my problem, and damaged my health.  I urge EXTREME CAUTION']

In [21]:
# Score the ad-hoc `rev_list` examples directly: the classifier output is passed through a
# sigmoid, giving one value in (0, 1) per review.
# NOTE(review): the saved output of this cell has 3 rows, which matches the 3-item
# `rev_list` (execution count 20), not the 5-review list defined immediately before it.
tf.sigmoid(bert_cm2_classifier(tf.constant(rev_list)))

<tf.Tensor: shape=(3, 1), dtype=float32, numpy=
array([[0.9836531 ],
       [0.03495288],
       [0.02288014]], dtype=float32)>

In [None]:
# Score every processed review in fixed-size batches. The `finally` clause rewrites the
# rolling CSV after every batch, even when scoring raises, so the results gathered so far
# are always on disk.
BATCH_SIZE = 100

# `rolling_file_name` is not assigned anywhere else in this notebook. If no earlier cell
# has set it, fall back to the timestamped naming pattern of the files listed in the
# "Recompile the whole dataset" section.
if 'rolling_file_name' not in globals():
  rolling_file_name = f"groc_data_CM2_output_running_{time.strftime('%Y%m%d-%H.%M')}.csv"

column_names = ["review_text","review_sentiment_score"]
groc_data_sentiment = pd.DataFrame(columns = column_names)

# The loaded frame is `groc_data_part`; `groc_data` is never defined in this notebook.
reviews = groc_data_part.review_text_processed
scored_batches = []

# range(..., BATCH_SIZE) also covers the final partial batch, which the previous
# `int(len / 100)` loop bound silently skipped.
for lb in range(0, len(reviews), BATCH_SIZE):
  ub = min(lb + BATCH_SIZE, len(reviews))

  try:
    # .iloc makes the positional slice explicit.
    scored_batches.append(predict_review_sentiment(reviews.iloc[lb:ub]))
    # DataFrame.append was removed in pandas 2.0; rebuild the combined frame with concat.
    groc_data_sentiment = pd.concat(scored_batches, ignore_index=True)

  finally:
    groc_data_sentiment.to_csv(rolling_file_name)

  print(lb,ub)

# Recompile the whole dataset

In [None]:
# Rolling output CSVs to recombine; the file names differ only in the HH.MM part of the timestamp
name_list = [
    f'groc_data_CM2_output_running_20211205-{hhmm}.csv'
    for hhmm in ('03.44', '03.48', '04.19', '05.01', '05.48',
                 '06.12', '06.28', '07.49', '08.37', '08.38')
]

In [None]:
# Read each rolling output CSV listed in `name_list` into its own DataFrame.
li = [pd.read_csv(filename, index_col=None, header=0) for filename in name_list]

# Stack the loaded frames into one DataFrame with a fresh 0..n-1 index.
# The original passed `name_list` (the filename strings) to pd.concat, which raises
# TypeError; the frames to combine are the ones collected in `li`.
frame = pd.concat(li, axis=0, ignore_index=True)