In [2]:
import pandas as pd

# Load the tab-separated Alexa review file and preview its first ten rows.
reviews_path = '/content/amazon_alexa.tsv'
data = pd.read_csv(reviews_path, sep='\t')
data.head(10)

Unnamed: 0,rating,date,variation,verified_reviews,feedback
0,5,31-Jul-18,Charcoal Fabric,Love my Echo!,1
1,5,31-Jul-18,Charcoal Fabric,Loved it!,1
2,4,31-Jul-18,Walnut Finish,"Sometimes while playing a game, you can answer...",1
3,5,31-Jul-18,Charcoal Fabric,I have had a lot of fun with this thing. My 4 ...,1
4,5,31-Jul-18,Charcoal Fabric,Music,1
5,5,31-Jul-18,Heather Gray Fabric,I received the echo as a gift. I needed anothe...,1
6,3,31-Jul-18,Sandstone Fabric,"Without having a cellphone, I cannot use many ...",1
7,5,31-Jul-18,Charcoal Fabric,I think this is the 5th one I've purchased. I'...,1
8,5,30-Jul-18,Heather Gray Fabric,looks great,1
9,5,30-Jul-18,Heather Gray Fabric,Love it! I’ve listened to songs I haven’t hear...,1


In [3]:
# Keep only the review text and its sentiment flag, under shorter names.
# rename() maps each column explicitly and returns a new frame, rather than
# overwriting `.columns` by position on a selection of `data`.
my_data = data[['verified_reviews', 'feedback']].rename(
  columns={'verified_reviews': 'review', 'feedback': 'label'}
)

my_data.head()

Unnamed: 0,review,label
0,Love my Echo!,1
1,Loved it!,1
2,"Sometimes while playing a game, you can answer...",1
3,I have had a lot of fun with this thing. My 4 ...,1
4,Music,1


In [4]:
# Number of rows for each value of `label`.
my_data.value_counts('label')

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
1,2893
0,257


In [5]:
# Balance the classes by randomly undersampling the most frequent label.
label_counts = my_data['label'].value_counts()

# Surplus of the largest class over the smallest one.
rows_to_drop = label_counts.max() - label_counts.min()

if rows_to_drop > 0:
  # Take the majority label from the counts instead of assuming it is 1.
  majority_label = label_counts.idxmax()
  data_majority = my_data[my_data['label'] == majority_label]
  # Fixed seed so the same rows are dropped on every run.
  rows_removed = data_majority.sample(rows_to_drop, random_state=42)
  data_balanced = my_data.drop(rows_removed.index)
else:
  data_balanced = my_data.copy()


# Check the new class balance

print(data_balanced['label'].value_counts())

label
1    257
0    257
Name: count, dtype: int64


In [6]:
#Data preprocessing

import re

def clean_text(text):
  """Normalize a review into lowercase, space-separated word tokens.

  Steps, in order: drop HTML tags, replace punctuation with spaces, drop
  stand-alone single ASCII letters, lowercase, collapse whitespace, trim.

  Args:
    text: The raw review. Non-string values are converted with str() first.

  Returns:
    The cleaned string.
  """
  text = str(text)

  # Tags have to go first: once punctuation is replaced, "<" and ">" are gone
  # and the tag name (e.g. "br") would be left behind as a word. Requiring a
  # letter after "<" keeps text such as "<3" from being read as a tag.
  text = re.sub(r"</?[a-zA-Z][^>]*>", " ", text)

  text = re.sub(r"[^\w\s]", " ", text)

  # [a-zA-Z], not [a-zA-z]: the A-z range also spans "[", "\", "]", "^", "_" and "`".
  text = re.sub(r"\b[a-zA-Z]\b", " ", text)

  text = text.lower()

  text = re.sub(r"\s+", " ", text)

  return text.strip()


In [7]:
# Run every raw review through clean_text and store the result next to it.
reviews = data_balanced['review'].tolist()
clean_reviews = list(map(clean_text, reviews))
data_balanced['clean_reviews'] = clean_reviews

In [8]:
# Show the balanced frame, now including the clean_reviews column.
data_balanced

Unnamed: 0,review,label,clean_reviews
0,Love my Echo!,1,love my echo
3,I have had a lot of fun with this thing. My 4 ...,1,have had lot of fun with this thing my 4 yr ol...
5,I received the echo as a gift. I needed anothe...,1,received the echo as gift needed another bluet...
13,"Love, Love, Love!!",1,love love love
33,The speakers sound pretty good for being so sm...,1,the speakers sound pretty good for being so sm...
...,...,...,...
3096,The product sounded the same as the emoji spea...,0,the product sounded the same as the emoji spea...
3101,these are additional alexa based devices to ma...,1,these are additional alexa based devices to ma...
3129,We have six of these throughout our home and t...,1,we have six of these throughout our home and t...
3136,I used it to control my smart home devices. Wo...,1,used it to control my smart home devices works...


In [9]:
#Data split

# 95% of the balanced rows go to the test set; the remaining 5% form the train set.
total_rows = len(data_balanced)
test_size = int(total_rows * 0.95)

# Seeded so the split is identical on every run.
test_set = data_balanced.sample(test_size, random_state=42)

train_set = data_balanced.drop(test_set.index)

In [10]:
import pathlib
import textwrap
import google.generativeai as genai
from IPython.display import display
from IPython.display import Markdown

In [11]:
def to_markdown(text):
  """Wrap `text` in a Markdown object rendered as a block quote.

  A space is inserted before every '*', then every line (blank ones
  included) is prefixed with '> '.
  """
  spaced = text.replace('*', ' *')
  quoted = textwrap.indent(spaced, '> ', predicate=lambda _:True)
  return Markdown(quoted)

In [12]:
!pip install openai==0.28

Collecting openai==0.28
  Downloading openai-0.28.0-py3-none-any.whl.metadata (13 kB)
Downloading openai-0.28.0-py3-none-any.whl (76 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.5/76.5 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: openai
Successfully installed openai-0.28.0


In [13]:
import openai
from google.colab import userdata

In [17]:
# Read the key from Colab's userdata store so it is not hardcoded in the notebook.
OPENAI_API_KEY=userdata.get('OPENAI_API_KEY')
# Hand the key to the openai module for the calls made below.
openai.api_key = OPENAI_API_KEY

In [18]:
def get_completion(prompt, model="gpt-3.5-turbo-1106"):
  """Send `prompt` as a single user message and return the first reply's text.

  The request is made with temperature 0.
  """
  chat_response = openai.ChatCompletion.create(
    model=model,
    messages=[{"role": "user", "content": prompt}],
    temperature=0,
  )
  first_choice = chat_response.choices[0]
  return first_choice.message["content"]

In [19]:
# Smoke test: send one simple question through get_completion.
prompt = "Why is the sky blue?"

chatgpt_response = get_completion(prompt)

In [None]:
# Show the reply text returned for the test prompt.
chatgpt_response

"The sky appears blue to our eyes because of the way the Earth's atmosphere scatters sunlight. The molecules in the Earth's atmosphere, particularly nitrogen and oxygen, scatter shorter wavelengths of light (blue and violet) more effectively than longer wavelengths (red and yellow). This scattering causes the blue light to be more visible to us, giving the sky its blue color."

In [20]:
#Batching API calls

# Number of rows and columns in the held-out test set.
test_set.shape

(488, 3)

In [21]:
# Evaluate on at most 100 held-out reviews; sample() raises when asked for
# more rows than the frame has, so cap the request at the frame's length.
eval_rows = min(100, len(test_set))
# Seeded so the same reviews are scored on every run. assign() adds the empty
# prediction column on the new frame returned by sample().
test_set_total = test_set.sample(eval_rows, random_state=42).assign(pred_label='')

test_set_total

Unnamed: 0,review,label,clean_reviews,pred_label
2250,Like,1,like,
613,All the new Amazon products I have are great e...,0,all the new amazon products have are great eve...,
351,I love the Echo Dot. So easy. So fun. I get to...,1,love the echo dot so easy so fun get to drop i...,
1906,"It does not speak in Spanish,,,I bought it for...",0,it does not speak in spanish bought it for my ...,
1289,All is fine with the Spot exact for one massiv...,0,all is fine with the spot exact for one massiv...,
...,...,...,...,...
1435,"Love, love, LOVE!!!",1,love love love,
2558,"For the price, the product is nice quality and...",0,for the price the product is nice quality and ...,
2815,I am really enjoying the many different featur...,1,am really enjoying the many different features...,
112,i liked the sound . what is troubling is that ...,1,liked the sound what is troubling is that paid...,


In [22]:
# Cut the evaluation frame into consecutive chunks of at most `batch_size` rows.
batch_size = 50
batches = [
  test_set_total.iloc[start:start + batch_size]
  for start in range(0, len(test_set_total), batch_size)
]

In [23]:
import time

def gpt_completion_function(batch, current_batch, total_batch, model="gpt-3.5-turbo-1106"):
  """Ask the chat model to fill in `pred_label` for one batch of reviews.

  Prints a progress line and the full prompt, sends the prompt as a single
  user message with temperature 0, sleeps 5 seconds, and returns the reply text.

  Args:
    batch: DataFrame with 'clean_reviews' and 'pred_label' columns.
    current_batch: Zero-based position of this batch (printed as current_batch + 1).
    total_batch: Total number of batches; used only in the progress line.
    model: Name of the chat model to call.

  Returns:
    The content of the first choice in the response.
  """
  print(f"Now processing batch:{current_batch + 1} of {total_batch}")

  # Only these two columns are serialized, one JSON object per row.
  json_data = batch[['clean_reviews', 'pred_label']].to_json(orient='records')

  prompt = f""" You are an expert linguist, who is good at classifying customer review sentiments into
  Positive/Negative labels. Help me classify customer reviews into: Positive(label=1), and Negative(label=0).
  Customer reviews are provided between three backticks below"
  In your output, only return the Json code back as output - which is provided between three backticks.
  Your task is to update predicted labels under 'pred_label' in the Json code.
  Don't make any changes to Json code format, please.
  Error handling instructions: In cas a Customer Review violates API policy, please assign it default sentiment as Negative(label=0)

  ```
  {json_data}
  ```
  """

  print(prompt)

  completion = openai.ChatCompletion.create(
    model=model,
    messages=[{"role": "user", "content": prompt}],
    temperature=0,
  )
  # Pause after every request before handing the reply back.
  time.sleep(5)
  reply = completion.choices[0].message["content"]
  return reply




In [24]:
# Send the batches one at a time and keep the raw reply for each, in order.
batch_count = len(batches)
responses = []

for batch_index, batch in enumerate(batches):
  responses.append(gpt_completion_function(batch, batch_index, batch_count))

Now processing batch:1 of 2
 You are an expert linguist, who is good at classifying customer review sentiments into
  Positive/Negative labels. Help me classify customer reviews into: Positive(label=1), and Negative(label=0).
  Customer reviews are provided between three backticks below"
  In your output, only return the Json code back as output - which is provided between three backticks.
  Your task is to update predicted labels under 'pred_label' in the Json code.
  Don't make any changes to Json code format, please.
  Error handling instructions: In cas a Customer Review violates API policy, please assign it default sentiment as Negative(label=0)

  ```
  [{"clean_reviews":"like","pred_label":""},{"clean_reviews":"all the new amazon products have are great everything bought refurbished is complete garbage will never purchase amazon refurbished again total dissatisfaction","pred_label":""},{"clean_reviews":"love the echo dot so easy so fun get to drop in on my grandaughter who lives

In [25]:
import json

# Parse each batch reply into records and build the predictions frame once.
predicted_records = []

for response in responses:
  # The reply may be wrapped in a Markdown fence, with or without a language
  # tag (```json ... ```); peel both off before parsing.
  json_data = response.strip().strip("`").strip()
  if json_data[:4].lower() == "json":
    json_data = json_data[4:]

  # Collect into a list (and not into `data`, which holds the raw reviews
  # frame) instead of growing a DataFrame with concat on every iteration.
  predicted_records.extend(json.loads(json_data))

df_total = pd.DataFrame(predicted_records)

print(df_total)

                                        clean_reviews  pred_label
0                                                like           1
1   all the new amazon products have are great eve...           0
2   love the echo dot so easy so fun get to drop i...           1
3   it does not speak in spanish bought it for my ...           0
4   all is fine with the spot exact for one massiv...           0
..                                                ...         ...
95                                     love love love           1
96  for the price the product is nice quality and ...           0
97  am really enjoying the many different features...           1
98  liked the sound what is troubling is that paid...           0
99  works great to control the tv lights and vario...           1

[100 rows x 2 columns]


In [26]:
# Copy the parsed predictions from df_total into test_set_total's pred_label column.
# .values drops df_total's index, so rows are matched purely by position.
# NOTE(review): this relies on the replies listing the reviews in the order they were sent — confirm.

test_set_total['pred_label'] = df_total['pred_label'].values
test_set_total

Unnamed: 0,review,label,clean_reviews,pred_label
2250,Like,1,like,1
613,All the new Amazon products I have are great e...,0,all the new amazon products have are great eve...,0
351,I love the Echo Dot. So easy. So fun. I get to...,1,love the echo dot so easy so fun get to drop i...,1
1906,"It does not speak in Spanish,,,I bought it for...",0,it does not speak in spanish bought it for my ...,0
1289,All is fine with the Spot exact for one massiv...,0,all is fine with the spot exact for one massiv...,0
...,...,...,...,...
1435,"Love, love, LOVE!!!",1,love love love,1
2558,"For the price, the product is nice quality and...",0,for the price the product is nice quality and ...,0
2815,I am really enjoying the many different featur...,1,am really enjoying the many different features...,1
112,i liked the sound . what is troubling is that ...,1,liked the sound what is troubling is that paid...,0


In [27]:
from sklearn.metrics import confusion_matrix, accuracy_score

# Compare the ground-truth labels with the model's predictions.
y_true = test_set_total["label"]
y_pred = test_set_total["pred_label"]

# In the confusion matrix, rows are true labels and columns are predicted labels.
label_confusion = confusion_matrix(y_true, y_pred)
overall_accuracy = accuracy_score(y_true, y_pred)

print(label_confusion)
print(f"\nAccuracy: {overall_accuracy}")

[[44  0]
 [ 7 49]]

Accuracy: 0.93
