### Working Environment

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
%cd /content/drive/MyDrive/1-GenAI-HandsOn/5-SentimentAnalysis-LLM
!ls

/content/drive/MyDrive/1-GenAI-HandsOn/5-SentimentAnalysis-LLM
amazon_alexa.tsv  amazon_f_handson.ipynb


### Import Dataset

In [3]:
import pandas as pd

data = pd.read_csv('amazon_alexa.tsv', sep='\t')
data.head(10)

Unnamed: 0,rating,date,variation,verified_reviews,feedback
0,5,31-Jul-18,Charcoal Fabric,Love my Echo!,1
1,5,31-Jul-18,Charcoal Fabric,Loved it!,1
2,4,31-Jul-18,Walnut Finish,"Sometimes while playing a game, you can answer...",1
3,5,31-Jul-18,Charcoal Fabric,I have had a lot of fun with this thing. My 4 ...,1
4,5,31-Jul-18,Charcoal Fabric,Music,1
5,5,31-Jul-18,Heather Gray Fabric,I received the echo as a gift. I needed anothe...,1
6,3,31-Jul-18,Sandstone Fabric,"Without having a cellphone, I cannot use many ...",1
7,5,31-Jul-18,Charcoal Fabric,I think this is the 5th one I've purchased. I'...,1
8,5,30-Jul-18,Heather Gray Fabric,looks great,1
9,5,30-Jul-18,Heather Gray Fabric,Love it! I’ve listened to songs I haven’t hear...,1


In [4]:
mydata = data[['verified_reviews','feedback']]
mydata.columns = ['review','label']

mydata.head()

Unnamed: 0,review,label
0,Love my Echo!,1
1,Loved it!,1
2,"Sometimes while playing a game, you can answer...",1
3,I have had a lot of fun with this thing. My 4 ...,1
4,Music,1


In [5]:
mydata.value_counts('label')

label
1    2893
0     257
dtype: int64

In [6]:
# Count the occurrences of each label
label_counts = mydata["label"].value_counts()

# Get the number of rows to drop from the majority class
rows_to_drop = label_counts.max() - label_counts.min()

# Drop rows from the majority class randomly
if rows_to_drop > 0:
   data_majority = mydata[mydata["label"] == 1]
   data_balanced = mydata.drop(data_majority.sample(rows_to_drop).index)
else:
   data_balanced = mydata.copy()

# Check the new class balance
print(data_balanced["label"].value_counts())

1    257
0    257
Name: label, dtype: int64


## Data Preprocessing

In [7]:
import re

def clean_text(text):
  # Remove special characters and punctuation
  text = re.sub(r"[^\w\s]", " ", text)

  # Remove single characters
  text = re.sub(r"\b[a-zA-Z]\b", " ", text)

  # Remove HTML tags
  text = re.sub(r"<[^>]*>", " ", text)

  # Lowercase the text
  text = text.lower()

  # Remove extra whitespace
  text = re.sub(r"\s+", " ", text)

  # Trim leading and trailing spaces
  text = text.strip()

  return text

In [8]:
import pandas as pd

# Extract the review column as a list
reviews = data_balanced['review'].tolist()

# Clean the text in the list
cleaned_reviews = [clean_text(review) for review in reviews]

# Add the cleaned reviews as a new column to the DataFrame
data_balanced['clean_reviews'] = cleaned_reviews

In [9]:
data_balanced

Unnamed: 0,review,label,clean_reviews
3,I have had a lot of fun with this thing. My 4 ...,1,have had lot of fun with this thing my 4 yr ol...
4,Music,1,music
30,Still learning all the capabilities...but so f...,1,still learning all the capabilities but so far...
43,Tried to play certain broadway shows like Came...,1,tried to play certain broadway shows like came...
44,Great,1,great
...,...,...,...
3096,The product sounded the same as the emoji spea...,0,the product sounded the same as the emoji spea...
3106,neat tool we enjoy it with the family,1,neat tool we enjoy it with the family
3109,Easy to set up and connect with smart devices....,1,easy to set up and connect with smart devices ...
3115,It is just not as loud as I thought it was goi...,1,it is just not as loud as thought it was going...


## Data Split

In [10]:
import pandas as pd

# Assuming your DataFrame is called "df"
total_rows = len(data_balanced)
test_size = int(total_rows * 0.95)

# Randomly sample train_size rows for the training set
test_set = data_balanced.sample(test_size)

# Get the remaining rows for the test set
train_set = data_balanced.drop(test_set.index)

## Sentiment w/ LLM

### Setting up Gemini API

In [11]:
!pip install -q -U google-generativeai

In [12]:
# Necessary packages
import pathlib
import textwrap

import google.generativeai as genai

from IPython.display import display
from IPython.display import Markdown


def to_markdown(text):
  text = text.replace('•', '  *')
  return Markdown(textwrap.indent(text, '> ', predicate=lambda _: True))

# Used to securely store your API key
from google.colab import userdata

In [13]:
# Or use `os.getenv('GOOGLE_API_KEY')` to fetch an environment variable.
GOOGLE_API_KEY=userdata.get('GOOGLE_API_KEY')

genai.configure(api_key=GOOGLE_API_KEY)

In [14]:
for m in genai.list_models():
  if 'generateContent' in m.supported_generation_methods:
    print(m.name)

models/gemini-pro
models/gemini-pro-vision


In [15]:
model = genai.GenerativeModel('gemini-pro')

In [16]:
%%time
response = model.generate_content("What is the meaning of life?")

to_markdown(response.text)

CPU times: user 120 ms, sys: 13 ms, total: 133 ms
Wall time: 9.62 s


> The meaning of life is a multifaceted concept that has been contemplated by philosophers, theologians, and individuals throughout history. There is no one definitive answer, as the meaning of life is a personal and subjective experience. However, some common themes that have emerged in discussions about the meaning of life include:
> 
> 1. **Purpose and Fulfillment:** Many people find meaning in their lives by pursuing goals and activities that bring them a sense of purpose and fulfillment. This could be through personal, professional, or creative endeavors, or by contributing to the well-being of others.
> 
> 2. **Relationships and Connection:** Human relationships and connections with others can be a significant source of meaning. Building strong and supportive relationships, whether with family, friends, or a community, can provide a sense of belonging, love, and shared experiences.
> 
> 3. **Values and Beliefs:** Living in accordance with one's values and beliefs can also contribute to a sense of meaning. This could involve striving for justice, equality, or other ethical ideals, or leading a life guided by spiritual or religious principles.
> 
> 4. **Growth and Learning:** The pursuit of knowledge, personal growth, and learning new skills can provide a sense of fulfillment and purpose. Continuously expanding one's horizons and challenging oneself intellectually can lead to a more meaningful life experience.
> 
> 5. **Contribution and Impact:** Making a positive impact on the world or leaving a legacy can be a source of meaning for many people. This could be through contributions to society, the environment, or future generations. It can involve volunteering, charitable work, or simply being a positive force in the lives of others.
> 
> 6. **Experiences and Moments:** Life experiences, both big and small, can contribute to a sense of meaning. Joyful moments, moments of awe and wonder, or overcoming challenges can all add depth and richness to life, making it more meaningful.
> 
> Ultimately, the meaning of life is a personal journey, and what brings meaning to one person may not be the same for another. It is a question that individuals may reflect on throughout their lives, and the answer may evolve over time.

#### Single API Call

In [17]:
test_set_sample = test_set.sample(20)

test_set_sample['pred_label'] = ''

test_set_sample

Unnamed: 0,review,label,clean_reviews,pred_label
66,Fast response which was amazing. Clear concis...,1,fast response which was amazing clear concise ...,
1820,The speakers on these devices are surprisingly...,1,the speakers on these devices are surprisingly...,
162,"Stopped working after 2 weeks ,didn't follow c...",0,stopped working after 2 weeks didn follow comm...,
835,"I have had for only a week, so I am still lear...",1,have had for only week so am still learning al...,
910,"Love these, great sound... easy to connect an...",1,love these great sound easy to connect and use,
1138,Love my Amazon products,1,love my amazon products,
2312,i do wish the dot could connect to the fire st...,1,do wish the dot could connect to the fire stic...,
1678,No YouTube,0,no youtube,
1240,I haven't figured out how to make or receive c...,0,haven figured out how to make or receive calls...,
1858,"Loved all about it, all I can do with it and i...",1,loved all about it all can do with it and it l...,


In [18]:
# Convert the DataFrame to JSON using the to_json() method

json_data = test_set_sample[['clean_reviews','pred_label']].to_json(orient='records')

# Print the JSON data
print(json_data)

[{"clean_reviews":"fast response which was amazing clear concise answers and sound quality is fantastic am still getting used to alexia and have not usde echo to its full extent","pred_label":""},{"clean_reviews":"the speakers on these devices are surprisingly good the functionality of each echo device is fantastic","pred_label":""},{"clean_reviews":"stopped working after 2 weeks didn follow commands really fun when it was working","pred_label":""},{"clean_reviews":"have had for only week so am still learning all that alexa can do it is learning process love the music can call up any time listen to podcasts and radio stations have just started keeping shopping lists and grocery lists which love don have to search for paper and pencil getting time and temperature are great do not have to wait for weather reports the sound is good am not fussy about woofers tweeters base etc the echo is fine for me this afternoon am going to try dropping in on my son when he gets home going to surprise h

In [19]:
prompt = f"""
You are an expert linguist, who is good at classifying customer review sentiments into Positive/Negative labels.
Help me classify customer reviews into: Positive(label=1), and Negative(label=0).
Customer reviews are provided between three back ticks.
In your output, only return the Json code back as output - which is provided between three backticks.
Your task is to update predicted labels under 'pred_label' in the Json code.
Don't make any changes to Json code format, please.

```
{json_data}
```
"""

print(prompt)


You are an expert linguist, who is good at classifying customer review sentiments into Positive/Negative labels.
Help me classify customer reviews into: Positive(label=1), and Negative(label=0).
Customer reviews are provided between three back ticks.
In your output, only return the Json code back as output - which is provided between three backticks.
Your task is to update predicted labels under 'pred_label' in the Json code.
Don't make any changes to Json code format, please.

```
[{"clean_reviews":"fast response which was amazing clear concise answers and sound quality is fantastic am still getting used to alexia and have not usde echo to its full extent","pred_label":""},{"clean_reviews":"the speakers on these devices are surprisingly good the functionality of each echo device is fantastic","pred_label":""},{"clean_reviews":"stopped working after 2 weeks didn follow commands really fun when it was working","pred_label":""},{"clean_reviews":"have had for only week so am still learni

In [20]:
response = model.generate_content(prompt)

print(response.text)

```
[{"clean_reviews":"fast response which was amazing clear concise answers and sound quality is fantastic am still getting used to alexia and have not usde echo to its full extent","pred_label":1},{"clean_reviews":"the speakers on these devices are surprisingly good the functionality of each echo device is fantastic","pred_label":1},{"clean_reviews":"stopped working after 2 weeks didn follow commands really fun when it was working","pred_label":0},{"clean_reviews":"have had for only week so am still learning all that alexa can do it is learning process love the music can call up any time listen to podcasts and radio stations have just started keeping shopping lists and grocery lists which love don have to search for paper and pencil getting time and temperature are great do not have to wait for weather reports the sound is good am not fussy about woofers tweeters base etc the echo is fine for me this afternoon am going to try dropping in on my son when he gets home going to surprise 

In [21]:
import json

# Clean the data by stripping the backticks
json_data = response.text.strip("`")

# Load the cleaned data and convert to DataFrame
data = json.loads(json_data)
df_sample = pd.DataFrame(data)

df_sample

Unnamed: 0,clean_reviews,pred_label
0,fast response which was amazing clear concise ...,1
1,the speakers on these devices are surprisingly...,1
2,stopped working after 2 weeks didn follow comm...,0
3,have had for only week so am still learning al...,1
4,love these great sound easy to connect and use,1
5,love my amazon products,1
6,do wish the dot could connect to the fire stic...,1
7,no youtube,0
8,haven figured out how to make or receive calls...,0
9,loved all about it all can do with it and it l...,1


In [22]:
# prompt: Overwrite pred_label from 'df' into pred_label in 'train_set_sample'

test_set_sample['pred_label'] = df_sample['pred_label'].values
test_set_sample

Unnamed: 0,review,label,clean_reviews,pred_label
66,Fast response which was amazing. Clear concis...,1,fast response which was amazing clear concise ...,1
1820,The speakers on these devices are surprisingly...,1,the speakers on these devices are surprisingly...,1
162,"Stopped working after 2 weeks ,didn't follow c...",0,stopped working after 2 weeks didn follow comm...,0
835,"I have had for only a week, so I am still lear...",1,have had for only week so am still learning al...,1
910,"Love these, great sound... easy to connect an...",1,love these great sound easy to connect and use,1
1138,Love my Amazon products,1,love my amazon products,1
2312,i do wish the dot could connect to the fire st...,1,do wish the dot could connect to the fire stic...,1
1678,No YouTube,0,no youtube,0
1240,I haven't figured out how to make or receive c...,0,haven figured out how to make or receive calls...,0
1858,"Loved all about it, all I can do with it and i...",1,loved all about it all can do with it and it l...,1


In [23]:
# Plotting confusion matrix on the predictions

from sklearn.metrics import confusion_matrix

y_true = test_set_sample["label"]
y_pred = test_set_sample["pred_label"]

confusion_matrix(y_true, y_pred)

array([[ 9,  0],
       [ 0, 11]])

### OpenAI API Config

In [24]:
!pip install openai==0.27.0

Collecting openai==0.27.0
  Downloading openai-0.27.0-py3-none-any.whl (70 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/70.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m70.1/70.1 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: openai
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
llmx 0.0.15a0 requires cohere, which is not installed.
llmx 0.0.15a0 requires tiktoken, which is not installed.[0m[31m
[0mSuccessfully installed openai-0.27.0


In [25]:
import openai
from google.colab import userdata

OPENAI_API_KEY=userdata.get('OPENAI_API_KEY')
openai.api_key  = OPENAI_API_KEY

In [26]:
def get_completion(prompt, model="gpt-3.5-turbo-1106"):

  messages = [{"role": "user", "content": prompt}]
  response = openai.ChatCompletion.create(model=model,messages=messages,temperature=0)

  return response.choices[0].message["content"]

In [27]:
prompt = "Why is the sky blue?"

chatgpt_response = get_completion(prompt)

In [28]:
chatgpt_response

"The sky appears blue to our eyes because of the way the Earth's atmosphere scatters sunlight. The molecules in the Earth's atmosphere, particularly nitrogen and oxygen, scatter shorter wavelengths of light (blue and violet) more effectively than longer wavelengths (red and yellow). This scattering causes the blue light to be more visible and gives the sky its blue color. This effect is known as Rayleigh scattering."

#### Batching API Calls (Single Shot)

In [29]:
test_set.shape

(488, 3)

In [30]:
test_set_total = test_set.sample(100)

test_set_total['pred_label'] = ''

test_set_total

Unnamed: 0,review,label,clean_reviews,pred_label
2786,Very convenient,1,very convenient,
1913,Excellent why didn’t I think it is in the begi...,1,excellent why didn think it is in the beginnin...,
2621,Love it,1,love it,
672,Bought this to go in my niece's room. You can'...,1,bought this to go in my niece room you can tel...,
2225,"The current demand for this stick, was too hig...",1,the current demand for this stick was too high...,
...,...,...,...,...
653,I wanted a white dot for my white bathroom. T...,0,wanted white dot for my white bathroom the top...,
857,"Stopped working after 2 weeks ,didn't follow c...",0,stopped working after 2 weeks didn follow comm...,
2005,Why do we need to buy a $100 hub to get it to ...,0,why do we need to buy 100 hub to get it to wor...,
1342,I am very excited and happy with this. It was ...,1,am very excited and happy with this it was bre...,


In [31]:
batches = []
batch_size = 50

for i in range(0, len(test_set_total), batch_size):
  batches.append(test_set_total[i : i + batch_size])  # Append batches instead of assigning

In [32]:
import time

def gpt_completion_function(batch,current_batch,total_batch,model="gpt-3.5-turbo-1106"):
  """Function works in three steps:
  # Step-1: Convert the DataFrame to JSON using the to_json() method.
  # Step-2: Preparing the Gemini Prompt
  # Step-3: Calling GPT API
  """

  print(f"Now processing batch#: {current_batch+1} of {total_batch}")

  json_data = batch[['clean_reviews','pred_label']].to_json(orient='records')

  prompt = f"""You are an expert linguist, who is good at classifying customer review sentiments into Positive/Negative labels.
  Help me classify customer reviews into: Positive(label=1), and Negative(label=0).
  Customer reviews are provided between three backticks below.
  In your output, only return the Json code back as output - which is provided between three backticks.
  Your task is to update predicted labels under 'pred_label' in the Json code.
  Don't make any changes to Json code format, please.
  Error handling instruction: In case a Customer Review violates API policy, please assign it default sentiment as Negative (label=0).

  ```
  {json_data}
  ```
  """

  print(prompt)

  messages = [{"role": "user", "content": prompt}]
  response = openai.ChatCompletion.create(model=model,messages=messages,temperature=0)
  time.sleep(5)
  return response.choices[0].message["content"]

In [33]:
batch_count = len(batches)
responses = []

for i in range(0,len(batches)):
  responses.append(gpt_completion_function(batches[i],i,batch_count))

Now processing batch#: 1 of 2
You are an expert linguist, who is good at classifying customer review sentiments into Positive/Negative labels.
  Help me classify customer reviews into: Positive(label=1), and Negative(label=0).
  Customer reviews are provided between three backticks below.
  In your output, only return the Json code back as output - which is provided between three backticks.
  Your task is to update predicted labels under 'pred_label' in the Json code.
  Don't make any changes to Json code format, please.
  Error handling instruction: In case a Customer Review violates API policy, please assign it default sentiment as Negative (label=0).

  ```
  [{"clean_reviews":"very convenient","pred_label":""},{"clean_reviews":"excellent why didn think it is in the beginning love it","pred_label":""},{"clean_reviews":"love it","pred_label":""},{"clean_reviews":"bought this to go in my niece room you can tell it refurbished it looks good and works like new","pred_label":""},{"clean_

In [34]:
import json

df_total = pd.DataFrame()  # Initialize an empty DataFrame

for response in responses:
  # Clean the data by stripping the backticks
  json_data = response.strip("`")

  # Load the cleaned data and convert to DataFrame
  data = json.loads(json_data)
  df_temp = pd.DataFrame(data)

  # Append the DataFrame to the final DataFrame
  df_total = df_total.append(df_temp, ignore_index=True)

print(df_total)  # Display the final DataFrame

                                        clean_reviews  pred_label
0                                     very convenient           1
1   excellent why didn think it is in the beginnin...           1
2                                             love it           1
3   bought this to go in my niece room you can tel...           1
4   the current demand for this stick was too high...           0
..                                                ...         ...
95  wanted white dot for my white bathroom the top...           0
96  stopped working after 2 weeks didn follow comm...           0
97  why do we need to buy 100 hub to get it to wor...           0
98  am very excited and happy with this it was bre...           1
99  in one word amazing best tech purchase have ev...           1

[100 rows x 2 columns]


  df_total = df_total.append(df_temp, ignore_index=True)
  df_total = df_total.append(df_temp, ignore_index=True)


In [35]:
# prompt: Overwrite pred_label from 'df' into pred_label in 'train_set_sample'

test_set_total['pred_label'] = df_total['pred_label'].values
test_set_total

Unnamed: 0,review,label,clean_reviews,pred_label
2786,Very convenient,1,very convenient,1
1913,Excellent why didn’t I think it is in the begi...,1,excellent why didn think it is in the beginnin...,1
2621,Love it,1,love it,1
672,Bought this to go in my niece's room. You can'...,1,bought this to go in my niece room you can tel...,1
2225,"The current demand for this stick, was too hig...",1,the current demand for this stick was too high...,0
...,...,...,...,...
653,I wanted a white dot for my white bathroom. T...,0,wanted white dot for my white bathroom the top...,0
857,"Stopped working after 2 weeks ,didn't follow c...",0,stopped working after 2 weeks didn follow comm...,0
2005,Why do we need to buy a $100 hub to get it to ...,0,why do we need to buy 100 hub to get it to wor...,0
1342,I am very excited and happy with this. It was ...,1,am very excited and happy with this it was bre...,1


In [36]:
# Plotting confusion matrix on the predictions

from sklearn.metrics import confusion_matrix, accuracy_score

y_true = test_set_total["label"]
y_pred = test_set_total["pred_label"]

print(confusion_matrix(y_true, y_pred))
print(f"\nAccuracy: {accuracy_score(y_true, y_pred)}")

[[46  0]
 [ 9 45]]

Accuracy: 0.91


### Batching API Calls: Gemini API

In [None]:
test_set.shape

In [None]:
test_set_total = test_set.sample(100)

test_set_total['pred_label'] = ''

test_set_total

In [None]:
batches = []
batch_size = 25

for i in range(0, len(test_set_total), batch_size):
  batches.append(test_set_total[i : i + batch_size])  # Append batches instead of assigning

In [None]:
import time

def gemini_completion_function(batch,current_batch,total_batch):
  """Function works in three steps:
  # Step-1: Convert the DataFrame to JSON using the to_json() method.
  # Step-2: Preparing the Gemini Prompt
  # Step-3: Calling Gemini API
  """

  print(f"Now processing batch#: {current_batch+1} of {total_batch}")

  json_data = batch[['clean_reviews','pred_label']].to_json(orient='records')

  prompt = f"""You are an expert linguist, who is good at classifying customer review sentiments into Positive/Negative labels.
  Help me classify customer reviews into: Positive(label=1), and Negative(label=0).
  Customer reviews are provided between three backticks below.
  In your output, only return the Json code back as output - which is provided between three backticks.
  Your task is to update predicted labels under 'pred_label' in the Json code.
  Don't make any changes to Json code format, please.
  Error handling instruction: In case a Customer Review violates API policy, please assign it default sentiment as Negative (label=0).

  ```
  {json_data}
  ```
  """

  print(prompt)
  response = model.generate_content(prompt)
  time.sleep(5)

  return response

In [None]:
batch_count = len(batches)
responses = []

for i in range(0,len(batches)):
  responses.append(gemini_completion_function(batches[i],i,batch_count))

In [None]:
import json

df_total = pd.DataFrame()  # Initialize an empty DataFrame

for response in responses:
  # Clean the data by stripping the backticks
  json_data = response.text.strip("`")

  # Load the cleaned data and convert to DataFrame
  data = json.loads(json_data)
  df_temp = pd.DataFrame(data)

  # Append the DataFrame to the final DataFrame
  df_total = df_total.append(df_temp, ignore_index=True)

print(df_total)  # Display the final DataFrame

In [None]:
# prompt: Overwrite pred_label from 'df' into pred_label in 'train_set_sample'

test_set_total['pred_label'] = df_total['pred_label'].values
test_set_total

In [None]:
# Plotting confusion matrix on the predictions

from sklearn.metrics import confusion_matrix

y_true = test_set_total["label"]
y_pred = test_set_total["pred_label"]

confusion_matrix(y_true, y_pred)

## Batching API Calls: ChatGPT (Few Shot)

In [80]:
test_set.shape

(488, 3)

In [81]:
test_set_total = test_set.sample(100)

test_set_total['pred_label'] = ''

test_set_total

Unnamed: 0,review,label,clean_reviews,pred_label
1267,I switched to google. The amazon helper is mor...,0,switched to google the amazon helper is more f...,
631,Eh. It’s works on and off. Half the time it do...,0,eh it works on and off half the time it does n...,
1742,I have been waiting for the Echo Show to go on...,1,have been waiting for the echo show to go on s...,
1563,It's like having another kid in the house; I h...,0,it like having another kid in the house have t...,
1517,Love my echo show! Great sound and picture. Do...,1,love my echo show great sound and picture does...,
...,...,...,...,...
2560,Love the echo dot it’s amaxing!!!,1,love the echo dot it amaxing,
2013,I bought this for myself and i didn’t realize ...,0,bought this for myself and didn realize it had...,
2520,Best purchase this year.,1,best purchase this year,
1200,Meh,0,meh,


In [82]:
batches = []
batch_size = 50

for i in range(0, len(test_set_total), batch_size):
  batches.append(test_set_total[i : i + batch_size])  # Append batches instead of assigning

In [83]:
import time

def gpt_completion_function(batch,current_batch,total_batch,train_sample,model="gpt-3.5-turbo-1106"):
  """Function works in three steps:
  # Step-1: Convert the DataFrame to JSON using the to_json() method.
  # Step-2: Preparing the Gemini Prompt
  # Step-3: Calling GPT API
  """

  print(f"Now processing batch#: {current_batch+1} of {total_batch}")

  json_data = batch[['clean_reviews','pred_label']].to_json(orient='records')

  sample_json_data = train_sample[['clean_reviews','label']].to_json(orient='records')

  prompt = f"""You are an expert linguist, who is good at classifying customer review sentiments into Positive/Negative labels.
  Help me classify customer reviews into: Positive(label=1), and Negative(label=0).
  Customer reviews are provided between three backticks below.
  In your output, only return the Json code back as output - which is provided between three backticks.
  Your task is to update predicted labels under 'pred_label' in the Json code.
  Don't make any changes to Json code format, please.
  Error handling instruction: In case a Customer Review violates API policy, please assign it default sentiment as Negative (label=0).
  Examples of good Sentiment Analysis Classification are provided between separator ####.
  These examples are for your reference, not to be included in your final output.

  ```
  {json_data}
  ```
  ####
  {sample_json_data}
  ####
  """

  print(prompt)

  messages = [{"role": "user", "content": prompt}]
  response = openai.ChatCompletion.create(model=model,messages=messages,temperature=0)
  time.sleep(5)
  return response.choices[0].message["content"]

In [84]:
train_sample = train_set.sample(4)

batch_count = len(batches)
responses = []

for i in range(0,len(batches)):
  responses.append(gpt_completion_function(batches[i],i,batch_count,train_sample))

Now processing batch#: 1 of 2
You are an expert linguist, who is good at classifying customer review sentiments into Positive/Negative labels.
  Help me classify customer reviews into: Positive(label=1), and Negative(label=0).
  Customer reviews are provided between three backticks below.
  In your output, only return the Json code back as output - which is provided between three backticks.
  Your task is to update predicted labels under 'pred_label' in the Json code.
  Don't make any changes to Json code format, please.
  Error handling instruction: In case a Customer Review violates API policy, please assign it default sentiment as Negative (label=0).
  Examples of good Sentiment Analysis Classification are provided between separator ####.
  These examples are for your reference, not to be included in your final output.

  ```
  [{"clean_reviews":"switched to google the amazon helper is more for shopping and google is the tasks assistant","pred_label":""},{"clean_reviews":"eh it work

In [85]:
import json

df_total = pd.DataFrame()  # Initialize an empty DataFrame

for response in responses:
  # Clean the data by stripping the backticks
  json_data = response.strip("`")

  # Load the cleaned data and convert to DataFrame
  data = json.loads(json_data)
  df_temp = pd.DataFrame(data)

  # Append the DataFrame to the final DataFrame
  df_total = df_total.append(df_temp, ignore_index=True)

print(df_total)  # Display the final DataFrame

                                        clean_reviews  pred_label
0   switched to google the amazon helper is more f...           0
1   eh it works on and off half the time it does n...           0
2   have been waiting for the echo show to go on s...           1
3   it like having another kid in the house have t...           0
4   love my echo show great sound and picture does...           1
..                                                ...         ...
95                       love the echo dot it amaxing           1
96  bought this for myself and didn realize it had...           0
97                            best purchase this year           1
98                                                meh           0
99  so far so good much better sound than my echo dot           1

[100 rows x 2 columns]


  df_total = df_total.append(df_temp, ignore_index=True)
  df_total = df_total.append(df_temp, ignore_index=True)


In [86]:
# prompt: Overwrite pred_label from 'df' into pred_label in 'train_set_sample'

test_set_total['pred_label'] = df_total['pred_label'].values
test_set_total

Unnamed: 0,review,label,clean_reviews,pred_label
1267,I switched to google. The amazon helper is mor...,0,switched to google the amazon helper is more f...,0
631,Eh. It’s works on and off. Half the time it do...,0,eh it works on and off half the time it does n...,0
1742,I have been waiting for the Echo Show to go on...,1,have been waiting for the echo show to go on s...,1
1563,It's like having another kid in the house; I h...,0,it like having another kid in the house have t...,0
1517,Love my echo show! Great sound and picture. Do...,1,love my echo show great sound and picture does...,1
...,...,...,...,...
2560,Love the echo dot it’s amaxing!!!,1,love the echo dot it amaxing,1
2013,I bought this for myself and i didn’t realize ...,0,bought this for myself and didn realize it had...,0
2520,Best purchase this year.,1,best purchase this year,1
1200,Meh,0,meh,0


In [87]:
# Plotting confusion matrix on the predictions

from sklearn.metrics import confusion_matrix, accuracy_score

y_true = test_set_total["label"]
y_pred = test_set_total["pred_label"]

print(confusion_matrix(y_true, y_pred))
print(f"\nAccuracy: {accuracy_score(y_true, y_pred)}")

[[49  0]
 [ 9 42]]

Accuracy: 0.91
