In [4]:
from rich import print
import json
from tqdm import tqdm
from dotenv import load_dotenv, find_dotenv
import os
from openai import OpenAI
import pandas as pd
from tqdm import tqdm

In [5]:
_ = load_dotenv(find_dotenv())
api_key = os.environ['OPENAI_API_KEY']

base_url = 'https://integrate.api.nvidia.com/v1'
client = OpenAI()

In [6]:
pwd

'/notebooks/ABSA/notebooks/sentiment-notebooks'

In [7]:
example1 = "المحتوى أكثر من رائع"

In [8]:
system_message = """
You will receive a review about educational services, and your task is to classify its sentiment as positive, neutral, or negative.
The review may be in English language, in Arabic language ,or both.
The only allowed output options are [positive, neutral, negative]
"""

In [9]:
user_template = """
In context of sentiment analysis, you have to classify the sentiment of the following review into positive, neutral, or negative.

review:
###
{}
###

output: 
"""

In [10]:
def format_user_message(template, text):
    return template.format(text)


In [11]:
def format_one_example(review):
    return {
        "messages": [
            {
                "role": "system",
                "content": system_message
            },
            {
                "role": "user",
                "content": format_user_message(user_template, review)
            }
        ]
    }

In [12]:
print(format_one_example(example1))

In [14]:
model_name = 'gpt-4o'
# model_name = 'meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo'
def get_completion(review, model_name):
    completion = client.chat.completions.create(
        model=model_name,
        temperature=0,
        messages=format_one_example(review)['messages']
    )
    
    return completion.choices[0].message.content

In [15]:
def predict_gpt(item, model_name=model_name):
    label = get_completion(item, model_name)
    # result = json.loads(completion)
    # pred_labels = [result.get(token, 'O') for token in item['tokens']]
    # pred_labels = [v for k, v in result.items()]
    # organizations = result['Organizations']
    return label

In [16]:
example1 = "المحتوى أكثر من رائع"

In [18]:
# predict_gpt(example1) # positive

In [19]:
train_data_path = '../../data/result/train_df.csv'
test_data_path = '../../data/result/test_df.csv'
train_data = pd.read_csv(train_data_path)
test_data = pd.read_csv(test_data_path)

In [20]:
train_data.head()

Unnamed: 0,column,text,topic_id,topic_name
0,like,أكثر ما أعجبني: التطبيق العملي المحدث والمشابه...,3,موازنة الجزء العملي مع الجزء النظري
1,improve_course,اقتراحاتي للتحسين: التطبيق والبعد عن التدريس ا...,3,موازنة الجزء العملي مع الجزء النظري
2,like,أكثر ما أعجبني: the content,0,محتوى ومعلومات المقرر
3,improve_course,اقتراحاتي للتحسين: provide more new books,3,موازنة الجزء العملي مع الجزء النظري
4,improve_course,اقتراحاتي للتحسين: زيادة ساعات العملي,8,الوقت و الجدول


In [25]:
tqdm.pandas()

print('Annotate train data...')
train_data['sentiment'] = train_data['text'].progress_apply(predict_gpt)
print('Annotate test data...')
test_data['sentiment'] = test_data['text'].progress_apply(predict_gpt)

100%|██████████| 951/951 [06:09<00:00,  2.57it/s]


100%|██████████| 634/634 [04:17<00:00,  2.46it/s]


In [31]:
train_data['sentiment'] = train_data['sentiment'].progress_apply(str.capitalize)
test_data['sentiment'] = test_data['sentiment'].progress_apply(str.capitalize)

100%|██████████| 951/951 [00:00<00:00, 297293.22it/s]
100%|██████████| 634/634 [00:00<00:00, 464730.64it/s]


In [30]:
test_data['sentiment'].unique()

array(['[positive]', 'Negative', 'Positive', 'Neutral', '[neutral]'],
      dtype=object)

In [29]:
train_data['sentiment'].unique()

array(['Positive', 'Neutral', '[positive]', '[negative]', 'Negative',
       '[neutral]'], dtype=object)

In [32]:
labels_dict = {
    "[positive]": "Positive", "[negative]": "Negative", "[neutral]": "Neutral"
}

In [33]:
train_data['sentiment'] = train_data['sentiment'].progress_apply(lambda x: labels_dict.get(x, x))
test_data['sentiment'] = test_data['sentiment'].progress_apply(lambda x: labels_dict.get(x, x))

100%|██████████| 951/951 [00:00<00:00, 352397.13it/s]
100%|██████████| 634/634 [00:00<00:00, 417717.36it/s]


In [34]:
train_data['sentiment'].unique()

array(['Positive', 'Neutral', 'Negative'], dtype=object)

In [35]:
test_data['sentiment'].unique()

array(['Positive', 'Negative', 'Neutral'], dtype=object)

In [36]:
train_data.head()

Unnamed: 0,column,text,topic_id,topic_name,sentiment
0,like,أكثر ما أعجبني: التطبيق العملي المحدث والمشابه...,3,موازنة الجزء العملي مع الجزء النظري,Positive
1,improve_course,اقتراحاتي للتحسين: التطبيق والبعد عن التدريس ا...,3,موازنة الجزء العملي مع الجزء النظري,Neutral
2,like,أكثر ما أعجبني: the content,0,محتوى ومعلومات المقرر,Positive
3,improve_course,اقتراحاتي للتحسين: provide more new books,3,موازنة الجزء العملي مع الجزء النظري,Neutral
4,improve_course,اقتراحاتي للتحسين: زيادة ساعات العملي,8,الوقت و الجدول,Neutral


In [37]:
train_data['sentiment'].value_counts() / len(train_data)

Positive    0.488959
Neutral     0.275499
Negative    0.235542
Name: sentiment, dtype: float64

In [38]:
test_data['sentiment'].value_counts() / len(test_data)

Positive    0.514196
Neutral     0.264984
Negative    0.220820
Name: sentiment, dtype: float64

In [39]:
train_data.to_csv('../../data/result/train_with_sentiment_df_llm_gpt4o.csv', index=False)
test_data.to_csv('../../data/result/test_with_sentiment_df_llm_gpt4o.csv', index=False)