In [1]:
from rich import print
import json
from tqdm import tqdm
from dotenv import load_dotenv, find_dotenv
import os
from openai import OpenAI
import cohere
import pandas as pd
from tqdm import tqdm
import time

In [2]:
_ = load_dotenv(find_dotenv())
api_key = os.environ['OPENAI_API_KEY']

# base_url = os.environ['OPENAI_BASE_URL']
co = cohere.Client()

In [3]:
pwd

'/notebooks/ABSA/notebooks/sentiment-notebooks'

In [4]:
example1 = "المحتوى أكثر من رائع"

In [5]:
system_message = """
You will receive a review about educational services, and your task is to classify its sentiment as positive, neutral, or negative.
The review may be in English language, in Arabic language ,or both.
The only allowed output options are [positive, neutral, negative]
"""

In [6]:
user_template = """
In context of sentiment analysis, you have to classify the sentiment of the following review into positive, neutral, or negative.

review:
###
{}
###

output: 
"""

In [7]:
def format_user_message(template, text):
    return template.format(text)


In [8]:
def format_one_example(review):
    return {
        "messages": [
            {
                "role": "system",
                "content": system_message
            },
            {
                "role": "user",
                "content": format_user_message(user_template, review)
            }
        ]
    }

In [9]:
print(format_one_example(example1)['messages'][1]['content'])

In [10]:
# model_name = 'meta-llama/Meta-Llama-3-70B-Instruct'
# model_name = 'meta-llama/Meta-Llama-3-70B-Instruct'
# model_name = 'gpt-4o'
model_name = 'command-r-plus'
def get_completion(review, model_name):
    completion = co.chat(
        model=model_name, preamble=format_one_example(review)['messages'][0]['content'], 
        temperature=0.0, message=format_one_example(review)['messages'][1]['content'],
        chat_history = [{'role': 'SYSTEM', 'message': system_message}]
    )
    
    return completion

In [11]:
def predict_gpt(item, model_name=model_name, interval=1):
    label = get_completion(item, model_name)
    # result = json.loads(completion)
    # pred_labels = [result.get(token, 'O') for token in item['tokens']]
    # pred_labels = [v for k, v in result.items()]
    # organizations = result['Organizations']
    time.sleep(interval)
    return label.text

In [12]:
example1 = "the doctor is very helpful"

In [13]:
print(get_completion(example1, model_name))

In [14]:
train_data_path = '../../data/result/train_df.csv'
test_data_path = '../../data/result/test_df.csv'
train_data = pd.read_csv(train_data_path)
test_data = pd.read_csv(test_data_path)

In [15]:
train_data.head()

Unnamed: 0,column,text,topic_id,topic_name
0,like,أكثر ما أعجبني: التطبيق العملي المحدث والمشابه...,3,موازنة الجزء العملي مع الجزء النظري
1,improve_course,اقتراحاتي للتحسين: التطبيق والبعد عن التدريس ا...,3,موازنة الجزء العملي مع الجزء النظري
2,like,أكثر ما أعجبني: the content,0,محتوى ومعلومات المقرر
3,improve_course,اقتراحاتي للتحسين: provide more new books,3,موازنة الجزء العملي مع الجزء النظري
4,improve_course,اقتراحاتي للتحسين: زيادة ساعات العملي,8,الوقت و الجدول


In [16]:
sentiment_labels = []

In [17]:
tqdm.pandas()
interval = 6.1
print(len(sentiment_labels))
for t in tqdm(train_data['text'][len(sentiment_labels): ]):
    sentiment_labels.append(predict_gpt(t, model_name=model_name, interval=interval))

100%|██████████| 951/951 [1:42:37<00:00,  6.47s/it]


In [18]:
train_data['sentiment'] = sentiment_labels

In [19]:
train_data.head()

Unnamed: 0,column,text,topic_id,topic_name,sentiment
0,like,أكثر ما أعجبني: التطبيق العملي المحدث والمشابه...,3,موازنة الجزء العملي مع الجزء النظري,positive
1,improve_course,اقتراحاتي للتحسين: التطبيق والبعد عن التدريس ا...,3,موازنة الجزء العملي مع الجزء النظري,neutral
2,like,أكثر ما أعجبني: the content,0,محتوى ومعلومات المقرر,positive
3,improve_course,اقتراحاتي للتحسين: provide more new books,3,موازنة الجزء العملي مع الجزء النظري,neutral
4,improve_course,اقتراحاتي للتحسين: زيادة ساعات العملي,8,الوقت و الجدول,neutral


In [20]:
train_data['sentiment'].unique()

array(['positive', 'neutral', 'negative', 'Positive', 'Negative'],
      dtype=object)

In [22]:
train_data['sentiment'] = train_data['sentiment'].progress_apply(str.capitalize)

100%|██████████| 951/951 [00:00<00:00, 268785.92it/s]


In [24]:
train_data['sentiment'].unique()

array(['Positive', 'Neutral', 'Negative'], dtype=object)

In [25]:
train_data['sentiment'].value_counts() / len(train_data)

Positive    0.557308
Neutral     0.283912
Negative    0.158780
Name: sentiment, dtype: float64

In [27]:
train_save_path = '../../data/result/train_with_sentiment_df_llm_command_r_plus.csv'
# test_save_path = '../../data/result/test_with_sentiment_df_llm_command_r_plus.csv'

if os.path.exists(train_save_path):
    print('The path exists!')
else:
    print(f'Saving to {train_save_path}...')
    train_data.to_csv(train_save_path, index=False)