In [1]:
from rich import print
import json
from tqdm import tqdm
from dotenv import load_dotenv, find_dotenv
import os
from openai import OpenAI
import cohere
import pandas as pd
from tqdm import tqdm
import time

In [2]:
_ = load_dotenv(find_dotenv())
api_key = os.environ['OPENAI_API_KEY']

# base_url = os.environ['OPENAI_BASE_URL']
co = cohere.Client()

In [3]:
pwd

'/notebooks/ABSA-for-Open-Ended-Qs-in-Education-Surveys/notebooks/suggestions_notebooks'

In [4]:
example1 = "المحتوى أكثر من رائع"

In [5]:
system_message = """
You are a helpful assistant designed for data annotation.
"""

In [6]:
user_template = """
I am a data scientist working on a classification problem. The dataset includes responses to open-ended questions from a university survey about education.

Your task is to assign a label to the example I provide.

The possible labels are:

    1. "needs to be added": when the example suggests adding something that is currently missing.
    2. "needs enhancements": when the example mentions something that exists but in low quality, suggesting improvements.
    3. "needs to be removed": when the example refers to something existing but suggests its removal.
    4. "none": otherwise.

Here is the example:

{}

Please provide only the label as the output.
output:
"""

In [7]:
def format_user_message(template, text):
    return template.format(text)


In [8]:
def format_one_example(review):
    return {
        "messages": [
            {
                "role": "system",
                "content": system_message
            },
            {
                "role": "user",
                "content": format_user_message(user_template, review)
            }
        ]
    }

In [9]:
print(format_one_example(example1)['messages'][1]['content'])

In [10]:
# model_name = 'meta-llama/Meta-Llama-3-70B-Instruct'
# model_name = 'meta-llama/Meta-Llama-3-70B-Instruct'
# model_name = 'gpt-4o'
model_name = 'command-r-plus'
def get_completion(review, model_name):
    completion = co.chat(
        model=model_name, preamble=format_one_example(review)['messages'][0]['content'], 
        temperature=0.0, message=format_one_example(review)['messages'][1]['content'],
        chat_history = [{'role': 'SYSTEM', 'message': system_message}]
    )
    
    return completion

In [11]:
def predict_gpt(item, model_name=model_name, interval=1):
    label = get_completion(item, model_name)
    # result = json.loads(completion)
    # pred_labels = [result.get(token, 'O') for token in item['tokens']]
    # pred_labels = [v for k, v in result.items()]
    # organizations = result['Organizations']
    time.sleep(interval)
    return label.text

In [12]:
example1 = "the doctor is very helpful"

In [13]:
print(get_completion(example1, model_name))

In [14]:
train_data_path = '../../data/result/suggestions_train.csv'
test_data_path = '../../data/result/suggestions_test.csv'
train_data = pd.read_csv(train_data_path)
test_data = pd.read_csv(test_data_path)

In [15]:
test_data.head()

Unnamed: 0,column,text,topic_id,topic_name
0,add_suggestions,اقتراحاتي للإضافة: everything is spectacular a...,2,
1,add_suggestions,اقتراحاتي للإضافة: thank you for everything,2,
2,add_suggestions,اقتراحاتي للإضافة: أتمنى ان يتم تطبيقها بشكل فعال,3,موازنة الجزء العملي مع الجزء النظري
3,add_suggestions,اقتراحاتي للإضافة: ان يتم طليق العملي بشكل مكثف,3,موازنة الجزء العملي مع الجزء النظري
4,add_suggestions,اقتراحاتي للإضافة: change the contents all,0,محتوى ومعلومات المقرر


In [17]:
suggestions_labels = []

In [18]:
tqdm.pandas()
interval = 6.1
print(len(suggestions_labels))
for t in tqdm(test_data['text'][len(suggestions_labels): ]):
    suggestions_labels.append(predict_gpt(t, model_name=model_name, interval=interval))

100%|██████████| 219/219 [23:25<00:00,  6.42s/it]


In [20]:
test_data['suggestions_label'] = suggestions_labels

In [21]:
test_data.head()

Unnamed: 0,column,text,topic_id,topic_name,suggestions_label
0,add_suggestions,اقتراحاتي للإضافة: everything is spectacular a...,2,,none
1,add_suggestions,اقتراحاتي للإضافة: thank you for everything,2,,none
2,add_suggestions,اقتراحاتي للإضافة: أتمنى ان يتم تطبيقها بشكل فعال,3,موازنة الجزء العملي مع الجزء النظري,needs to be added
3,add_suggestions,اقتراحاتي للإضافة: ان يتم طليق العملي بشكل مكثف,3,موازنة الجزء العملي مع الجزء النظري,needs to be added
4,add_suggestions,اقتراحاتي للإضافة: change the contents all,0,محتوى ومعلومات المقرر,needs to be added


In [28]:
# for e in tqdm(train_data['text'].sample(50)):
#     # print(e)
#     print('suggestion:\n', predict_gpt(e))
#     print('----'*25)


In [22]:
test_data.head()

Unnamed: 0,column,text,topic_id,topic_name,suggestions_label
0,add_suggestions,اقتراحاتي للإضافة: everything is spectacular a...,2,,none
1,add_suggestions,اقتراحاتي للإضافة: thank you for everything,2,,none
2,add_suggestions,اقتراحاتي للإضافة: أتمنى ان يتم تطبيقها بشكل فعال,3,موازنة الجزء العملي مع الجزء النظري,needs to be added
3,add_suggestions,اقتراحاتي للإضافة: ان يتم طليق العملي بشكل مكثف,3,موازنة الجزء العملي مع الجزء النظري,needs to be added
4,add_suggestions,اقتراحاتي للإضافة: change the contents all,0,محتوى ومعلومات المقرر,needs to be added


In [23]:
test_data['suggestions_label'].unique()

array(['none', 'needs to be added', 'needs to be removed',
       'needs enhancements'], dtype=object)

In [25]:
test_data['suggestions_label'].value_counts() / len(train_data)

needs to be added      0.332308
needs enhancements     0.200000
none                   0.104615
needs to be removed    0.036923
Name: suggestions_label, dtype: float64

In [27]:
# train_save_path = '../../data/result/train_with_suggestions_df_llm_command_r_plus.csv'
test_save_path = '../../data/result/test_with_suggestions_df_llm_command_r_plus.csv'

if os.path.exists(test_save_path):
    print('The path exists!')
else:
    print(f'Saving to {test_save_path}...')
    test_data.to_csv(test_save_path, index=False)