### In this notebook we use LLama 3 70B and gpt4o for annotation

In [1]:
from rich import print
import json
from tqdm import tqdm
from dotenv import load_dotenv, find_dotenv
import os
from openai import OpenAI
import pandas as pd
from tqdm import tqdm

In [2]:
_ = load_dotenv(find_dotenv())
api_key = os.environ['OPENAI_API_KEY']
# base_url = 'https://api.endpoints.anyscale.com/v1'
# base_url = 'https://api.aimlapi.com'
base_url = 'https://integrate.api.nvidia.com/v1'
# base_url = os.environ['OPENAI_BASE_URL']
client = OpenAI(base_url=base_url)

In [3]:
pwd

'/notebooks/ABSA-for-Open-Ended-Qs-in-Education-Surveys/notebooks/suggestions_notebooks'

In [4]:
example1 = "المحتوى أكثر من رائع"

In [5]:
system_message = """
You are a helpful assistant designed for data annotation.
"""

In [6]:
user_template = """
I am a data scientist working on a classification problem. The dataset includes responses to open-ended questions from a university survey about education.

Your task is to assign a label to the example I provide.

The possible labels are:

    1. "needs to be added": when the example suggests adding something that is currently missing.
    2. "needs enhancements": when the example mentions something that exists but in low quality, suggesting improvements.
    3. "needs to be removed": when the example refers to something existing but suggests its removal.
    4. "none": otherwise.

Here is the example:

{}

Please provide only the label as the output.
output:
"""

In [7]:
def format_user_message(template, text):
    return template.format(text)


In [8]:
def format_one_example(review):
    return {
        "messages": [
            {
                "role": "system",
                "content": system_message
            },
            {
                "role": "user",
                "content": format_user_message(user_template, review)
            }
        ]
    }

In [9]:
print(format_one_example(example1)['messages'][1]['content'])

In [10]:
# model_name = 'meta-llama/Meta-Llama-3-70B-Instruct'
# model_name = 'meta-llama/Meta-Llama-3-70B-Instruct'
# model_name = 'meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo'
# model_name = 'llama3-70b'
# model_name = 'gpt-4o'
model_name = 'meta/llama3-70b-instruct'
def get_completion(review, model_name):
    completion = client.chat.completions.create(
        model=model_name,
        temperature=0,
        messages=format_one_example(review)['messages']
    )
    
    return completion.choices[0].message.content

In [11]:
def predict_gpt(item, model_name=model_name):
    label = get_completion(item, model_name)
    # result = json.loads(completion)
    # pred_labels = [result.get(token, 'O') for token in item['tokens']]
    # pred_labels = [v for k, v in result.items()]
    # organizations = result['Organizations']
    return label

In [12]:
example1 = "المحتوى أكثر من رائع"

In [13]:
predict_gpt(example1)

'none'

In [14]:
train_data_path = '../../data/result/suggestions_train.csv'
test_data_path = '../../data/result/suggestions_test.csv'
train_data = pd.read_csv(train_data_path)
test_data = pd.read_csv(test_data_path)

In [15]:
test_data.head()

Unnamed: 0,column,text,topic_id,topic_name
0,add_suggestions,اقتراحاتي للإضافة: everything is spectacular a...,2,
1,add_suggestions,اقتراحاتي للإضافة: thank you for everything,2,
2,add_suggestions,اقتراحاتي للإضافة: أتمنى ان يتم تطبيقها بشكل فعال,3,موازنة الجزء العملي مع الجزء النظري
3,add_suggestions,اقتراحاتي للإضافة: ان يتم طليق العملي بشكل مكثف,3,موازنة الجزء العملي مع الجزء النظري
4,add_suggestions,اقتراحاتي للإضافة: change the contents all,0,محتوى ومعلومات المقرر


In [16]:
tqdm.pandas()

# print('Annotate train data...')
# train_data['suggestion_label'] = train_data['text'].progress_apply(predict_gpt)
print('Annotate test data...')
test_data['suggestion_label'] = test_data['text'].progress_apply(predict_gpt)

100%|██████████| 219/219 [00:35<00:00,  6.14it/s]


In [17]:
# for e in train_data['text'].sample(50):
#     # print(e)
#     print('topic:\n', predict_gpt(e))
#     print('----'*25)
    

In [18]:
# train_data['sentiment'] = train_data['sentiment'].progress_apply(str.capitalize)
# test_data['sentiment'] = test_data['sentiment'].progress_apply(str.capitalize)

In [19]:
test_data.head()

Unnamed: 0,column,text,topic_id,topic_name,suggestion_label
0,add_suggestions,اقتراحاتي للإضافة: everything is spectacular a...,2,,1
1,add_suggestions,اقتراحاتي للإضافة: thank you for everything,2,,1
2,add_suggestions,اقتراحاتي للإضافة: أتمنى ان يتم تطبيقها بشكل فعال,3,موازنة الجزء العملي مع الجزء النظري,1
3,add_suggestions,اقتراحاتي للإضافة: ان يتم طليق العملي بشكل مكثف,3,موازنة الجزء العملي مع الجزء النظري,1
4,add_suggestions,اقتراحاتي للإضافة: change the contents all,0,محتوى ومعلومات المقرر,1


In [20]:
len(test_data['suggestion_label'].value_counts().index)

5

In [21]:
test_data['suggestion_label'].value_counts()

1       137
2        71
4         4
none      4
3         3
Name: suggestion_label, dtype: int64

In [22]:
labels_dict ={"1": "needs to be added", "2": "needs enhancements", "3": "needs to be removed" , "4": "none", "none": "none"}

In [23]:
test_data['suggestion_label'] = test_data['suggestion_label'].apply(lambda x: labels_dict[x])

In [24]:
test_data['suggestion_label'].value_counts()

needs to be added      137
needs enhancements      71
none                     8
needs to be removed      3
Name: suggestion_label, dtype: int64

In [25]:
test_data['suggestion_label'].value_counts() / len(train_data)

needs to be added      0.421538
needs enhancements     0.218462
none                   0.024615
needs to be removed    0.009231
Name: suggestion_label, dtype: float64

In [26]:
# train_save_path = '../../data/result/train_with_suggestion_df_llm_llama3_70b.csv'
test_save_path = '../../data/result/test_with_suggestion_df_llm_llama3_70b.csv'

# if os.path.exists(train_save_path):
#     print('The path exists!')
# else:
#     print('Saving...')
#     train_data.to_csv(train_save_path, index=False)
    

if os.path.exists(test_save_path):
    print('The path exists!')
else:
    print('Saving...')
    test_data.to_csv(test_save_path, index=False)