In [4]:
# !ollama pull llama3
# !ollama pull llama3:70b

import ollama
import pandas as pd
from tqdm import tqdm
import ast

from sklearn.metrics import classification_report, precision_score, f1_score

import sys
sys.path.append('src/')
from models.classification_methods import get_classification_report

# --- Run configuration --------------------------------------------------------

# Seed value for the run (not referenced by any other visible cell).
random_seed = 42

# Project-relative folders. Every path ends with '/', so a file name can be
# appended directly.
reports_path = 'reports/'
raw_data_path = 'data/raw/'
processed_data_path = 'data/processed/'
results_cr_path = reports_path + 'classification_reports/'
test_results_path = reports_path + 'test_results/'

# Short ids of the targets; each one selects a `r3_<id>_test_users.csv` file.
target_list = ['ig', 'bo', 'cl', 'co', 'gl', 'lu']

# Model tag handed to Ollama, and the prompt template file to load.
estimator_name = 'llama3'
#estimator_name = 'llama3:70b'
prompt_name = 'prompt2'



# Load the prompt template selected by `prompt_name`; it is later filled in
# with `.format(target=..., text=...)`.
# The encoding is pinned so the template decodes the same way on every
# platform (without it, open() falls back to the locale's preferred encoding).
with open(f'src/models/config/prompts/{prompt_name}.txt', 'r', encoding='utf-8') as file:
    prompt_template = file.read()

# Read one test-user CSV per target, tag every row with the target id it was
# loaded for, and stack the frames into a single table.
data_list = [
    pd.read_csv(
        f'{raw_data_path}r3_{target}_test_users.csv',
        sep=';',
        encoding='utf-8-sig',
    ).assign(target=target)
    for target in target_list
]

# Row labels of the individual files are kept as-is (no re-indexing).
data_users = pd.concat(data_list)

# Target id -> name of the target that is substituted into the prompt.
dict_cp = dict(
    cl='Hidroxicloroquina',
    lu='Lula',
    co='Sinovac',
    ig='Church',
    gl='Globo TV',
    bo='Bolsonaro',
)

def get_response_from_llm(prompt):
    """Send `prompt` to the Ollama model named by the global `estimator_name`.

    Returns the object produced by `ollama.generate`, unmodified.
    """
    return ollama.generate(prompt=prompt, model=estimator_name)


def format_response(
    response,
    threshold = 0.5
    ):
    """Turn a raw LLM reply into a binary prediction.

    Parameters
    ----------
    response : mapping
        Object returned by the LLM call; its ``'response'`` entry holds the
        generated text, which is expected to be a bare number.
    threshold : float, default 0.5
        Scores strictly below this value map to class 0, all others to class 1.

    Returns
    -------
    tuple
        ``(message, y_pred)`` where ``message`` is the untouched reply text and
        ``y_pred`` is 0, 1, or ``None`` when the reply is not a plain numeric
        literal.
    """
    message = response['response']

    try:
        # literal_eval only accepts Python literals, so model-generated text is
        # parsed but never executed (unlike eval).
        score = ast.literal_eval(message.strip())

        # A literal that is not a real number (dict, string, bool, complex, ...)
        # is not a usable score.
        if isinstance(score, bool) or not isinstance(score, (int, float)):
            return message, None

        y_pred = 0 if score < threshold else 1

    except (ValueError, SyntaxError, TypeError, AttributeError, MemoryError, RecursionError):
        # Unparseable reply (or a threshold that cannot be compared): no prediction.
        y_pred = None

    return message, y_pred

# def format_response(response):
    
#     message = response["message"]["content"]
    
#     print(message)
    
#     try:
#         # string dict to dict
#         response = ast.literal_eval(message)
#         y_pred = response['classification'].casefold()
        
#         if y_pred == "against":
#             y_pred = 0
#         elif y_pred == "for":
#             y_pred = 1
#         else: y_pred = None
            
#     except Exception as e:
#         y_pred = None 
        
#     return message, y_pred


dict_responses = {}

list_results = []

list_df_responses = []

# Column that holds the text to classify; also part of the output file name.
text_col = 'Stance'

# Column layout of every per-target result table. "justification" is never
# filled by this loop, so it stays empty.
response_columns = [
    "idx",
    "text",
    "target",
    "y_test",
    "y_pred",
    "justification",
    "complete_response",
]

for target in target_list:

    data = data_users[data_users['target'] == target]

    # Name of the target as written into the prompt. It lives in its own
    # variable so the loop variable `target` (the short id) is never
    # overwritten, and an unknown id fails here instead of prompting with "None".
    target_name = dict_cp[target]

    # Not used by the visible cells; kept for any later cell that reads it.
    cr_path = f"{reports_path}classification_reports/{estimator_name}_{target}_{text_col}_classification_report.csv"

    # Rows are collected in a list and turned into a DataFrame once, instead of
    # growing the frame one row at a time.
    records = []

    for idx, row in tqdm(data.iterrows(), total = len(data), desc = target):

        text = row[text_col]

        # Gold label: 'for' -> 1, anything else -> 0.
        polarity = 1 if row["Polarity"] == 'for' else 0

        prompt_formated = prompt_template.format(
            target = target_name,
            text = text)

        response_full = get_response_from_llm(prompt_formated)

        message, y_pred = format_response(response_full)

        records.append({
            "idx": idx,
            "text": text,
            "target": target_name,
            "y_test": polarity,
            "y_pred": y_pred,
            "complete_response": message,
        })

    df_responses = pd.DataFrame(records, columns = response_columns)

    list_df_responses.append(df_responses)

df_results_final = pd.concat(list_df_responses)

df_results_final.to_csv(f'{reports_path}test_results/{estimator_name}_{prompt_name}_{text_col}_classification_report.csv')

ig:   2%|▏         | 11/599 [00:05<04:33,  2.15it/s]


KeyboardInterrupt: 

In [None]:
# Parse the raw model replies into numeric scores. pd.to_numeric is used instead
# of eval so that model-generated text is never executed; a reply that is not a
# plain number becomes NaN instead of raising.
df_results_final['score'] = pd.to_numeric(
    df_results_final['complete_response'].astype(str).str.strip(),
    errors='coerce',
)
df_results_final.head()

In [None]:
import matplotlib.pyplot as plt

In [None]:
import seaborn as sns

# Per-target evaluation: text report for the stored predictions, followed by the
# score density split by true label.
for target in df_results_final['target'].unique():

    target_mask = df_results_final['target'] == target
    df = df_results_final[target_mask]

    print(target)
    print(classification_report(df['y_test'], df['y_pred']))

    sns.kdeplot(x="score", hue="y_test", data=df)
    plt.show()

In [None]:
# Displays `df` as last bound by an earlier cell; the per-target loop above
# leaves the final target's subset in it.
# NOTE(review): the content depends on cell execution order — confirm this is intended.
df

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score



# NOTE(review): `df` is not defined in this cell; it is whatever an earlier cell
# last bound to that name — confirm which subset this sweep should run on.

# Range of thresholds to test
thresholds = np.linspace(0, 1, num=100)

# Metric functions, keyed by the legend label they are plotted under
metric_by_label = {
    'Acurácia': accuracy_score,
    'F1-Score': f1_score,
    'Precisão': precision_score,
    'Recall': recall_score,
}

# One list of metric values per label, filled in threshold order
curve_by_label = {label: [] for label in metric_by_label}

# Evaluate each threshold: scores at or above it are predicted as class 1
for threshold in thresholds:
    predicted_classes = (df['score'] >= threshold).astype(int)

    for label, metric in metric_by_label.items():
        curve_by_label[label].append(metric(df['y_test'], predicted_classes))

# Named lists holding each metric's values
accuracy_scores = curve_by_label['Acurácia']
f1_scores = curve_by_label['F1-Score']
precision_scores = curve_by_label['Precisão']
recall_scores = curve_by_label['Recall']

# Plot every metric against the threshold
plt.figure(figsize=(10, 6))
for label, curve in curve_by_label.items():
    plt.plot(thresholds, curve, label=label)
plt.xlabel('Threshold')
plt.ylabel('Métrica')
plt.title('Métricas por Threshold')
plt.legend()
plt.show()

# Best threshold for each metric (first maximum wins on ties)
best_threshold_accuracy = thresholds[np.argmax(accuracy_scores)]
best_threshold_f1 = thresholds[np.argmax(f1_scores)]
best_threshold_precision = thresholds[np.argmax(precision_scores)]
best_threshold_recall = thresholds[np.argmax(recall_scores)]

print("Melhor threshold para acurácia:", best_threshold_accuracy)
print("Melhor threshold para F1-Score:", best_threshold_f1)
print("Melhor threshold para precisão:", best_threshold_precision)
print("Melhor threshold para recall:", best_threshold_recall)


In [None]:
# Re-threshold the scores: 1 when the score is strictly above 0.7, otherwise 0.
df_results_final['y_pred_new'] = df_results_final['score'].map(
    lambda score: int(score > 0.7)
)

In [None]:
# Per-target classification report for the re-thresholded predictions.
for target in df_results_final['target'].unique():

    target_mask = df_results_final['target'] == target
    df = df_results_final[target_mask]

    print(target)
    print(classification_report(df['y_test'], df['y_pred_new']))

