In [8]:
import pandas as pd
import json
from sklearn.model_selection import train_test_split
import os
from openai import AzureOpenAI

from helper import new_azure_credentials, paths

In [9]:
# Build the Azure OpenAI client from the settings exposed by the local `helper` module.
_credential_fields = ('api_key', 'api_version', 'azure_endpoint')
client = AzureOpenAI(**{field: new_azure_credentials[field] for field in _credential_fields})

# Base folders for input/output data, also taken from `helper`.
# (The deployment name -- the custom name chosen when the model was deployed --
# is assigned in a later cell.)
datasets_path, json_datasets_path = paths['datasets_path'], paths['json_datasets_path']



In [4]:
def get_reduced_intensity(sentence, deployment_name):
    """Ask a chat deployment to rephrase ``sentence`` with reduced hateful intensity.

    Sends a fixed system prompt plus the sentence (wrapped in single quotes) to
    ``client.chat.completions.create`` and prints the reply before returning it.

    Args:
        sentence: Text to rephrase; it is interpolated into the user message.
        deployment_name: Value passed as ``model`` to the chat completions call.

    Returns:
        The content of the first choice's message, or the float sentinel ``11.0``
        if anything in the request/response handling raises.
    """
    import logging  # local import so the cell does not depend on a new top-level import

    try:
        response = client.chat.completions.create(
            temperature=0.5,
            model=deployment_name,
            messages=[
                {"role": "system", "content": "You are a linguist who specializes in rephrasing hate speech to convey the core message while reducing or eliminating its hateful intensity."},
                {"role": "user", "content": f"'{sentence}'"}
            ],
            max_tokens=1000,  # upper bound on the length of the generated reply
        )
        print(response.choices[0].message.content)

        return response.choices[0].message.content

    # Original author's note: "gpt restrictions - cannot process hate speech".
    # Every exception type is caught here, so configuration or network failures
    # end up in this branch too -- log the cause instead of hiding it, then fall
    # back to the same sentinel value callers already receive.
    except Exception as e:
        logging.getLogger(__name__).warning(
            "get_reduced_intensity failed for %r (%s: %s); returning 11.0",
            sentence, type(e).__name__, e,
        )
        return 11.0

In [5]:
# Deployment used for every rephrasing request in this notebook
deployment_name = "gpt-35-short-system-sentence"

# Load the sentences to be rephrased
source_csv_path = datasets_path + 'rephrasal_of_sentences.csv'
df = pd.read_csv(source_csv_path)

In [6]:
# Rephrase every entry of the "Sentence" column (one API request per row)
def _rephrase(sentence):
    """Call get_reduced_intensity with the notebook-wide deployment name."""
    return get_reduced_intensity(sentence, deployment_name)

df['reduced_hateful_sentence'] = df['Sentence'].apply(_rephrase)



'Islam looks like a cult more than like a religion. Some of its followers who misinterpret it commit crimes. They should not be accepted in our country.'
'We must send some people back to their native countries. They are too different from us.'
'Europe is civilised, Some Muslims who misinterpret Islam should not stay there.'
'If they love their interpretation of Sharia law so much, why do not they go off and live in a Muslim country?'
'Islam is invading us and trying to take over countries. It does not follow the scriptures that say 'you shall not envy anyone anything' Stop islam!'
'Finally a person who represents some westerners who hate Islam!'
'We are every day in constant danger, because some Muslims can attack us. For this fact we should put some Muslims in a hole in the ground.'
'Bad muslims who misinterpret Islam have nothing to do with western culture and as soon as the west will understand, these people will be defeated.'
'We are not allowed to celebrate Easter as a Christian 

In [7]:
# Save the dataframe to a new CSV file
output_csv_path = json_datasets_path + 'sentences/hate_sentence_intensity_reduction/reduced_hateful_sentences.csv'

# Make sure the nested target folder exists; otherwise to_csv raises and the
# dataframe built above would have to be regenerated.
output_dir = os.path.dirname(output_csv_path)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

df.to_csv(output_csv_path, index=False)

print(f"Processed file saved as {output_csv_path}")

Processed file saved as ../../datasets/json_datasets/sentences/hate_sentence_intensity_reduction/reduced_hateful_sentences.csv


### Evaluation


In [16]:
import csv
from nltk.translate.bleu_score import sentence_bleu

def calculate_bleu(reference, candidate):
    """Return the sentence-level BLEU score of ``candidate`` against ``reference``.

    Both arguments are single sentences given as strings; each is tokenised on
    whitespace and the reference is passed as the only entry in the reference list.
    """
    return sentence_bleu([reference.split()], candidate.split())

def calculate_bleu_scores(csv_file, row1, row2):
    """Compute BLEU for every row of a CSV file, dropping scores equal to zero.

    Args:
        csv_file: Path of a UTF-8 CSV file with a header row.
        row1: Name of the column holding the reference sentence.
        row2: Name of the column holding the candidate sentence.

    Returns:
        List of the non-zero BLEU scores, in file order.
    """
    with open(csv_file, 'r', newline='', encoding='utf-8') as file:
        every_score = [
            calculate_bleu(record[row1], record[row2])
            for record in csv.DictReader(file)
        ]
    # Rows that scored exactly zero are left out of the result.
    return [score for score in every_score if score != 0.000000]

def calculate_average(scores):
    """Return the arithmetic mean of ``scores``, or 0 when ``scores`` is empty."""
    return sum(scores) / len(scores) if scores else 0

csv_file = json_datasets_path + 'sentences/hate_sentence_intensity_reduction/reduced_hateful_sentences.csv'


def _score_and_report(reference_column, candidate_column):
    """Score one column pair from csv_file, print its average, return (scores, average)."""
    pair_scores = calculate_bleu_scores(csv_file, reference_column, candidate_column)
    pair_average = calculate_average(pair_scores)
    print(f"Average BLEU Score (excluding zeros): {100*pair_average}")
    return pair_scores, pair_average


# 1) original sentence vs. normalized sentence
scores1, average_score1 = _score_and_report('Sentence', 'Normalized_Sentence')

# 2) original sentence vs. model rephrasing
scores2, average_score2 = _score_and_report('Sentence', 'reduced_hateful_sentence')

# 3) normalized sentence vs. model rephrasing
scores3, average_score3 = _score_and_report('Normalized_Sentence', 'reduced_hateful_sentence')


Average BLEU Score (excluding zeros): 50.99139934172361
Average BLEU Score (excluding zeros): 51.2240333647002
Average BLEU Score (excluding zeros): 55.317460126275705


In [13]:
import pandas as pd

def average_columns_excluding_zeros_and_nulls(csv_file, col1, col2):
    """Average two CSV columns, ignoring zero and null entries in each.

    Each column is filtered independently, so the two averages may be based on
    different rows.

    Args:
        csv_file: Path of the CSV file to read.
        col1: Name of the first column to average.
        col2: Name of the second column to average.

    Returns:
        Tuple ``(average_col1, average_col2)``.

    Raises:
        KeyError: If either column is absent; the message lists the missing and
            the available column names.
    """
    # Read the CSV file into a DataFrame
    df = pd.read_csv(csv_file)

    # Fail early with an actionable message instead of a bare KeyError from pandas.
    missing = [col for col in (col1, col2) if col not in df.columns]
    if missing:
        raise KeyError(
            f"Column(s) {missing} not found in {csv_file}; "
            f"available columns: {list(df.columns)}"
        )

    def _mean_without_zeros_and_nulls(column):
        # Keep only the entries that are neither 0 nor null, then average them.
        values = df[column]
        return values[(values != 0) & (values.notnull())].mean()

    return _mean_without_zeros_and_nulls(col1), _mean_without_zeros_and_nulls(col2)

# CSV to analyse -- replace with your own CSV file path if needed
csv_file = json_datasets_path + 'sentences/hate_sentence_intensity_reduction/reduced_hateful_sentences.csv'

# Columns to average: normalized intensity and generated new intensity.
# NOTE(review): the recorded output of this cell is a KeyError for
# 'Generated_New_Intensity' -- confirm the CSV actually contains that column.
col1, col2 = 'Normalized_Intensity', 'Generated_New_Intensity'

average_col1, average_col2 = average_columns_excluding_zeros_and_nulls(csv_file, col1, col2)

for column_name, column_average in ((col1, average_col1), (col2, average_col2)):
    print(f"Average of {column_name} excluding 0 and null values: {column_average}")


KeyError: 'Generated_New_Intensity'