## Run the preprocessed data through a model that breaks toxicity down into more detailed categories: severe_toxicity, obscene, threat, insult, identity_attack, sexual_explicit <br>
## Check whether the results agree with the plain toxicity classifier's predictions, and carry out a more detailed analysis.

In [None]:
import pandas as pd
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch
from tqdm import tqdm 

In [2]:
# Fraction of the data that was sampled. It is formatted as a whole-number
# percentage into the input file name below (0.01 -> "1%"), so it needs to
# match the appropriate file name.
PERCENTAGE = 0.01

# Forward slashes are accepted on Windows as well as on Linux/macOS, whereas
# hard-coded backslashes only resolve on Windows.
file_path = f'../resources/toxicity_analysis_{PERCENTAGE:.0%}_data.tsv'

try:
    toxicity_analysis_df = pd.read_csv(file_path, sep="\t")
except Exception as e:
    print(f"Error loading file: {e}")
    # Every later cell needs this DataFrame. Stop here with the real cause
    # instead of failing on the next line with an unrelated NameError.
    raise

# Notebook display of the loaded DataFrame
toxicity_analysis_df

Unnamed: 0,subreddit_category,word_count,preprocessed,toxicity_label
0,Advice,228,"i 'm 23 , moved back in with parents . how do ...",0
1,Advice,143,always feel an overwhelming lack of confidence...,0
2,Advice,233,"dealing with this break up hey there , this mo...",0
3,Advice,398,disagreement when picking rooms in the new hou...,0
4,Advice,226,i do n't know what i believe i do n't typicall...,0
...,...,...,...,...
1144,travel,213,"traveling , a few questions , ( planning a tri...",0
1145,weddingplanning,280,"wedding blanket ? i 'm a plus size bride , wit...",0
1146,weddingplanning,336,"36 days to go , & i ca n't find my english bir...",0
1147,weddingplanning,301,making your own wedding invitations ... is it ...,0


In [None]:
# Load the model and tokenizer
model_name = "unitary/toxic-bert"
model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Define the toxicity labels.
# The names are read from the checkpoint's own label map, in output order, so
# each result column is guaranteed to carry the name of the logit it holds and
# the number of names always equals the number of model outputs.
# NOTE(review): the previous hand-written list (severe_toxicity, obscene,
# threat, insult, identity_attack, sexual_explicit) appears to belong to a
# different Detoxify checkpoint than "unitary/toxic-bert" - confirm against the
# model card, and update any downstream code that relies on the old column
# names.
toxicity_labels = [
    model.config.id2label[label_id]
    for label_id in range(model.config.num_labels)
]

# Function to predict toxicity for a batch of texts
def predict_toxicity_batch(texts):
    """Return per-label probabilities for every text in a batch.

    Uses the module-level ``tokenizer`` and ``model``.

    Args:
        texts: List of strings to score.

    Returns:
        A list with one entry per input text, in input order. Each entry is
        a list of floats: the sigmoid of the model's logits for that text.
        The result is always nested, also for a single text, so callers can
        safely ``extend`` a list of rows with it. An empty input yields an
        empty list.
    """
    # Nothing to score; skip the tokenizer and the forward pass.
    if not texts:
        return []

    # Tokenize the input texts
    inputs = tokenizer(texts, return_tensors="pt", truncation=True, padding=True)

    # Move the model to the GPU the first time one is available, rather than
    # re-issuing the transfer on every batch.
    if torch.cuda.is_available() and model.device.type != 'cuda':
        model.to('cuda')

    # Inputs must live on the same device as the model.
    device = model.device
    inputs = {key: val.to(device) for key, val in inputs.items()}

    # Make predictions
    with torch.no_grad():
        logits = model(**inputs).logits

    # Convert logits to probabilities and move them back to the CPU. The batch
    # dimension is deliberately kept: squeezing it would collapse a
    # single-text batch into one flat list of floats.
    return torch.sigmoid(logits).cpu().tolist()

# Number of texts handed to the model per call
batch_size = 32  # Adjust this based on GPU memory

# Start offset of every batch, and the column that holds the texts
batch_starts = range(0, len(toxicity_analysis_df), batch_size)
preprocessed_texts = toxicity_analysis_df['preprocessed']

# Collects the model output of all batches
toxicity_results = []

# Walk over the DataFrame one batch at a time
for start in tqdm(batch_starts, desc="Processing Toxicity Predictions"):
    chunk = preprocessed_texts.iloc[start:start + batch_size].tolist()
    toxicity_results += predict_toxicity_batch(chunk)

# Put the predictions into their own DataFrame and attach it column-wise
results_df = pd.DataFrame(data=toxicity_results, columns=toxicity_labels)
final_df = pd.concat(objs=[toxicity_analysis_df, results_df], axis=1)

In [None]:
# Destination of the tab-separated export of final_df
file_path = '/content/preprocessed_df_detailed.tsv'

try:
    final_df.to_csv(file_path, sep="\t", index=False)
except Exception as err:
    print(f"Error saving file: {err}")
else:
    # Only reached when the export did not raise
    print(f"File saved successfully: {file_path}")