## Run the preprocessed data through a model that predicts the identity
## to see whether we can make a more detailed analysis.

In [None]:
#!pip install transformers tqdm

import pandas as pd
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch
from tqdm import tqdm 

In [None]:
# Tab-separated input file holding the preprocessed texts.
file_path = '/content/preprocessed_df_1%_data.tsv'

try:
    toxicity_analysis_df = pd.read_csv(file_path, sep="\t")
except Exception as e:
    print(f"Error loading file: {e}")
    # Re-raise instead of continuing: without the DataFrame, every later cell
    # would fail with an unrelated NameError that hides the real cause.
    raise

In [None]:
# Bare expression: as the last statement of a notebook cell this displays the
# loaded DataFrame for a quick visual check; it has no other effect.
toxicity_analysis_df

In [None]:
# Define your identity labels.
# Declared before the model so the classifier head can be sized from this list.
identity_labels = [
    "male",
    "female",
    "homosexual_gay_or_lesbian",
    "christian",
    "jewish",
    "muslim",
    "black",
    "white",
    "psychiatric_or_mental_illness",
]

# Load a pre-trained model and tokenizer
model_name = "distilbert-base-uncased"  # You can choose another model as well

# One output logit per identity label. Deriving the count from the list keeps
# the model output width equal to the number of column names used for the
# prediction DataFrame; a hard-coded count silently drifts when labels change
# and makes the DataFrame construction fail with a column-count mismatch.
# NOTE(review): "distilbert-base-uncased" is a base checkpoint, so the
# classification head is presumably newly initialised here; the probabilities
# are only meaningful after fine-tuning on identity labels -- confirm.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(identity_labels),
)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Function to predict identities for a batch of texts
def predict_identities_batch(texts):
    """Return per-label sigmoid probabilities for a batch of texts.

    Uses the module-level ``tokenizer`` and ``model``.

    Args:
        texts: Iterable of strings to score. It is materialised into a list
            before tokenisation.

    Returns:
        A list with one entry per input text; each entry is a list of floats
        obtained by applying a sigmoid to that text's logits (one value per
        model output label). An empty input yields an empty list.
    """
    texts = list(texts)
    if not texts:
        # Nothing to score: skip the tokenizer and model, which are not meant
        # to be called with an empty batch.
        return []

    inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True)

    # Move the model and inputs to the GPU if available. Moving the model again
    # on later calls is harmless once it already lives on the device.
    if torch.cuda.is_available():
        model.to('cuda')
        inputs = {key: val.to('cuda') for key, val in inputs.items()}

    # Inference only: no gradients needed.
    with torch.no_grad():
        logits = model(**inputs).logits

    # Sigmoid scores each label independently (multi-label), unlike softmax.
    return torch.sigmoid(logits).tolist()

# Set the batch size
batch_size = 32  # Adjust based on GPU memory

# Extract the text column once. Empty fields in the TSV are read back by pandas
# as NaN (a float), which the tokenizer cannot handle, so replace them with
# empty strings and force every value to str before batching.
preprocessed_texts = (
    toxicity_analysis_df['preprocessed'].fillna("").astype(str).tolist()
)

# Initialize a list to store the results (one probability row per text)
identity_results = []

# Process the texts in batches
for i in tqdm(range(0, len(preprocessed_texts), batch_size), desc="Processing Identity Predictions"):
    batch_texts = preprocessed_texts[i:i + batch_size]
    identity_results.extend(predict_identities_batch(batch_texts))

# Reuse the input's index: concat(axis=1) aligns rows by index label, so a
# fresh 0..n-1 index would misalign (and add NaN rows) whenever the input
# DataFrame does not carry a default RangeIndex.
results_df = pd.DataFrame(
    identity_results,
    columns=identity_labels,
    index=toxicity_analysis_df.index,
)
final_df = pd.concat([toxicity_analysis_df, results_df], axis=1)