In [3]:
# Toxicity-proxy analysis: scores chat comments with a DistilBERT SST-2 sentiment classifier (negative sentiment is treated as "toxic")
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

In [4]:

# Read the exported comments into a DataFrame and report how many rows came in.
csv_path = 'zizaran_day_1_full_1700.csv'
df = pd.read_csv(csv_path)
row_count = len(df)
print(f"Loaded {row_count} comments")

Loaded 1744 comments


In [5]:

# Both the tokenizer and the classifier come from the same checkpoint, so name it
# once. Note the checkpoint is fine-tuned on SST-2 (sentiment), per its identifier.
checkpoint = 'distilbert-base-uncased-finetuned-sst-2-english'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

In [6]:

def check_toxicity(text):
    """Score a comment as ``1 - P(positive)`` using the module-level classifier.

    The score is only a *proxy* for toxicity: the checkpoint loaded in this
    notebook is a sentiment model, so any negatively-worded (or merely
    neutral/critical) comment can score close to 1 without being abusive.

    Relies on the module-level ``tokenizer`` and ``model`` objects.

    Args:
        text: The comment to score. Anything that is not a non-blank string
            (``None``, NaN from pandas, ``""``, whitespace) is treated as
            non-toxic.

    Returns:
        float: A built-in Python float in [0, 1]; ``0.0`` for non-string or
        blank input.
    """
    if not isinstance(text, str) or not text.strip():
        return 0.0

    # Tokenize a single comment, capping it at 512 tokens.
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)

    # Inference only, so skip gradient tracking.
    with torch.no_grad():
        logits = model(**inputs).logits

    # Class probabilities for the one input row. Index 1 is taken to be the
    # "positive" class, matching this checkpoint's label order.
    probs = torch.softmax(logits, dim=1)[0]

    # Convert to a built-in float so both return paths yield the same type
    # (the early return above is a plain 0.0, not a numpy/torch scalar).
    return float(1 - probs[1])


In [11]:

# Spot-check the scorer on a handful of randomly chosen comments.
print("\nTesting on a few examples:")
non_null_comments = df['comment_text'].dropna()
# Cap the draw at the number of available comments (sample() raises when n exceeds
# the population) and fix the seed so re-running the cell shows the same examples.
sample_comments = non_null_comments.sample(
    n=min(5, len(non_null_comments)), random_state=42
).tolist()
for comment in sample_comments:
    toxicity = check_toxicity(comment)
    print(f"Comment: {comment}")
    print(f"Toxicity score: {toxicity:.4f}")
    print("-" * 50)


Testing on a few examples:
Comment: I wouldn't show that to my kid :O
Toxicity score: 0.9821
--------------------------------------------------
Comment: how many builds are really viable in Ruthless? except cold dot... which almost everybody seems to be going for :D
Toxicity score: 0.9959
--------------------------------------------------
Comment: too much sunk cost and ppl getting possesive imo
Toxicity score: 0.9995
--------------------------------------------------
Comment: I hope we get a expedition / heist level league
Toxicity score: 0.9812
--------------------------------------------------
Comment: @jisuo Depends on where in sweden you are.. northern sweden yes. southern sweden not so much
Toxicity score: 0.9983
--------------------------------------------------


In [33]:

# Trial run: score only the first `sample_size` rows, on a copy so `df` is untouched.
sample_size = 100
sample_df = df.head(sample_size).copy()

print(f"\nAnalyzing {sample_size} comments...")
results = []
for position, text in enumerate(sample_df['comment_text'], start=1):
    # Announce progress on the 1st, 11th, 21st, ... comment.
    if position % 10 == 1:
        print(f"Processing comment {position}/{sample_size}")
    results.append(check_toxicity(text))

# Attach the raw score plus a yes/no flag for scores strictly above 0.5.
sample_df['toxicity_score'] = results
above_threshold = sample_df['toxicity_score'] > 0.5
sample_df['DistilBERT'] = above_threshold.map({True: 'yes', False: 'no'})


Analyzing 100 comments...
Processing comment 1/100
Processing comment 11/100
Processing comment 21/100
Processing comment 31/100
Processing comment 41/100
Processing comment 51/100
Processing comment 61/100
Processing comment 71/100
Processing comment 81/100
Processing comment 91/100


In [35]:

# Summarise the trial run: row count, flagged count, and flagged share in percent.
flagged = sample_df['DistilBERT'] == 'yes'
print("\nResults:")
print(f"Total comments analyzed: {len(sample_df)}")
print(f"Comments detected as toxic: {flagged.sum()} ({flagged.mean()*100:.1f}%)")


Results:
Total comments analyzed: 100
Comments detected as toxic: 77 (77.0%)


In [37]:

# Show the five highest-scoring comments from the trial run.
print("\nMost toxic comments:")
most_toxic = sample_df.sort_values('toxicity_score', ascending=False).head(5)
separator = "-" * 50
for text, score in zip(most_toxic['comment_text'], most_toxic['toxicity_score']):
    print(f"Comment: {text}")
    print(f"Toxicity score: {score:.4f}")
    print(separator)


Most toxic comments:
Comment: He is just too bad to even enjoy this mode
Toxicity score: 0.9998
--------------------------------------------------
Comment: This chat in particular has the worst chatters Madge
Toxicity score: 0.9998
--------------------------------------------------
Comment: i wanna play poe so bad but i feel like its pointless before 3.20 since theres so many good changes Sadeg
Toxicity score: 0.9998
--------------------------------------------------
Comment: this gamemode is not for me peepoRage and thus it should not exist peepoRage
Toxicity score: 0.9997
--------------------------------------------------
Comment: any reason why everyone is just playing the silly broken 1 button rf/cold builds when this is supposed to be hard mode XD
Toxicity score: 0.9996
--------------------------------------------------


In [39]:

# Persist the scored trial rows (without the index column) and confirm the path.
output_file = 'toxicity_results.csv'
sample_df.to_csv(path_or_buf=output_file, index=False)
print("\nResults saved to {}".format(output_file))


Results saved to toxicity_results.csv


In [41]:

# Full run: score every row of `df`, then write the annotated frame to disk.
print("\nAnalyzing all comments... (this may take a while)")
total_rows = len(df)
all_results = []
for idx, text in enumerate(df['comment_text']):
    # Progress line on rows 1, 501, 1001, ...
    if idx % 500 == 0:
        print(f"Processing comment {idx+1}/{total_rows}")
    all_results.append(check_toxicity(text))

# Adds two columns to `df` in place: the raw score and a yes/no flag (> 0.5).
df['toxicity_score'] = all_results
over_half = df['toxicity_score'] > 0.5
df['DistilBERT'] = over_half.map({True: 'yes', False: 'no'})
df.to_csv('zizarian.csv', index=False)
print("Full analysis complete!")


Analyzing all comments... (this may take a while)
Processing comment 1/1744
Processing comment 501/1744
Processing comment 1001/1744
Processing comment 1501/1744
Full analysis complete!
