In [2]:
pip install detoxify

Collecting detoxify
  Using cached detoxify-0.5.2-py3-none-any.whl.metadata (13 kB)
Using cached detoxify-0.5.2-py3-none-any.whl (12 kB)
Installing collected packages: detoxify
Successfully installed detoxify-0.5.2

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip3 install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [6]:
import os
import time

import pandas as pd
from detoxify import Detoxify

In [16]:
def analyze_toxicity(csv_file_path, save_results=True, sample_size=None):
    """Score the comments of a CSV file with Detoxify and label each one.

    Parameters
    ----------
    csv_file_path : str or os.PathLike
        Path to a CSV file that contains a ``comment`` column.
    save_results : bool, default True
        When true, write ``user``, ``comment``, ``primary_toxicity_type`` and
        the six Detoxify score columns to ``<input path without extension>_with_toxicity.csv``.
        Columns that are not available are reported and left out.
    sample_size : int or None, default None
        When given, only a random sample of this many rows is analysed
        (capped at the number of rows; drawn with a fixed seed so re-runs pick
        the same rows). ``None`` analyses every row.

    Returns
    -------
    pandas.DataFrame
        One row per successfully scored comment, holding the Detoxify scores,
        the original CSV columns and ``primary_toxicity_type``. Rows whose
        comment is missing or blank are skipped.

    Raises
    ------
    KeyError
        If the CSV has no ``comment`` column.
    """
    toxicity_cols = ['toxicity', 'severe_toxicity', 'obscene', 'threat', 'insult', 'identity_attack']

    print(f"Loading data from {csv_file_path}...")
    # Load the CSV file
    df = pd.read_csv(csv_file_path)

    # Fail before the (slow) model load rather than on the first loop iteration.
    if 'comment' not in df.columns:
        raise KeyError(f"'comment' column not found in {csv_file_path}")

    if sample_size is not None:
        # sort_index keeps the sampled rows in their original file order.
        df = df.sample(n=min(sample_size, len(df)), random_state=0).sort_index()
        print(f"Analyzing a sample of {len(df)} comments...")
    else:
        print(f"Analyzing all {len(df)} comments...")

    total = len(df)

    print("Loading Detoxify model...")
    model = Detoxify('original')

    # Process each comment
    results = []
    start_time = time.time()

    # `position` counts rows actually visited; `row_label` is the DataFrame
    # index label, which is not a reliable counter once rows are sampled.
    for position, (row_label, row) in enumerate(df.iterrows()):
        if position % 500 == 0 and position > 0:
            # Clamp to avoid a zero division on coarse clocks.
            elapsed = max(time.time() - start_time, 1e-9)
            comments_per_second = position / elapsed
            estimated_total = elapsed * (total / position)
            print(f"Processed {position}/{total} comments ({comments_per_second:.2f} comments/sec, estimated total time: {estimated_total/60:.1f} min)")

        comment = row['comment']
        if pd.isna(comment):
            continue
        # Cells that pandas parsed as numbers are still comments; score their text.
        comment = str(comment)
        if comment.strip() == '':
            continue

        try:
            scores = model.predict(comment)

            # Add original data
            for col in df.columns:
                scores[col] = row[col]
            results.append(scores)

        except Exception as e:
            print(f"Error processing comment {row_label}: {e}")
            print(f"Comment: {comment}")

    if results:
        results_df = pd.DataFrame(results)
        results_df['primary_toxicity_type'] = results_df.apply(get_primary_toxicity, axis=1)
    else:
        # Nothing was scored: return an empty frame that still has the expected columns.
        empty_cols = list(dict.fromkeys(toxicity_cols + list(df.columns) + ['primary_toxicity_type']))
        results_df = pd.DataFrame(columns=empty_cols)

    # Calculate elapsed time
    elapsed_time = max(time.time() - start_time, 1e-9)
    print(f"Analysis completed in {elapsed_time:.2f} seconds ({len(results)/elapsed_time:.2f} comments/sec)")

    if save_results:
        output_cols = ['user', 'comment'] + ['primary_toxicity_type'] + toxicity_cols
        missing_cols = [col for col in output_cols if col not in results_df.columns]
        if missing_cols:
            # Do not throw away a finished run because an optional column is absent.
            print(f"Warning: columns not found and omitted from output: {missing_cols}")
        present_cols = [col for col in output_cols if col not in missing_cols]

        # splitext always yields a path different from the input, so the source
        # CSV can never be overwritten (str.replace left e.g. 'data.CSV' unchanged).
        root, _ = os.path.splitext(csv_file_path)
        output_file = f"{root}_with_toxicity.csv"
        results_df[present_cols].to_csv(output_file, index=False)
        print(f"Results saved to {output_file}")
    return results_df

In [18]:
def get_primary_toxicity(row, threshold=0.5):
    """Return a human-readable label for the dominant toxicity category of one row.

    Parameters
    ----------
    row : mapping-like (e.g. a pandas Series)
        Must provide the keys ``toxicity``, ``severe_toxicity``, ``obscene``,
        ``threat``, ``insult`` and ``identity_attack`` with numeric scores.
    threshold : float, default 0.5
        Minimum score for a category to count as present.

    Returns
    -------
    str
        The title-cased name of the highest-scoring specific category (for
        example ``"Identity Attack"``) when that score reaches ``threshold``;
        otherwise ``"General Toxicity"`` when the overall ``toxicity`` score
        reaches it; otherwise ``"Non-toxic"``.
    """
    specific_toxicity_cols = ['severe_toxicity', 'obscene', 'threat', 'insult', 'identity_attack']

    # On a tie, max() keeps the category that appears first in the list above.
    max_specific_category = max(specific_toxicity_cols, key=lambda col: row[col])
    max_specific_score = row[max_specific_category]

    # Only assign a specific toxicity type if the score is above the threshold
    if max_specific_score >= threshold:
        return max_specific_category.replace('_', ' ').title()
    # If no specific category is high enough but general toxicity is high
    elif row['toxicity'] >= threshold:
        return "General Toxicity"
    else:
        return "Non-toxic"

In [20]:
def get_toxicity_distribution(df):
    """Print and return the share of comments per primary toxicity type.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``primary_toxicity_type`` column.

    Returns
    -------
    pandas.DataFrame
        Indexed by toxicity type, with an integer ``count`` column and a
        ``percentage`` column (percent of ``len(df)``, rounded to two
        decimals), sorted by ``count`` in descending order.

    Raises
    ------
    ValueError
        If ``primary_toxicity_type`` is not a column of ``df``.
    """
    if 'primary_toxicity_type' not in df.columns:
        raise ValueError("DataFrame must contain 'primary_toxicity_type' column")

    toxicity_counts = df['primary_toxicity_type'].value_counts()

    total_comments = len(df)
    toxicity_percentages = (toxicity_counts / total_comments * 100).round(2)

    distribution_df = pd.DataFrame({
        'count': toxicity_counts,
        'percentage': toxicity_percentages
    })

    distribution_df = distribution_df.sort_values(by='count', ascending=False)

    # Display the results
    print("\n===== TOXICITY DISTRIBUTION =====")
    print(f"Total comments analyzed: {total_comments}")
    print("\nDistribution by toxicity type:")

    # Walk the columns directly: iterrows() would upcast the integer counts to
    # float in each mixed-dtype row and print them as e.g. "261.0 comments".
    for toxicity_type, count, percentage in zip(
        distribution_df.index, distribution_df['count'], distribution_df['percentage']
    ):
        print(f"{toxicity_type}: {count} comments ({percentage}%)")
    return distribution_df


In [14]:

# Score every comment in the HasanAbi export; the scored copy is written
# alongside the input with a "_with_toxicity" suffix.
csv_file_path = "HasanAbi.csv"
results = analyze_toxicity(
    csv_file_path,
    save_results=True,
    sample_size=None,
)

# Print and keep the breakdown of comments per primary toxicity label.
toxicity_distribution = get_toxicity_distribution(results)
    


Loading data from HasanAbi.csv...
Analyzing all 299 comments...
Loading Detoxify model...
Processed 100/299 comments (43.06 comments/sec, estimated total time: 0.1 min)
Processed 200/299 comments (43.45 comments/sec, estimated total time: 0.1 min)
Analysis completed in 6.93 seconds (43.14 comments/sec)
Results saved to HasanAbi_with_toxicity.csv

===== TOXICITY DISTRIBUTION =====
Total comments analyzed: 299

Distribution by toxicity type:
Non-toxic: 261.0 comments (87.29%)
General Toxicity: 22.0 comments (7.36%)
Obscene: 16.0 comments (5.35%)


In [2]:
# Score every comment in the larger HasanAbi export. The cells that define
# analyze_toxicity and get_toxicity_distribution must have been run first.
csv_file_path = "HasanAbi-11478.csv"
results = analyze_toxicity(
    csv_file_path,
    save_results=True,
    sample_size=None,
)

# Print and keep the breakdown of comments per primary toxicity label.
toxicity_distribution = get_toxicity_distribution(results)

NameError: name 'analyze_toxicity' is not defined

In [22]:
# Score every comment in the zizarian export; the scored copy is written
# alongside the input with a "_with_toxicity" suffix.
csv_file_path = "zizarian.csv"
results = analyze_toxicity(
    csv_file_path,
    save_results=True,
    sample_size=None,
)

# Print and keep the breakdown of comments per primary toxicity label.
toxicity_distribution = get_toxicity_distribution(results)

Loading data from zizarian.csv...
Analyzing all 1744 comments...
Loading Detoxify model...
Processed 500/1744 comments (46.17 comments/sec, estimated total time: 0.6 min)
Processed 1000/1744 comments (45.87 comments/sec, estimated total time: 0.6 min)
Processed 1500/1744 comments (45.13 comments/sec, estimated total time: 0.6 min)
Analysis completed in 38.99 seconds (44.73 comments/sec)
Results saved to zizarian_with_toxicity.csv

===== TOXICITY DISTRIBUTION =====
Total comments analyzed: 1744

Distribution by toxicity type:
Non-toxic: 1602.0 comments (91.86%)
Obscene: 70.0 comments (4.01%)
General Toxicity: 64.0 comments (3.67%)
Insult: 6.0 comments (0.34%)
Threat: 1.0 comments (0.06%)
Identity Attack: 1.0 comments (0.06%)


In [22]:
# Score every comment in the loltyler export; the scored copy is written
# alongside the input with a "_with_toxicity" suffix.
csv_file_path = "loltyler.csv"
results = analyze_toxicity(
    csv_file_path,
    save_results=True,
    sample_size=None,
)

# Print and keep the breakdown of comments per primary toxicity label.
toxicity_distribution = get_toxicity_distribution(results)

Loading data from loltyler.csv...
Analyzing all 52359 comments...
Loading Detoxify model...
Processed 500/52359 comments (44.35 comments/sec, estimated total time: 19.7 min)
Processed 1000/52359 comments (42.78 comments/sec, estimated total time: 20.4 min)
Processed 1500/52359 comments (43.48 comments/sec, estimated total time: 20.1 min)
Processed 2000/52359 comments (44.26 comments/sec, estimated total time: 19.7 min)
Processed 2500/52359 comments (44.33 comments/sec, estimated total time: 19.7 min)
Processed 3000/52359 comments (44.73 comments/sec, estimated total time: 19.5 min)
Processed 3500/52359 comments (44.83 comments/sec, estimated total time: 19.5 min)
Processed 4000/52359 comments (44.78 comments/sec, estimated total time: 19.5 min)
Processed 4500/52359 comments (44.67 comments/sec, estimated total time: 19.5 min)
Processed 5000/52359 comments (45.05 comments/sec, estimated total time: 19.4 min)
Processed 5500/52359 comments (45.01 comments/sec, estimated total time: 19.4 m