In [1]:
# Project: Detecting Offensive Text Using NLP Techniques
# Subject: CS 59000-05 Natural Language Processing
# Author: 
# Qurratul Ain Quais : quaiqa@pfw.edu


In [2]:
# Required installations:
# pip install pandas scikit-learn matplotlib seaborn wordcloud spacy
# python -m spacy download en_core_web_sm


In [3]:
# Imports
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import roc_auc_score, precision_recall_curve, confusion_matrix, classification_report
import matplotlib.pyplot as plt
import seaborn as sns
from datasets import load_dataset
import re
import string
from collections import Counter
import spacy
from wordcloud import WordCloud



In [6]:
# Define labels globally
label_cols = ['toxicity', 'obscene', 'threat', 
              'insult', 'identity_attack', 'sexual_explicit']

# Load dataset
def load_and_prepare_data():
    print("Loading dataset...")
    ds = load_dataset("google/civil_comments")
    
    # Convert to pandas and prepare
    train = ds["train"].to_pandas()
    test = ds["test"].to_pandas()
    
    # Add unique ID to test data
    test = test.reset_index(drop=True)
    test['id'] = test.index
    
    # Handle empty comments
    train['text'] = train['text'].fillna("unknown")
    test['text'] = test['text'].fillna("unknown")
    
    return train, test

# Tokenization
re_tok = re.compile(f'([{string.punctuation}""¨«»®´·º½¾¿¡§£₤''])')
def tokenize(s): 
    return re_tok.sub(r' \1 ', s).split()

# Feature extraction
def extract_features(train_df, test_df, text_column='text'):
    print("Extracting features...")
    vec = TfidfVectorizer(
        ngram_range=(1,2),
        tokenizer=tokenize,
        min_df=3,
        max_df=0.9,
        strip_accents='unicode',
        use_idf=True,
        smooth_idf=True,
        sublinear_tf=True
    )
    
    train_features = vec.fit_transform(train_df[text_column])
    test_features = vec.transform(test_df[text_column])
    
    return train_features, test_features, vec

# Model training utilities
def pr(y_i, y, x):
    p = x[y==y_i].sum(0)
    return (p+1) / ((y==y_i).sum()+1)

def train_model(x, y):
    """Train logistic regression model with Naive Bayes features"""
    y_binary = (y.values >= 0.5).astype(int)
    r = np.log(pr(1, y_binary, x) / pr(0, y_binary, x))
    m = LogisticRegression(C=4, max_iter=200)
    x_nb = x.multiply(r)
    return m.fit(x_nb, y_binary), r

def calculate_metrics(predictions_df, test_df):
    """Calculate comprehensive metrics including Precision, Recall, F1-score, and Accuracy"""
    metrics_results = {}
    
    # Ensure predictions and test data are aligned by id
    predictions_df = predictions_df.set_index('id')
    test_df = test_df.set_index('id')
    
    # Verify that IDs match
    if not predictions_df.index.equals(test_df.index):
        raise ValueError("Prediction and test set IDs don't match!")
    
    for col in label_cols:
        # Convert predictions to binary using 0.5 threshold
        y_pred_binary = (predictions_df[col] >= 0.5).astype(int)
        y_true = (test_df[col] >= 0.5).astype(int)
        
        # Calculate confusion matrix
        tn, fp, fn, tp = confusion_matrix(y_true, y_pred_binary).ravel()
        
        # Calculate metrics manually to avoid classification_report issues
        precision = tp / (tp + fp) if (tp + fp) > 0 else 0
        recall = tp / (tp + fn) if (tp + fn) > 0 else 0
        f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
        accuracy = (tp + tn) / (tp + tn + fp + fn)
        
        metrics_results[col] = {
            'precision': precision,
            'recall': recall,
            'f1_score': f1,
            'accuracy': accuracy,
            'auc': roc_auc_score(y_true, predictions_df[col]) if len(np.unique(y_true)) > 1 else 0,
            'confusion_matrix': {
                'tn': tn,
                'fp': fp,
                'fn': fn,
                'tp': tp
            }
        }
        
        # Print metrics for current label
        print(f"\nMetrics for {col}:")
        print(f"Precision: {metrics_results[col]['precision']:.4f}")
        print(f"Recall: {metrics_results[col]['recall']:.4f}")
        print(f"F1-score: {metrics_results[col]['f1_score']:.4f}")
        print(f"Accuracy: {metrics_results[col]['accuracy']:.4f}")
        print(f"AUC: {metrics_results[col]['auc']:.4f}")
        
        # Create confusion matrix visualization
        cm = np.array([[tn, fp], [fn, tp]])
        plt.figure(figsize=(8, 6))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
        plt.title(f'Confusion Matrix for {col}')
        plt.ylabel('True Label')
        plt.xlabel('Predicted Label')
        plt.savefig(f'confusion_matrix_{col}.png')
        plt.close()
    
    return metrics_results

def visualize_all_metrics(metrics_results):
    """Create a comprehensive visualization of all metrics"""
    metrics_to_plot = ['precision', 'recall', 'f1_score', 'accuracy', 'auc']
    n_metrics = len(metrics_to_plot)
    
    plt.figure(figsize=(15, 10))
    width = 0.15
    x = np.arange(len(label_cols))
    
    for i, metric in enumerate(metrics_to_plot):
        values = [metrics_results[col][metric] for col in label_cols]
        plt.bar(x + (i - n_metrics/2 + 0.5) * width, values, width, label=metric.replace('_', ' ').title())
    
    plt.xlabel('Labels')
    plt.ylabel('Score')
    plt.title('All Metrics Comparison Across Labels')
    plt.xticks(x, label_cols, rotation=45)
    plt.legend()
    plt.tight_layout()
    plt.savefig('all_metrics_comparison.png')
    plt.close()



def calculate_auc_scores(predictions_df, test_df):
    """Calculate AUC scores using aligned IDs."""
    # Ensure predictions and test data are aligned by id
    predictions_df = predictions_df.set_index('id')
    test_df = test_df.set_index('id')
    
    # Verify that IDs match
    if not predictions_df.index.equals(test_df.index):
        raise ValueError("Prediction and test set IDs don't match!")
    
    auc_scores = {}
    
    for col in label_cols:
        # Get predictions and true labels, ensuring they're aligned by ID
        y_pred = predictions_df[col]
        y_true = (test_df[col] >= 0.5).astype(int)
        
        if len(np.unique(y_true)) > 1:
            auc = roc_auc_score(y_true, y_pred)
            auc_scores[col] = auc
            print(f"{col}: {auc:.4f}")
    
    mean_auc = np.mean(list(auc_scores.values()))
    print(f"\nMean ROC AUC Score: {mean_auc:.4f}")
    
    return mean_auc, auc_scores

def visualize_auc_scores(column_scores, mean_auc):
    """Visualize AUC scores for each label"""
    plt.figure(figsize=(10, 6))
    valid_labels = list(column_scores.keys())
    scores = [column_scores[label] for label in valid_labels]

    plt.bar(valid_labels, scores)
    plt.axhline(y=mean_auc, color='r', linestyle='--', label=f'Mean AUC: {mean_auc:.4f}')
    plt.xticks(rotation=45)
    plt.ylabel('AUC Score')
    plt.title('ROC AUC Scores by Label')
    plt.legend()
    plt.tight_layout()
    plt.savefig('auc_scores.png')
    plt.close()

def analyze_label_distribution(train_df, test_df):
    """Analyze and visualize label distribution"""
    print("\nAnalyzing label distribution...")
    plt.figure(figsize=(12, 6))
    
    train_stats = {col: (train_df[col] >= 0.5).mean() * 100 for col in label_cols}
    test_stats = {col: (test_df[col] >= 0.5).mean() * 100 for col in label_cols}
    
    x = np.arange(len(label_cols))
    width = 0.35
    
    plt.bar(x - width/2, train_stats.values(), width, label='Train')
    plt.bar(x + width/2, test_stats.values(), width, label='Test')
    
    plt.xlabel('Labels')
    plt.ylabel('Percentage of Positive Cases')
    plt.title('Distribution of Labels in Train and Test Sets')
    plt.xticks(x, label_cols, rotation=45)
    plt.legend()
    plt.tight_layout()
    plt.savefig('label_distribution.png')
    plt.close()
    
    return train_stats, test_stats

def create_wordcloud(text_series, title):
    """Create and save wordcloud visualization"""
    wordcloud = WordCloud(width=800, height=400, background_color='white').generate(' '.join(text_series))
    
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.title(title)
    plt.tight_layout(pad=0)
    plt.savefig(f'{title.lower().replace(" ", "_")}.png')
    plt.close()

def create_wordcloud_by_label(df, label, title):
    """Create wordcloud for specific label"""
    print(f"Creating wordcloud for {label}...")
    
    # Filter comments where the label value is >= 0.5 (toxic)
    toxic_comments = df[df[label] >= 0.5]['text']
    
    if len(toxic_comments) == 0:
        print(f"No comments found for {label}")
        return
        
    text = ' '.join(toxic_comments)
    wordcloud = WordCloud(width=800, height=400, 
                         background_color='white',
                         max_words=200).generate(text)
    
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.title(title)
    plt.tight_layout(pad=0)
    plt.savefig(f'wordcloud_{label}.png')
    plt.close()

def analyze_correlations(predictions_df):
    """Analyze and visualize correlations between predictions"""
    print("\nAnalyzing label correlations...")
    corr_matrix = predictions_df[label_cols].corr()
    
    plt.figure(figsize=(10, 8))
    sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', center=0)
    plt.title('Label Correlation Matrix')
    plt.tight_layout()
    plt.savefig('correlation_matrix.png')
    plt.close()
    
    return corr_matrix

def main():
    # Load and prepare data
    train, test = load_and_prepare_data()
    
    # Extract features
    train_features, test_features, vectorizer = extract_features(train, test)
    
    # Train models and make predictions
    print("Training models and making predictions...")
    predictions = np.zeros((len(test), len(label_cols)))
    for i, col in enumerate(label_cols):
        print(f'Training model for {col}...')
        model, r = train_model(train_features, train[col])
        predictions[:,i] = model.predict_proba(test_features.multiply(r))[:,1]
    
    # Create submission DataFrame
    predictions_df = pd.DataFrame(predictions, columns=label_cols)
    predictions_df['id'] = test['id']
    predictions_df.to_csv('submission.csv', index=False)
    
    # Calculate all metrics
    print("\nCalculating comprehensive metrics...")
    metrics_results = calculate_metrics(predictions_df, test)
    
    # Visualize all metrics
    visualize_all_metrics(metrics_results)
    
    # Previous visualizations and analysis
    train_stats, test_stats = analyze_label_distribution(train, test)
    
    # Create word clouds
    toxic_mask = predictions_df['toxicity'] >= 0.5
    create_wordcloud(test[toxic_mask]['text'], 'Toxic Comments WordCloud')
    create_wordcloud(test[~toxic_mask]['text'], 'Non-Toxic Comments WordCloud')

    print("\nCreating word clouds for specific toxicity types...")
    for label in label_cols:
        create_wordcloud_by_label(train, label, f'WordCloud for {label} Comments')
    
    # Analyze correlations
    correlation_matrix = analyze_correlations(predictions_df)
    
    # Save detailed results including new metrics
    results = {
        'metrics_by_label': metrics_results,
        'train_distribution': train_stats,
        'test_distribution': test_stats,
        'correlation_matrix': correlation_matrix.to_dict()
    }
    
    # Save results to file
    with open('analysis_results.txt', 'w') as f:
        f.write("ANALYSIS RESULTS\n")
        f.write("=" * 50 + "\n\n")
        for key, value in results.items():
            f.write(f"{key.upper()}:\n")
            f.write(str(value))
            f.write("\n" + "=" * 50 + "\n")
    
    print("\nAnalysis complete! Check the generated files for visualizations and detailed results.")

if __name__ == "__main__":
    main()

Loading dataset...
Extracting features...




Training models and making predictions...
Training model for toxicity...
Training model for obscene...
Training model for threat...
Training model for insult...
Training model for identity_attack...
Training model for sexual_explicit...

Calculating comprehensive metrics...

Metrics for toxicity:
Precision: 0.7676
Recall: 0.5267
F1-score: 0.6247
Accuracy: 0.9494
AUC: 0.9533

Metrics for obscene:
Precision: 0.6962
Recall: 0.4346
F1-score: 0.5351
Accuracy: 0.9958
AUC: 0.9836

Metrics for threat:
Precision: 0.3500
Recall: 0.1267
F1-score: 0.1860
Accuracy: 0.9975
AUC: 0.9651

Metrics for insult:
Precision: 0.7657
Recall: 0.5608
F1-score: 0.6474
Accuracy: 0.9637
AUC: 0.9663

Metrics for identity_attack:
Precision: 0.4430
Recall: 0.2038
F1-score: 0.2792
Accuracy: 0.9926
AUC: 0.9745

Metrics for sexual_explicit:
Precision: 0.6000
Recall: 0.2625
F1-score: 0.3652
Accuracy: 0.9977
AUC: 0.9882

Analyzing label distribution...

Creating word clouds for specific toxicity types...
Creating wordcloud