In [1]:
# Project: Detecting Offensive Text Using NLP Techniques
# Subject: CS 59000-05 Natural Language Processing
# Author: 
# Qurratul Ain Quais : quaiqa@pfw.edu

In [2]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import re
import string
from wordcloud import WordCloud

In [3]:
# Names of the six toxicity label columns. Every per-label plot, model and
# metric in this notebook iterates over this list, in this order.
label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

def load_and_prepare_data():
    """Read the three CSV inputs and apply basic clean-up.

    Reads ``train.csv``, ``test.csv`` and ``test_labels.csv`` from the
    current working directory. The training frame gains a ``none`` column
    that is 1 when none of the ``label_cols`` is set and 0 otherwise, and
    missing ``comment_text`` values in both train and test are replaced by
    the string ``"unknown"``.

    Returns:
        tuple: ``(train, test, test_labels)`` DataFrames.
    """
    print("Loading dataset...")
    train, test, test_labels = (
        pd.read_csv(csv_name)
        for csv_name in ('train.csv', 'test.csv', 'test_labels.csv')
    )

    # Row-wise max over the label columns is 1 as soon as any label is set,
    # so its complement marks the comments that carry no label at all.
    any_label_set = train[label_cols].max(axis=1)
    train['none'] = 1 - any_label_set

    # Missing comment text gets a placeholder string.
    for frame in (train, test):
        frame['comment_text'] = frame['comment_text'].fillna("unknown")

    return train, test, test_labels

def analyze_text_examples(train_df):
    """Print the first and the last comment of the frame as examples.

    The rows are selected by position (``iloc``), so the function works for
    a frame of any length and with any index, instead of relying on one
    specific row label being present.

    Args:
        train_df: DataFrame with a ``comment_text`` column.

    Returns:
        None. The examples are written to stdout.
    """
    comments = train_df['comment_text']
    print("\nExample Comments:")

    # Nothing to show for an empty frame; iloc[0] would raise IndexError.
    if len(comments) == 0:
        print("(no comments available)")
        return

    print('Example 1:\n{}\n'.format(comments.iloc[0]))
    print('Example 2:\n{}\n'.format(comments.iloc[-1]))

def analyze_comment_lengths(df, title='Comment Length Distribution'):
    """Summarise and plot the character length of each comment.

    Prints the mean, standard deviation and maximum of the per-comment
    character counts and saves a 50-bin histogram to
    ``comment_lengths.png``.

    Args:
        df: DataFrame with a ``comment_text`` column.
        title: Title drawn above the histogram.

    Returns:
        dict: ``{'mean': ..., 'std': ..., 'max': ...}`` of the lengths.
    """
    print("\nAnalyzing comment lengths...")
    char_counts = df.comment_text.str.len()
    summary = {
        'mean': char_counts.mean(),
        'std': char_counts.std(),
        'max': char_counts.max(),
    }
    print(f"Mean length: {summary['mean']:.2f}")
    print(f"Standard deviation: {summary['std']:.2f}")
    print(f"Max length: {summary['max']}")

    # Draw on an explicit figure/axes pair and close it once saved.
    fig, ax = plt.subplots(figsize=(10, 6))
    char_counts.hist(bins=50, ax=ax)
    ax.set_title(title)
    ax.set_xlabel('Comment Length (characters)')
    ax.set_ylabel('Frequency')
    fig.savefig('comment_lengths.png')
    plt.close(fig)

    return summary

def analyze_label_distribution(train_df):
    """Plot and print how often each label in ``label_cols`` is set.

    Saves a bar chart of the per-label sums to ``label_distribution.png``
    and prints each label's share of all rows as a percentage.

    Args:
        train_df: DataFrame containing every column named in ``label_cols``.

    Returns:
        dict: label name -> column sum.
    """
    print("\nAnalyzing label distribution...")
    per_label_totals = train_df[label_cols].sum()
    n_rows = len(train_df)

    # Bar chart of the raw counts.
    plt.figure(figsize=(10, 6))
    per_label_totals.plot(kind='bar')
    plt.title('Distribution of Toxic Labels')
    plt.xlabel('Label Type')
    plt.ylabel('Number of Comments')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.savefig('label_distribution.png')
    plt.close()

    # Same counts expressed as a percentage of all rows.
    print("\nLabel Distribution:")
    for name in label_cols:
        share = (per_label_totals[name] / n_rows) * 100
        print(f"{name}: {share:.2f}%")

    return per_label_totals.to_dict()

# Compiled once at definition time: tokenize() is called once per document by
# the vectorizer, so rebuilding the pattern on every call is wasted work.
# The character class holds string.punctuation plus a few extra symbols.
# Because string.punctuation contains the sequence `\]`, the regex engine
# reads it as an escaped `]`; a lone backslash is therefore NOT a separator.
# This matches the behavior of the previous inline pattern.
# NOTE(review): the previous literal contained doubled straight quotes
# ("" and ''), which look like typographic quotes lost in a copy/paste; they
# added nothing to the character class. Confirm whether curly quotes were
# meant to be split as well.
_TOKEN_SPLIT_RE = re.compile(f'([{string.punctuation}¨«»®´·º½¾¿¡§£₤])')


def tokenize(s):
    """Split text into whitespace-separated tokens, isolating punctuation.

    Every character matched by ``_TOKEN_SPLIT_RE`` is surrounded by spaces
    so that it becomes its own token, then the string is split on
    whitespace.

    Args:
        s: Input text.

    Returns:
        list[str]: Tokens in order of appearance; empty for empty input.
    """
    return _TOKEN_SPLIT_RE.sub(r' \1 ', s).split()

def extract_features(train_df, test_df):
    """Build TF-IDF features for the train and test comments.

    A ``TfidfVectorizer`` over unigrams and bigrams, tokenised with
    ``tokenize``, is fitted on the training comments only and then applied
    to the test comments, so both matrices share one vocabulary.

    Args:
        train_df: DataFrame with a ``comment_text`` column (used for fitting).
        test_df: DataFrame with a ``comment_text`` column (transform only).

    Returns:
        tuple: ``(train_features, test_features, vec)`` where ``vec`` is the
        fitted vectorizer.
    """
    print("\nExtracting features...")
    vec = TfidfVectorizer(
        ngram_range=(1,2),
        tokenizer=tokenize,
        # With a custom tokenizer the default token_pattern is never used;
        # setting it to None tells scikit-learn so and avoids the
        # "token_pattern will not be used" UserWarning.
        token_pattern=None,
        min_df=3,
        max_df=0.9,
        strip_accents='unicode',
        use_idf=True,
        smooth_idf=True,
        sublinear_tf=True
    )

    train_features = vec.fit_transform(train_df['comment_text'])
    test_features = vec.transform(test_df['comment_text'])

    print(f"Training features shape: {train_features.shape}")
    print(f"Testing features shape: {test_features.shape}")

    return train_features, test_features, vec

def create_general_wordclouds(df, label_cols):
    """Save one word cloud for labelled comments and one for the rest.

    A comment counts as toxic when the row-wise maximum over ``label_cols``
    equals 1. The toxic cloud is written to ``overall_toxic_wordcloud.png``
    and the non-toxic cloud to ``overall_non_toxic_wordcloud.png``.

    Args:
        df: DataFrame with ``comment_text`` and the label columns.
        label_cols: Names of the label columns to consider.
    """
    print("\nCreating general toxic/non-toxic word clouds...")

    is_toxic = df[label_cols].max(axis=1) == 1

    # (row selector, figure title, output file), toxic first, then the rest.
    cloud_specs = (
        (is_toxic, 'Overall Toxic Comments WordCloud', 'overall_toxic_wordcloud.png'),
        (~is_toxic, 'Overall Non-Toxic Comments WordCloud', 'overall_non_toxic_wordcloud.png'),
    )

    for row_selector, heading, out_file in cloud_specs:
        corpus = ' '.join(df.loc[row_selector, 'comment_text'])
        cloud = WordCloud(
            width=800,
            height=400,
            background_color='white',
            max_words=200,
        ).generate(corpus)

        fig, ax = plt.subplots(figsize=(10, 5))
        ax.imshow(cloud, interpolation='bilinear')
        ax.axis('off')
        ax.set_title(heading)
        fig.tight_layout(pad=0)
        fig.savefig(out_file)
        plt.close(fig)

def create_wordcloud_by_label(df, label, title):
    """Save a word cloud built from the comments where ``label`` equals 1.

    Args:
        df: DataFrame with ``comment_text`` and the ``label`` column.
        label: Name of the label column to filter on; also used in the
            output file name ``wordcloud_<label>.png``.
        title: Title drawn above the cloud.
    """
    flagged_comments = df.loc[df[label] == 1, 'comment_text']
    cloud = WordCloud(
        width=800,
        height=400,
        background_color='white',
        max_words=200,
    ).generate(' '.join(flagged_comments))

    fig, ax = plt.subplots(figsize=(10, 5))
    ax.imshow(cloud, interpolation='bilinear')
    ax.axis('off')
    ax.set_title(title)
    fig.tight_layout(pad=0)
    fig.savefig(f'wordcloud_{label}.png')
    plt.close(fig)

def analyze_correlations(df):
    """Compute and plot pairwise correlations between the label columns.

    The correlation matrix of ``df[label_cols]`` is drawn as an annotated
    heatmap centred on 0 and saved to ``correlation_matrix.png``.

    Args:
        df: DataFrame containing every column named in ``label_cols``.

    Returns:
        pandas.DataFrame: The correlation matrix.
    """
    print("\nAnalyzing label correlations...")
    label_corr = df[label_cols].corr()

    fig, ax = plt.subplots(figsize=(10, 8))
    sns.heatmap(label_corr, annot=True, cmap='coolwarm', center=0, ax=ax)
    ax.set_title('Label Correlation Matrix')
    fig.tight_layout()
    fig.savefig('correlation_matrix.png')
    plt.close(fig)

    return label_corr

def calculate_metrics(predictions_df, test_labels):
    """Compute precision, recall, F1, accuracy and AUC for every label.

    For each column in ``label_cols`` the predicted scores are thresholded
    at 0.5, rows whose true label is -1 are dropped, and the metrics are
    derived from the resulting 2x2 confusion matrix. A heatmap of that
    matrix is saved to ``confusion_matrix_<label>.png`` and the metrics are
    printed. Rows of ``predictions_df`` and ``test_labels`` are matched by
    position, not by id.

    Args:
        predictions_df: Mapping/DataFrame of predicted scores per label.
        test_labels: Mapping/DataFrame of true labels per label; -1 marks
            rows to exclude (presumably unscored rows; verify against the
            data source).

    Returns:
        dict: One entry per label with keys ``precision``, ``recall``,
        ``f1_score``, ``accuracy``, ``auc`` and ``confusion_matrix``
        (``tn``/``fp``/``fn``/``tp``), plus a ``mean_metrics`` entry holding
        the unweighted mean of each metric over all labels. A metric whose
        denominator is zero is reported as 0, as is the AUC of a label with
        only one class present.
    """
    metrics_results = {}

    # One list per metric, one entry per label; averaged at the end.
    all_metrics = {
        'precision': [],
        'recall': [],
        'f1_score': [],
        'accuracy': [],
        'auc': []
    }

    for col in label_cols:
        # Plain arrays, so the boolean mask below is applied positionally.
        y_score = np.asarray(predictions_df[col])
        y_true = np.asarray(test_labels[col])

        # Filter out -1 labels if they exist
        valid_indices = y_true != -1
        y_score = y_score[valid_indices]
        y_true = y_true[valid_indices]

        # Convert predictions to binary using 0.5 threshold
        y_pred_binary = (y_score >= 0.5).astype(int)

        # labels=[0, 1] forces a 2x2 matrix even when only one class occurs
        # in y_true and y_pred; without it ravel() yields a single value and
        # the four-way unpacking raises ValueError.
        tn, fp, fn, tp = confusion_matrix(y_true, y_pred_binary, labels=[0, 1]).ravel()

        # Calculate metrics, reporting 0 whenever a denominator is empty
        precision = tp / (tp + fp) if (tp + fp) > 0 else 0
        recall = tp / (tp + fn) if (tp + fn) > 0 else 0
        f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
        n_scored = tp + tn + fp + fn
        accuracy = (tp + tn) / n_scored if n_scored > 0 else 0
        # ROC AUC is undefined with a single class; report 0 in that case.
        auc = roc_auc_score(y_true, y_score) if len(np.unique(y_true)) > 1 else 0

        # Store metrics for mean calculation
        all_metrics['precision'].append(precision)
        all_metrics['recall'].append(recall)
        all_metrics['f1_score'].append(f1)
        all_metrics['accuracy'].append(accuracy)
        all_metrics['auc'].append(auc)

        metrics_results[col] = {
            'precision': precision,
            'recall': recall,
            'f1_score': f1,
            'accuracy': accuracy,
            'auc': auc,
            'confusion_matrix': {
                'tn': tn,
                'fp': fp,
                'fn': fn,
                'tp': tp
            }
        }

        # Print metrics for current label
        print(f"\nMetrics for {col}:")
        print(f"Precision: {metrics_results[col]['precision']:.4f}")
        print(f"Recall: {metrics_results[col]['recall']:.4f}")
        print(f"F1-score: {metrics_results[col]['f1_score']:.4f}")
        print(f"Accuracy: {metrics_results[col]['accuracy']:.4f}")
        print(f"AUC: {metrics_results[col]['auc']:.4f}")

        # Create confusion matrix visualization (rows: true, columns: predicted)
        cm = np.array([[tn, fp], [fn, tp]])
        plt.figure(figsize=(8, 6))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
        plt.title(f'Confusion Matrix for {col}')
        plt.ylabel('True Label')
        plt.xlabel('Predicted Label')
        plt.savefig(f'confusion_matrix_{col}.png')
        plt.close()

    # Calculate mean metrics (unweighted across labels)
    mean_metrics = {metric: np.mean(values) for metric, values in all_metrics.items()}

    # Print mean metrics
    print("\nMean Metrics Across All Labels:")
    for metric, value in mean_metrics.items():
        print(f"Mean {metric}: {value:.4f}")

    # Add mean metrics to results
    metrics_results['mean_metrics'] = mean_metrics

    return metrics_results

def visualize_all_metrics(metrics_results):
    """Plot per-label metrics and their means as two bar charts.

    The first chart (``all_metrics_comparison.png``) groups five bars per
    label, one per metric, with a dashed horizontal line at each metric's
    mean in the matching colour. The second (``mean_metrics_summary.png``)
    shows one bar per metric mean, annotated with its value.

    Args:
        metrics_results: Dict with one entry per label in ``label_cols``
            plus a ``mean_metrics`` entry, each keyed by metric name.
    """
    metrics_to_plot = ['precision', 'recall', 'f1_score', 'accuracy', 'auc']
    n_metrics = len(metrics_to_plot)

    # --- Grouped comparison across labels ---
    plt.figure(figsize=(15, 10))
    bar_width = 0.15
    positions = np.arange(len(label_cols))

    for idx, metric in enumerate(metrics_to_plot):
        per_label_scores = [metrics_results[col][metric] for col in label_cols]
        # Centre the group of bars on each label's tick position.
        shift = (idx - n_metrics/2 + 0.5) * bar_width
        legend_name = metric.replace('_', ' ').title()
        bars = plt.bar(positions + shift, per_label_scores, bar_width,
                       label=legend_name)

        # Dashed mean line in the same colour as this metric's bars.
        plt.axhline(y=metrics_results['mean_metrics'][metric],
                    color=bars.patches[0].get_facecolor(),
                    linestyle='--', alpha=0.5)

    plt.xlabel('Labels')
    plt.ylabel('Score')
    plt.title('All Metrics Comparison Across Labels (with Means)')
    plt.xticks(positions, label_cols, rotation=45)
    plt.legend()
    plt.tight_layout()
    plt.savefig('all_metrics_comparison.png')
    plt.close()

    # --- Summary of the means ---
    plt.figure(figsize=(10, 6))
    mean_values = [metrics_results['mean_metrics'][metric] for metric in metrics_to_plot]

    bars = plt.bar(metrics_to_plot, mean_values)
    plt.title('Mean Metrics Across All Labels')
    plt.xlabel('Metric')
    plt.ylabel('Mean Score')
    plt.xticks(rotation=45)

    # Write each mean just above its bar.
    for bar in bars:
        bar_top = bar.get_height()
        bar_centre = bar.get_x() + bar.get_width()/2.
        plt.text(bar_centre, bar_top, f'{bar_top:.4f}',
                 ha='center', va='bottom')

    plt.tight_layout()
    plt.savefig('mean_metrics_summary.png')
    plt.close()

def save_metrics_summary(metrics_results):
    """Write a plain-text metrics report to ``metrics_summary.txt``.

    The report lists, for every label in ``label_cols``, each scalar metric
    to four decimals followed by the confusion-matrix counts, and ends with
    the mean of each metric across labels.

    Args:
        metrics_results: Dict with one entry per label plus a
            ``mean_metrics`` entry.
    """
    with open('metrics_summary.txt', 'w') as report:
        emit = report.write

        emit("METRICS SUMMARY REPORT\n")
        emit("=" * 50 + "\n\n")

        # One section per label
        for label in label_cols:
            emit(f"\nMetrics for {label}:\n")
            emit("-" * 30 + "\n")
            for name, score in metrics_results[label].items():
                # The confusion matrix is a nested dict and is printed below.
                if name == 'confusion_matrix':
                    continue
                emit(f"{name}: {score:.4f}\n")
            emit("\nConfusion Matrix:\n")
            counts = metrics_results[label]['confusion_matrix']
            emit(f"TN: {counts['tn']}, FP: {counts['fp']}\n")
            emit(f"FN: {counts['fn']}, TP: {counts['tp']}\n")

        # Closing section with the cross-label means
        emit("\n\nMEAN METRICS ACROSS ALL LABELS\n")
        emit("-" * 30 + "\n")
        for name, score in metrics_results['mean_metrics'].items():
            emit(f"Mean {name}: {score:.4f}\n")
def calculate_mean_column_wise_auc(predictions_df, labels_df):
    """Compute the ROC AUC per label and the mean over labels.

    Rows whose true label is -1 are ignored. A label with fewer than two
    distinct true values left after filtering is skipped and does not
    contribute to the mean. A bar chart with the mean as a dashed line is
    saved to ``auc_scores.png``.

    Args:
        predictions_df: DataFrame of predicted scores, one column per label.
        labels_df: DataFrame of true labels, one column per label.

    Returns:
        tuple: ``(mean_auc, auc_scores)`` where ``auc_scores`` maps label
        name to its AUC.
    """
    print("\nCalculating AUC scores...")
    auc_scores = {}

    for column in label_cols:
        scores = predictions_df[column].values
        truth = labels_df[column].values

        # Keep only the rows that carry a usable label.
        usable = truth != -1
        scores_kept = scores[usable]
        truth_kept = truth[usable]

        # AUC needs both classes present.
        if len(np.unique(truth_kept)) <= 1:
            continue

        column_auc = roc_auc_score(truth_kept, scores_kept)
        auc_scores[column] = column_auc
        print(f"{column}: {column_auc:.4f}")

    mean_auc = np.mean(list(auc_scores.values()))
    print(f"\nMean ROC AUC Score: {mean_auc:.4f}")

    # Bar chart of the per-label scores with the mean overlaid.
    plt.figure(figsize=(10, 6))
    plt.bar(auc_scores.keys(), auc_scores.values())
    plt.axhline(y=mean_auc, color='r', linestyle='--',
                label=f'Mean AUC: {mean_auc:.4f}')
    plt.xticks(rotation=45)
    plt.ylabel('AUC Score')
    plt.title('ROC AUC Scores by Label')
    plt.legend()
    plt.tight_layout()
    plt.savefig('auc_scores.png')
    plt.close()

    return mean_auc, auc_scores

def pr(y_i, y, x):
    """Return the add-one-smoothed per-feature total for one class.

    Sums the rows of ``x`` whose label equals ``y_i`` column by column, adds
    1 to every column total, and divides by the number of such rows plus 1.

    Args:
        y_i: Class value to select.
        y: Array of labels, one per row of ``x``.
        x: Feature matrix supporting boolean row indexing and ``sum(0)``.

    Returns:
        The smoothed column totals divided by the smoothed row count.
    """
    in_class = (y == y_i)
    column_totals = x[in_class].sum(0)
    return (column_totals + 1) / (in_class.sum() + 1)

def train_model(train_features, y):
    """Fit a logistic regression on log-ratio-scaled features.

    The per-feature log ratio ``r`` of ``pr(1, ...)`` to ``pr(0, ...)`` is
    computed from the training data, every feature column is multiplied by
    it, and a ``LogisticRegression(C=4)`` is fitted on the scaled matrix.

    Args:
        train_features: Training feature matrix exposing ``multiply``.
        y: Array of labels for one target column.

    Returns:
        tuple: ``(model, r)``, the fitted classifier and the log-ratio
        vector, which callers must apply to any features they predict on.
    """
    positive_rate = pr(1, y, train_features)
    negative_rate = pr(0, y, train_features)
    r = np.log(positive_rate / negative_rate)

    classifier = LogisticRegression(C=4)
    scaled_features = train_features.multiply(r)
    classifier.fit(scaled_features, y)
    return classifier, r

def main():
    """Run the whole pipeline: explore, featurise, train, predict, evaluate.

    Side effects: prints progress to stdout and writes the plots produced
    by the helper functions, ``submission.csv``, ``metrics_summary.txt``
    and ``analysis_results.txt`` to the working directory.
    """
    train, test, test_labels = load_and_prepare_data()

    # ---- Exploratory analysis on the training set ----
    analyze_text_examples(train)
    length_stats = analyze_comment_lengths(train)
    label_stats = analyze_label_distribution(train)

    create_general_wordclouds(train, label_cols)
    for label in label_cols:
        create_wordcloud_by_label(train, label, f'WordCloud for {label} Comments')

    correlation_matrix = analyze_correlations(train)

    # ---- Features ----
    # The fitted vectorizer is returned but not needed afterwards.
    train_features, test_features, _vectorizer = extract_features(train, test)

    # ---- One model per label; column i holds the P(label == 1) scores ----
    print("\nTraining models and making predictions...")
    predictions = np.zeros((len(test), len(label_cols)))

    for column_index, col in enumerate(label_cols):
        print(f'Training model for {col}...')
        model, r = train_model(train_features, train[col].values)
        # Test features must be scaled by the same ratio used in training.
        scaled_test = test_features.multiply(r)
        predictions[:, column_index] = model.predict_proba(scaled_test)[:, 1]

    # ---- Submission file ----
    submission = pd.DataFrame(predictions, columns=label_cols)
    submission['id'] = test['id']
    submission.to_csv('submission.csv', index=False)

    # ---- Evaluation ----
    mean_auc, column_scores = calculate_mean_column_wise_auc(submission, test_labels)

    print("\nCalculating comprehensive metrics...")
    metrics_results = calculate_metrics(submission, test_labels)
    visualize_all_metrics(metrics_results)
    save_metrics_summary(metrics_results)

    # ---- Plain-text dump of the collected statistics ----
    results = {
        'length_statistics': length_stats,
        'label_distribution': label_stats,
        'auc_scores': column_scores,
        'mean_auc': mean_auc,
        'correlation_matrix': correlation_matrix.to_dict()
    }

    with open('analysis_results.txt', 'w') as out_file:
        out_file.write("ANALYSIS RESULTS\n")
        out_file.write("=" * 50 + "\n\n")
        for section_name, section_value in results.items():
            out_file.write(f"{section_name.upper()}:\n")
            out_file.write(str(section_value))
            out_file.write("\n" + "=" * 50 + "\n")

    print("\nAnalysis complete! Check the generated files for visualizations and detailed results.")

# Run the full pipeline when this cell/script is executed directly,
# but not when the code is imported as a module.
if __name__ == "__main__":
    main()

Loading dataset...

Example Comments:
Example 1:
Explanation
Why the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27

Example 2:
"
And ... I really don't think you understand.  I came here and my idea was bad right away.  What kind of community goes ""you have bad ideas"" go away, instead of helping rewrite them.   "


Analyzing comment lengths...
Mean length: 394.07
Standard deviation: 590.72
Max length: 5000

Analyzing label distribution...

Label Distribution:
toxic: 9.58%
severe_toxic: 1.00%
obscene: 5.29%
threat: 0.30%
insult: 4.94%
identity_hate: 0.88%

Creating general toxic/non-toxic word clouds...

Analyzing label correlations...

Extracting features...




Training features shape: (159571, 425935)
Testing features shape: (153164, 425935)

Training models and making predictions...
Training model for toxic...
Training model for severe_toxic...
Training model for obscene...
Training model for threat...
Training model for insult...
Training model for identity_hate...

Calculating AUC scores...
toxic: 0.9664
severe_toxic: 0.9817
obscene: 0.9754
threat: 0.9915
insult: 0.9699
identity_hate: 0.9757

Mean ROC AUC Score: 0.9768

Calculating comprehensive metrics...

Metrics for toxic:
Precision: 0.6148
Recall: 0.7823
F1-score: 0.6885
Accuracy: 0.9326
AUC: 0.9664

Metrics for severe_toxic:
Precision: 0.3491
Recall: 0.3815
F1-score: 0.3646
Accuracy: 0.9924
AUC: 0.9817

Metrics for obscene:
Precision: 0.7199
Recall: 0.6852
F1-score: 0.7021
Accuracy: 0.9665
AUC: 0.9754

Metrics for threat:
Precision: 0.6015
Recall: 0.3791
F1-score: 0.4651
Accuracy: 0.9971
AUC: 0.9915

Metrics for insult:
Precision: 0.7366
Recall: 0.5515
F1-score: 0.6307
Accuracy: 0.96