In [None]:
import pandas as pd
from datetime import datetime
import re
from collections import Counter

def clean_duration(duration):
    """
    Maps a raw suspension-duration value onto a coarse category

    Parameters:
    duration: Raw duration value; may be missing (None/NaN) or any object
        with a string representation

    Returns:
    str: 'Permanent', 'Temporary' or 'Unknown'
    """
    if pd.isna(duration):
        return 'Unknown'

    text = str(duration).lower()
    # Order matters: a value mentioning both words is classed as permanent.
    labels_by_keyword = (
        ('permanent', 'Permanent'),
        ('temporary', 'Temporary'),
    )
    for keyword, label in labels_by_keyword:
        if keyword in text:
            return label
    return 'Unknown'

def _count_keyword_matches(text, keywords):
    """
    Counts the non-missing entries of a text column that contain any keyword

    Parameters:
    text (pd.Series): Free-text values; missing entries are ignored
    keywords (list): Lowercase substrings to look for

    Returns:
    int: Number of entries containing at least one keyword
    """
    # Escape the keywords so they always match literally, even if one of them
    # ever contains a regex metacharacter such as '.' or '('.
    pattern = '|'.join(re.escape(keyword) for keyword in keywords)
    # astype(str) keeps an all-missing (float) column or a stray non-string
    # value from breaking the .str accessor.
    lowered = text.dropna().astype(str).str.lower()
    # int() so the result is a plain Python number rather than a numpy scalar.
    return int(lowered.str.contains(pattern).sum())


def analyze_moderation_cases(data):
    """
    Analyzes social media content moderation cases to identify patterns and trends

    The input is copied, so the caller's DataFrame is left unmodified.

    Parameters:
    data (pd.DataFrame): DataFrame containing moderation case data. The columns
        'Duration', 'Date', 'Reason for suspension', 'Description' and
        'Individual/account' are read.

    Returns:
    dict: Analysis results with the keys
        'total_cases' (int): number of rows
        'duration_breakdown' (dict): cleaned duration category -> count
        'cases_by_month' (dict): monthly period -> count
        'common_reasons' (list): one dict per reason category with at least
            one match, holding 'category', 'count' and 'percentage'
        'verified_accounts', 'political_figures', 'journalists' (int):
            number of descriptions matching each account-type keyword set
        'repeated_suspensions' (dict): account -> count, for accounts that
            appear more than once
    """
    # Clean and prepare data
    df = data.copy()
    df['Duration'] = df['Duration'].apply(clean_duration)

    # Dates that do not match the 'day month-name year' format become NaT
    df['Date'] = pd.to_datetime(df['Date'], format='%d %B %Y', errors='coerce')

    total_cases = len(df)

    # NOTE: keywords are matched as plain substrings, so a short keyword also
    # matches inside longer words (e.g. 'hate' inside 'whatever').
    reason_keywords = {
        'covid': ['covid', 'vaccine'],
        'harassment': ['harass', 'conduct'],
        'misinformation': ['misinform', 'fake news'],
        'hate_speech': ['hate', 'hateful'],
        'impersonation': ['impersonat'],
        'spam': ['spam', 'platform manipulation'],
        'violence': ['violence', 'violent']
    }

    account_patterns = {
        'verified': ['verified', 'official'],
        'political': ['politician', 'congressman', 'senator', 'representative'],
        'journalist': ['journalist', 'reporter', 'correspondent']
    }

    # Analyze common reasons; a case may count towards several categories
    reasons = df['Reason for suspension']
    common_reasons = []
    for category, keywords in reason_keywords.items():
        count = _count_keyword_matches(reasons, keywords)
        if count > 0:
            common_reasons.append({
                'category': category,
                'count': count,
                # count > 0 implies total_cases > 0, so this cannot divide by zero
                'percentage': (count / total_cases) * 100
            })

    # Identify patterns in account types
    descriptions = df['Description']

    # Find accounts with multiple suspensions
    account_counts = df['Individual/account'].value_counts()

    return {
        'total_cases': total_cases,
        'duration_breakdown': df['Duration'].value_counts().to_dict(),
        'cases_by_month': df.groupby(df['Date'].dt.to_period('M')).size().to_dict(),
        'common_reasons': common_reasons,
        'verified_accounts': _count_keyword_matches(descriptions, account_patterns['verified']),
        'political_figures': _count_keyword_matches(descriptions, account_patterns['political']),
        'journalists': _count_keyword_matches(descriptions, account_patterns['journalist']),
        'repeated_suspensions': account_counts[account_counts > 1].to_dict()
    }

def generate_summary_report(results):
    """
    Generates a detailed summary report from the analysis results

    Parameters:
    results (dict): Analysis results from analyze_moderation_cases. The keys
        'total_cases', 'duration_breakdown', 'common_reasons',
        'verified_accounts', 'political_figures', 'journalists' and
        'repeated_suspensions' are read.

    Returns:
    str: Formatted summary report, one entry per line
    """
    total_cases = results['total_cases']

    report = [
        "Content Moderation Analysis Summary",
        "=" * 35 + "\n",
        f"Total Cases Analyzed: {total_cases}\n",
        "Duration Breakdown:",
    ]
    for duration, count in results['duration_breakdown'].items():
        percentage = (count / total_cases) * 100
        report.append(f"- {duration}: {count} cases ({percentage:.1f}%)")

    # Most frequent reason first
    report.append("\nCommon Suspension Reasons:")
    for reason in sorted(results['common_reasons'], key=lambda x: x['count'], reverse=True):
        report.append(f"- {reason['category'].replace('_', ' ').title()}: "
                      f"{reason['count']} cases ({reason['percentage']:.1f}%)")

    report.append("\nAccount Type Analysis:")
    report.append(f"- Verified Accounts: {results['verified_accounts']}")
    report.append(f"- Political Figures: {results['political_figures']}")
    report.append(f"- Journalists: {results['journalists']}")

    if results['repeated_suspensions']:
        report.append("\nAccounts with Multiple Suspensions:")
        for account, count in results['repeated_suspensions'].items():
            # Account names can contain embedded line breaks, which would
            # split one bullet over several lines; collapse all whitespace
            # runs to single spaces.
            account_label = ' '.join(str(account).split())
            report.append(f"- {account_label}: {count} suspensions")

    return "\n".join(report)

In [None]:
def main(file_path='/content/Twitter_Suspensions.csv'):
    """
    Loads the suspension cases and prints one report per period

    Cases are split at 1 October 2022: one report covers cases dated before
    the cutoff, the other covers cases dated on or after it. Errors are
    reported on stdout and end the run early instead of raising.

    Parameters:
    file_path (str): Path of the CSV file to analyze

    Returns:
    None
    """
    try:
        data = pd.read_csv(file_path)
    except Exception as e:
        print(f"Error loading file: {e}")
        return

    # Without this check a missing column would escape as an uncaught KeyError.
    if 'Date' not in data.columns:
        print("Error loading file: required column 'Date' not found")
        return

    data['Date'] = pd.to_datetime(data['Date'], format='%d %B %Y', errors='coerce')

    # Rows whose date is missing or unparseable (NaT) compare False against
    # the cutoff in both directions, so they land in neither report. Say so
    # instead of dropping them silently.
    undated_cases = int(data['Date'].isna().sum())
    if undated_cases:
        print(f"Warning: {undated_cases} case(s) have a missing or unparseable "
              "date and are excluded from both reports.\n")

    cutoff_date = datetime(2022, 10, 1)
    pre_october_2022 = data[data['Date'] < cutoff_date]
    post_october_2022 = data[data['Date'] >= cutoff_date]

    print("Analyzing moderation cases...\n")
    try:
        pre_results = analyze_moderation_cases(pre_october_2022)
        post_results = analyze_moderation_cases(post_october_2022)
    except Exception as e:
        print(f"Error during analysis: {e}")
        return

    print("Generating summary reports...\n")
    try:
        pre_report = generate_summary_report(pre_results)
        post_report = generate_summary_report(post_results)
    except Exception as e:
        print(f"Error generating reports: {e}")
        return

    print("Report for cases before October 2022:\n")
    print(pre_report)
    print("\n" + "=" * 50 + "\n")
    print("Report for cases from October 2022 onward:\n")
    print(post_report)

if __name__ == "__main__":
    main()


Analyzing moderation cases...

Generating summary reports...

Report for cases before October 2022:

Content Moderation Analysis Summary

Total Cases Analyzed: 254

Duration Breakdown:
- Unknown: 104 cases (40.9%)
- Permanent: 100 cases (39.4%)
- Temporary: 50 cases (19.7%)

Common Suspension Reasons:
- Harassment: 31 cases (12.2%)
- Hate Speech: 21 cases (8.3%)
- Covid: 19 cases (7.5%)
- Spam: 16 cases (6.3%)
- Violence: 16 cases (6.3%)
- Misinformation: 15 cases (5.9%)
- Impersonation: 10 cases (3.9%)

Account Type Analysis:
- Verified Accounts: 0
- Political Figures: 11
- Journalists: 4

Accounts with Multiple Suspensions:
- Marjorie Taylor
Greene
(@mtgreenee): 4 suspensions
- Donald Trump
(@realDonaldTrump): 2 suspensions
- Charlie Kirk
(@charliekirk11): 2 suspensions
- Libs of TikTok
(@libsoftiktok): 2 suspensions
- Lee Hurst
(@LeeHurstComic): 2 suspensions


Report for cases from October 2022 onward:

Content Moderation Analysis Summary

Total Cases Analyzed: 30

Duration Breakdo