# Twitter News Scraper

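This notebook demonstrates how to scrape recent tweets from a list of Twitter accounts using the ntscraper library. Requests go through public Nitter instances; if one instance fails or returns nothing, the next one is tried, and the collected tweets are saved to a CSV file.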

In [None]:
# Install required packages
!pip install ntscraper pandas

In [None]:
# Import required libraries
import logging
import time
from typing import Any, Dict, List, Optional

import pandas as pd
from ntscraper import Nitter

# Logging setup: emit INFO and above, each record prefixed with its
# timestamp and severity level.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Logger named after the current module namespace.
logger = logging.getLogger(__name__)

In [None]:
def scrape_twitter_users(
    usernames: List[str],
    tweets_per_user: int = 100,
    delay: int = 5,
    instances: Optional[List[str]] = None,
) -> List[Dict[str, Any]]:
    """
    Scrape tweets for each username, falling back across Nitter instances.

    For every username the instances are tried in order; the first one that
    returns a non-empty ``'tweets'`` list wins and the remaining instances
    are skipped. Errors raised while talking to an instance are printed and
    the next instance is tried, so one failing user or instance never
    aborts the whole run.

    Args:
        usernames: Twitter usernames to scrape. A single string is treated
            as one username rather than being iterated character by
            character.
        tweets_per_user: Number of tweets to request per user.
        delay: Seconds to wait between consecutive users. Values <= 0
            disable the wait.
        instances: Nitter instance URLs to try, in order. ``None`` or an
            empty list selects the built-in default list.

    Returns:
        A flat list with the tweet dictionaries of all users, in the order
        the users were processed. Users for which every instance failed
        contribute nothing.
    """
    # A bare string would otherwise be scraped one character at a time.
    if isinstance(usernames, str):
        usernames = [usernames]
    else:
        # Materialise so len() and positional checks work for any iterable.
        usernames = list(usernames)

    # Default Nitter instances (fallback options), used when none are given.
    nitter_instances = list(instances) if instances else [
        "https://nitter.net",
        "https://nitter.privacydev.net",
        "https://nitter.unixfox.eu",
        "https://nitter.kavin.rocks",
    ]

    all_tweets: List[Dict[str, Any]] = []
    last_position = len(usernames) - 1

    for position, username in enumerate(usernames):
        print(f"\n🔍 Scraping tweets from user: {username}")

        # Try different instances if one fails
        success = False
        for i, instance in enumerate(nitter_instances):
            try:
                print(f"  Trying instance {i+1}/{len(nitter_instances)}: {instance}")

                # Initialize Nitter scraper
                scraper = Nitter(log_level=1, skip_instance_check=True)

                # Get tweets for the current user
                tweets = scraper.get_tweets(username, mode="user", number=tweets_per_user, instance=instance)

                if tweets and 'tweets' in tweets and tweets['tweets']:
                    all_tweets.extend(tweets['tweets'])
                    print(f"  ✅ Successfully scraped {len(tweets['tweets'])} tweets from {username}")
                    success = True
                    break

                print(f"  ⚠️ No tweets found for {username} using {instance}")

            except Exception as e:
                # Best effort by design: report the failure and move on to
                # the next instance instead of aborting the run.
                print(f"  ❌ Error with {instance}: {str(e)}")

        if not success:
            print(f"  💥 Failed to scrape tweets from {username} with all instances")

        # Be respectful between users, but not after the final one. The
        # check is positional so duplicate usernames still get their delay,
        # and a non-positive delay is skipped instead of reaching
        # time.sleep (which rejects negative values).
        if position < last_position and delay > 0:
            print(f"  ⏳ Waiting {delay} seconds before next request...")
            time.sleep(delay)

    return all_tweets

In [None]:
# Accounts whose tweets will be scraped
usernames = [
    'alanhenney',
    'OCCRP',
    'NOELreports',
    'CBSEveningNews',
    'CrimeChatt',
    'phivolcs_dost',
]

# Echo the configuration back as a numbered list
print(f"📋 Configured to scrape from {len(usernames)} users:")
for rank, handle in enumerate(usernames, start=1):
    print(f"  {rank}. @{handle}")

In [None]:
# Kick off the scrape for every configured account
print("🚀 Starting Twitter scraping process...\n")

try:
    all_tweets = scrape_twitter_users(
        usernames,
        tweets_per_user=100,
        delay=5,
    )
    print(f"\n📊 Scraping completed! Total tweets collected: {len(all_tweets)}")
except Exception as exc:
    # Report the failure and leave an empty result behind so that
    # `all_tweets` is always defined after this cell.
    print(f"💥 An error occurred during the scraping process: {exc}")
    all_tweets = []

In [None]:
# Process and save the data
if all_tweets:
    # Convert to DataFrame (one row per tweet dictionary)
    df = pd.DataFrame(all_tweets)

    print(f"📈 DataFrame created with shape: {df.shape}")
    print(f"📋 Available columns: {list(df.columns)}")

    # Display first few rows
    print("\n🔍 First 3 tweets:")
    display(df.head(3))

    # Save to CSV
    output_file = "dataset_crime_2.csv"
    df.to_csv(output_file, index=False)

    print(f"\n💾 Data saved to {output_file}")

    # Show some statistics
    print("\n📊 Data Statistics:")
    print(f"  • Total tweets: {len(df)}")
    if 'user' in df.columns:
        # ntscraper reports each tweet's author as a dict (name, username,
        # ...). Dicts are unhashable, so nunique()/value_counts() on the raw
        # column raise TypeError. Reduce every author to a plain label
        # first; values that are not dicts are counted as they are.
        authors = df['user'].map(
            lambda author: (author.get('username') or author.get('name') or str(author))
            if isinstance(author, dict)
            else author
        )
        print(f"  • Unique users: {authors.nunique()}")
        print(f"  • Tweets per user:")
        user_counts = authors.value_counts()
        for user, count in user_counts.items():
            print(f"    - {user}: {count} tweets")

else:
    print("⚠️ No tweets were scraped successfully. Please check the usernames and try again.")

In [None]:
# Smoke-test the scraper against a single account (for debugging)
test_username = "CBSEveningNews"  # Swap in another handle to test a different user

print(f"🧪 Testing single user scraping for: @{test_username}")

try:
    test_tweets = scrape_twitter_users([test_username], tweets_per_user=10, delay=2)

    if not test_tweets:
        print("❌ Test failed - no tweets scraped")
    else:
        test_df = pd.DataFrame(test_tweets)
        print(f"✅ Test successful! Scraped {len(test_tweets)} tweets")
        print(f"📋 Sample tweet data:")
        # Dump every field of the first tweet, clipped to 100 characters
        sample_tweet = test_tweets[0]
        for field, raw in sample_tweet.items():
            rendered = str(raw)
            suffix = '...' if len(rendered) > 100 else ''
            print(f"  {field}: {rendered[:100]}{suffix}")

except Exception as exc:
    print(f"❌ Test failed with error: {exc}")