In [41]:
!pip install warcio pandas
!pip install newspaper3k lxml[html_clean]



In [42]:
import gzip
import io
import itertools
import re # Import re
import time

import pandas as pd
import requests
import tqdm # Import tqdm
import warcio
from newspaper import Article # Import Article

# Get September 2024 WARC paths list
url = "https://data.commoncrawl.org/crawl-data/CC-NEWS/2024/09/warc.paths.gz"

# Fetch the file. The body is read in full just below via response.content, so
# requesting a streamed response gains nothing; the timeout keeps the cell from
# hanging forever if the server stops responding.
response = requests.get(url, timeout=60)
response.raise_for_status()  # Raise an exception for bad responses (4xx or 5xx)

# Decompress in memory and print content
with gzip.GzipFile(fileobj=io.BytesIO(response.content)) as gz:
    for line in gz:
        print(line.decode('utf-8').strip()) # Print each line, removing leading/trailing spaces

crawl-data/CC-NEWS/2024/09/CC-NEWS-20240901011006-05863.warc.gz
crawl-data/CC-NEWS/2024/09/CC-NEWS-20240901040313-05864.warc.gz
crawl-data/CC-NEWS/2024/09/CC-NEWS-20240901061815-05865.warc.gz
crawl-data/CC-NEWS/2024/09/CC-NEWS-20240901082058-05866.warc.gz
crawl-data/CC-NEWS/2024/09/CC-NEWS-20240901100849-05867.warc.gz
crawl-data/CC-NEWS/2024/09/CC-NEWS-20240901115147-05868.warc.gz
crawl-data/CC-NEWS/2024/09/CC-NEWS-20240901132402-05869.warc.gz
crawl-data/CC-NEWS/2024/09/CC-NEWS-20240901145127-05870.warc.gz
crawl-data/CC-NEWS/2024/09/CC-NEWS-20240901161309-05871.warc.gz
crawl-data/CC-NEWS/2024/09/CC-NEWS-20240901174029-05872.warc.gz
crawl-data/CC-NEWS/2024/09/CC-NEWS-20240901191613-05873.warc.gz
crawl-data/CC-NEWS/2024/09/CC-NEWS-20240901210159-05874.warc.gz
crawl-data/CC-NEWS/2024/09/CC-NEWS-20240901230057-05875.warc.gz
crawl-data/CC-NEWS/2024/09/CC-NEWS-20240902013240-05876.warc.gz
crawl-data/CC-NEWS/2024/09/CC-NEWS-20240902035909-05877.warc.gz
crawl-data/CC-NEWS/2024/09/CC-NEWS-20240

In [43]:
# Specify WARC file path
warc_file_url = "https://data.commoncrawl.org/crawl-data/CC-NEWS/2024/09/CC-NEWS-20240923074837-06216.warc.gz"

# Use HEAD request to get headers only. A streamed GET that is never read or
# closed would keep a connection to the server open for no benefit.
# allow_redirects=True mirrors the default behaviour of requests.get().
response = requests.head(warc_file_url, allow_redirects=True, timeout=60)
response.raise_for_status()

# Get file size from headers and print (0 when the server sends no Content-Length)
file_size = int(response.headers.get('content-length', 0))
print(f"File size: {file_size} bytes")
print(f"File size: {file_size / (1024 * 1024):.2f} MB")  # Convert to MB

File size: 1072759398 bytes
File size: 1023.06 MB


In [44]:
# Stream a WARC file and tabulate the metadata of its 'response' records
def process_warc_file(warc_file_url):
    """Collect URL, date and content length for every 'response' record.

    The file is requested with ``stream=True`` and the raw response stream is
    handed to ``warcio.ArchiveIterator``, which yields one record at a time.

    Args:
        warc_file_url: URL of the WARC file to read.

    Returns:
        pandas.DataFrame with the columns 'URL', 'Date' and 'Content-Length',
        holding the 'WARC-Target-URI', 'WARC-Date' and 'Content-Length' values
        of the WARC record headers, one row per 'response' record.

    Raises:
        requests.HTTPError: if the server answers with an error status.
    """
    wanted_headers = ('WARC-Target-URI', 'WARC-Date', 'Content-Length')
    rows = []

    with requests.get(warc_file_url, stream=True) as response:
        response.raise_for_status()

        for record in warcio.ArchiveIterator(response.raw):
            # Only 'response' records are tabulated; every other type is skipped
            if record.rec_type != 'response':
                continue

            headers = record.rec_headers
            rows.append([headers.get_header(name) for name in wanted_headers])

    return pd.DataFrame(rows, columns=['URL', 'Date', 'Content-Length'])

# Process the WARC file referenced by warc_file_url and keep the resulting DataFrame in df.
# The bare `df` on the last line makes the notebook render the table as the cell output.
df = process_warc_file(warc_file_url)
df

Unnamed: 0,URL,Date,Content-Length
0,https://www.indiatvnews.com/sports/cricket/roh...,2024-09-23T07:48:37Z,218429
1,https://ilsaronno.it/2024/09/23/la-maternita-u...,2024-09-23T07:48:37Z,271976
2,https://www.dailymail.co.uk/wires/reuters/arti...,2024-09-23T07:48:37Z,527487
3,https://aspicts.substack.com/p/tech-giants-pus...,2024-09-23T07:48:37Z,214833
4,https://topick.hket.com/article/3831279/%E5%B0...,2024-09-23T07:48:37Z,137772
...,...,...,...
25493,https://webcatalog.io/pt/apps/tag/api-generati...,2024-09-23T09:13:39Z,128452
25494,https://www.parkiet.com/technologie/art4117331...,2024-09-23T09:13:39Z,506580
25495,https://www.sussexexpress.co.uk/sport/boxing/c...,2024-09-23T09:13:39Z,269726
25496,https://www.etvbharat.com/mr/!health-and-lifes...,2024-09-23T09:13:40Z,360289


In [45]:
# Function to process the WARC file and extract data
def process_warc_file(warc_file_url):
    """Collect URL, date and content length for every 'response' record.

    The archive is iterated directly from the streamed HTTP response. Reading
    ``response.content`` instead would buffer the complete file in memory
    before the first record could be parsed.

    Args:
        warc_file_url: URL of the WARC file to read.

    Returns:
        pandas.DataFrame with the columns 'URL', 'Date' and 'Content-Length',
        holding the 'WARC-Target-URI', 'WARC-Date' and 'Content-Length' values
        of the WARC record headers, one row per 'response' record.

    Raises:
        requests.HTTPError: if the server answers with an error status.
    """
    data = []
    with requests.get(warc_file_url, stream=True) as response:
        response.raise_for_status()

        # Process WARC records one at a time from the raw (undecoded) stream
        for record in warcio.ArchiveIterator(response.raw):
            if record.rec_type == 'response':

                # Extract URL, date and content length
                url = record.rec_headers.get_header('WARC-Target-URI')
                date = record.rec_headers.get_header('WARC-Date')
                content_length = record.rec_headers.get_header('Content-Length')

                # Store extracted info in 'data' list
                data.append([url, date, content_length])

    # Create DataFrame
    df = pd.DataFrame(data, columns=['URL', 'Date', 'Content-Length'])
    return df

# Run process_warc_file on warc_file_url and store the returned DataFrame in df.
# Ending the cell with the bare name `df` displays the table below the cell.
df = process_warc_file(warc_file_url)
df

Unnamed: 0,URL,Date,Content-Length
0,https://www.indiatvnews.com/sports/cricket/roh...,2024-09-23T07:48:37Z,218429
1,https://ilsaronno.it/2024/09/23/la-maternita-u...,2024-09-23T07:48:37Z,271976
2,https://www.dailymail.co.uk/wires/reuters/arti...,2024-09-23T07:48:37Z,527487
3,https://aspicts.substack.com/p/tech-giants-pus...,2024-09-23T07:48:37Z,214833
4,https://topick.hket.com/article/3831279/%E5%B0...,2024-09-23T07:48:37Z,137772
...,...,...,...
25493,https://webcatalog.io/pt/apps/tag/api-generati...,2024-09-23T09:13:39Z,128452
25494,https://www.parkiet.com/technologie/art4117331...,2024-09-23T09:13:39Z,506580
25495,https://www.sussexexpress.co.uk/sport/boxing/c...,2024-09-23T09:13:39Z,269726
25496,https://www.etvbharat.com/mr/!health-and-lifes...,2024-09-23T09:13:40Z,360289


In [46]:
# Helper function to process WARC file
def process_warc_file(warc_file_url, limit=1000):
    """Scan the start of a WARC file for English articles that mention Netflix.

    The HTTP response is streamed and iterated record by record. For each
    'response' record the payload is decoded as UTF-8, checked with a regex
    heuristic for an English ``lang`` attribute ahead of ``<head>``, parsed
    with newspaper's ``Article``, and kept when the extracted article text
    contains "netflix" (case-insensitive).

    Args:
        warc_file_url: URL of the WARC file to read.
        limit: Maximum number of WARC records to read. Records of every type
            count toward the limit, not only 'response' records or matching
            articles. ``None`` reads the whole file.

    Returns:
        pandas.DataFrame with the columns 'URL', 'Date', 'Content-Length',
        'Title' and 'News_Article', one row per matching article.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        ValueError: if ``limit`` is negative (raised by ``itertools.islice``).
    """
    data = []

    # Compiled once instead of being looked up again for every record.
    # Heuristic: a lang attribute whose value starts with "en", followed
    # (possibly across line breaks) by a literal <head> tag.
    english_before_head = re.compile(r'lang\s*=\s*[\'"]?en[\'"]?[\s\S]*?<head>', re.IGNORECASE)
    netflix_mention = re.compile(r'netflix', re.IGNORECASE)

    with requests.get(warc_file_url, stream=True) as response:
        response.raise_for_status()

        # islice stops after exactly `limit` records. The earlier manual counter
        # only broke out once count exceeded the limit, i.e. after limit + 1 records.
        records = itertools.islice(warcio.ArchiveIterator(response.raw), limit)

        # Wrap the iterator with tqdm to show progress up to the limit
        for record in tqdm.tqdm(records, total=limit, desc="Processing records"):
            if record.rec_type != 'response':
                continue

            url = record.rec_headers.get_header('WARC-Target-URI')
            date = record.rec_headers.get_header('WARC-Date')
            content_length = record.rec_headers.get_header('Content-Length')

            try:
                # errors='ignore' drops undecodable bytes, so decoding itself never
                # raises UnicodeDecodeError and needs no dedicated handler
                html_content = record.content_stream().read().decode('utf-8', 'ignore')

                # Skip pages that do not look English
                if not english_before_head.search(html_content):
                    continue

                # Extract title and article content using newspaper3k
                article = Article(url, language='en')
                article.download(input_html=html_content)
                article.parse()
                title = article.title
                news_article = article.text

                # Keep only articles whose text mentions "netflix"
                if news_article and netflix_mention.search(news_article):
                    data.append([url, date, content_length, title, news_article])

            # Best effort: report the failing page and carry on with the next record
            except Exception as e:
                print(f"Error extracting article from {url}: {e}")

    # Create DataFrame
    df = pd.DataFrame(data, columns=['URL', 'Date', 'Content-Length', 'Title', 'News_Article'])
    return df

In [47]:
# Record the start time. perf_counter() is a monotonic clock, so the measured
# duration cannot be distorted by system clock adjustments.
start_time = time.perf_counter()

# Scan only the start of the archive (limit=5000 WARC records of any type),
# keeping English pages whose extracted article text mentions "netflix"
df = process_warc_file(warc_file_url, limit=5000)

# Calculate and print total processing time
end_time = time.perf_counter()
processing_time = end_time - start_time
print(f"\nTotal processing time: {processing_time:.2f} seconds")

# Display DataFrame
df

Processing records: 100%|██████████| 5000/5000 [01:35<00:00, 52.43it/s]


Total processing time: 95.67 seconds





Unnamed: 0,URL,Date,Content-Length,Title,News_Article
0,https://www.indiatvnews.com/technology/news/ip...,2024-09-23T07:48:43Z,225423,"iPhone, Mac and iPad must be updated soon, as ...","Follow us on Image Source : FILE iPhone, Mac a..."
1,https://www.express.co.uk/news/royal/1951925/p...,2024-09-23T07:53:11Z,201548,Prince Harry and Meghan Markle 'feeling wrath ...,The pair were absent from a post-Emmy party th...
