In [1]:
from pyspark.sql import SparkSession

In [2]:
# We create a spark session and use the spark context within the spark session because we want both dataframes and rdds

ss = SparkSession.builder.appName("DataProc").config("spark.driver.memory", "32g").config("spark.executor.memory", "32g").config("spark.executor.memoryOverhead", "32g").getOrCreate()
sc = ss.sparkContext

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/12/03 19:35:50 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# we load up an rdd from the paths text file
paths_rdd = sc.textFile("data/wet.paths")

In [4]:
# we take approximately sample_size paths from the samples rdd. path contains like 30k crawls so we don't want huge samples
sample_size = 1
sampled_paths = paths_rdd.sample(withReplacement=False, fraction=sample_size/paths_rdd.count(), seed=0)

                                                                                

In [5]:
# this is just something we need to prepend to every path so we can create it into a link to download
url_head = "https://data.commoncrawl.org/"

In [6]:
# now we have an rdd of just urls
urls_rdd = sampled_paths.map(lambda x: url_head + x)

In [7]:
urls_rdd.take(1)

['https://data.commoncrawl.org/crawl-data/CC-MAIN-2025-47/segments/1762439344161.47/wet/CC-MAIN-20251115170955-20251115200955-00990.warc.wet.gz']

In [8]:
# we use requests to get a http response from the common crawl server
def get_request(url):
    import requests
    response = requests.get(url, stream=True)
    if response.status_code == 200:
    # Request was successful
        return response
    else:
        return None

In [9]:
responses_rdd = urls_rdd.map(get_request)

In [10]:
# we convert the responses into gzip files in memory
def get_zipped(response):
    import io
    return io.BytesIO(response.content)

In [11]:
zipped_files = responses_rdd.map(get_zipped)

In [12]:
zipped_files.take(1)

                                                                                

[<_io.BytesIO at 0x14e959a71810>]

In [13]:
# we unzip the zipped files also within memory
def unzip(zipped):
    import gzip
    with gzip.GzipFile(fileobj=zipped) as decompressed_file:
        return decompressed_file.read()

In [14]:
unzipped_rdd = zipped_files.map(unzip)

In [15]:
type(unzipped_rdd.take(1)[0])

                                                                                

bytes

In [16]:
# we convert the unzipped bytes into usable strings 
str_rdd = unzipped_rdd.map(lambda x: x.decode("utf-8"))

In [19]:
# as you can see this is a plaintext snippet of the plaintext WET file. we need to process this string rdd to do our analysis.
# we need to strip away the description data and access just the text. ideally we get rid of line breaks and stuff like that too.
# then throw it into language detection, lda, and so on
print(str_rdd.take(1)[0][:10000])

                                                                                

WARC/1.0
WARC-Type: warcinfo
WARC-Date: 2025-11-21T04:06:14Z
WARC-Filename: CC-MAIN-20251115170955-20251115200955-00990.warc.wet.gz
WARC-Record-ID: <urn:uuid:ab28ab8c-7598-4b9c-8476-fa456e97b558>
Content-Type: application/warc-fields
Content-Length: 372

Software-Info: ia-web-commons.3.0.1-SNAPSHOT-20251103031243
Extracted-Date: Fri, 21 Nov 2025 04:06:14 GMT
robots: checked via crawler-commons 1.6-SNAPSHOT (https://github.com/crawler-commons/crawler-commons)
isPartOf: CC-MAIN-2025-47
operator: Common Crawl Admin (info@commoncrawl.org)
description: Wide crawl of the web for November 2025
publisher: Common Crawl



WARC/1.0
WARC-Type: conversion
WARC-Target-URI: http://0555st.cn/Pro/59.html
WARC-Date: 2025-11-15T19:07:00Z
WARC-Record-ID: <urn:uuid:89fc2355-4e3f-4f31-8c9f-fa9fdad37c7c>
WARC-Refers-To: <urn:uuid:cb89bb7e-86bf-4c5b-b989-425af8e33cec>
WARC-Block-Digest: sha1:RJKU4E3P7RJB25I436A3SOX25NNZS5D5
WARC-Identified-Content-Language: zho
Content-Type: text/pl