In [0]:
from pyspark.sql.functions import col
import re

In [0]:
# Read the Simple English Wikipedia data
wikiData = spark.read.text("/FileStore/tables/simplewiki_latest_pages_articles__xml.bz2")

In [0]:
# Extract links from the text function
def extract_links(text):
    link_pattern = r'\[\[(.+?)([|\]]|$)'
    return re.findall(link_pattern, text)

# Process the data using Spark RDD
links = wikiData.rdd.flatMap(lambda x: extract_links(x[0])) \
    .map(lambda x: (x[0].split('|')[0], 1)) \
    .reduceByKey(lambda a, b: a + b) \
    .map(lambda x: (x[0], x[1]))

# Convert RDD to DataFrame
linksDF = links.toDF(["page", "out_links"])

# Initialize PageRank
initialPageRanks = linksDF.withColumn("rank", col("out_links") * 0.15)

# Iteratively calculate PageRank
for _ in range(10):
    currentRanks = initialPageRanks.select("page", "rank")
    contributions = linksDF.join(currentRanks, "page") \
        .withColumn("contribution", col("rank") / col("out_links")) \
        .groupBy("page") \
        .sum("contribution") \
        .withColumnRenamed("sum(contribution)", "contribution")

    newPageRanks = linksDF.join(contributions, "page", "left_outer") \
        .fillna(0, subset=["contribution"]) \
        .withColumn("rank", col("contribution") * 0.85 + 0.15)
    initialPageRanks = newPageRanks

# Display top 10 PageRanks
initialPageRanks.orderBy(col("rank").desc()).show(10)

+--------------------+---------+------------------+------------------+
|                page|out_links|      contribution|              rank|
+--------------------+---------+------------------+------------------+
| ################...|        1|0.8031255956592774|0.8326567563103858|
|         Julpe River|        1|0.8031255956592774|0.8326567563103858|
|      Bath, Somerset|        1|0.8031255956592774|0.8326567563103858|
| code from templates|        1|0.8031255956592774|0.8326567563103858|
|   Cessna Citation I|        1|0.8031255956592774|0.8326567563103858|
|        Kurt Widmann|        1|0.8031255956592774|0.8326567563103858|
| Image:Mountain_o...|        1|0.8031255956592774|0.8326567563103858|
| Air Force Commen...|        1|0.8031255956592774|0.8326567563103858|
|     This templat...|        1|0.8031255956592774|0.8326567563103858|
|       Lisa Grabner |        1|0.8031255956592774|0.8326567563103858|
+--------------------+---------+------------------+------------------+
only s

In [0]:
from pyspark.sql.functions import col
import re

In [0]:
# Read the Simple English Wikipedia data
wikiData = spark.read.text("/FileStore/tables/simplewiki_latest_pages_articles__xml.bz2")

# Sample data: (URL, neighbors)
links = sc.parallelize([
    ('A', ['B', 'C']),
    ('B', ['C']),
    ('C', ['A']),
    ('D', ['C'])
]).partitionBy(4).persist()

# Initialize each URL's rank to 1.0
ranks = links.mapValues(lambda neighbors: 1.0)

# Number of iterations
for i in range(10):
    # Calculates URL contributions to the rank of other URLs
    contributions = links.join(ranks).flatMap(
        lambda url_urls_rank: 
            [(dest, url_urls_rank[1][1] / len(url_urls_rank[1][0])) for dest in url_urls_rank[1][0]]
    )

    # Update the ranks based on contributions
    ranks = contributions.reduceByKey(lambda a, b: a + b).mapValues(lambda rank: 0.15 + 0.85 * rank)

# Collect and print the final ranks
print(ranks.collect())



[('B', 0.6801633626023985), ('A', 1.2528717283018032), ('C', 1.26383931343652)]
