In [2]:
import os
import sys
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import pyspark.sql.types as t

In [3]:
os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable
os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages com.databricks:spark-xml_2.12:0.17.0 pyspark-shell'

In [4]:
spark = SparkSession.builder.getOrCreate()
spark

In [5]:
# Загружаем все посты
all_posts = spark.read.format("com.databricks.spark.xml").option("rowTag", "row").option("timestampFormat", "y/M/d H:m:s").load("posts_sample.xml")

In [6]:
print("Схема--------------------------------------------------------------------")
all_posts.printSchema()
print("Количество элементов-----------------------------------------------------")
print(all_posts.count())
print("Первый элемент-----------------------------------------------------------")
all_posts.show(1)

Схема--------------------------------------------------------------------
root
 |-- _AcceptedAnswerId: long (nullable = true)
 |-- _AnswerCount: long (nullable = true)
 |-- _Body: string (nullable = true)
 |-- _ClosedDate: timestamp (nullable = true)
 |-- _CommentCount: long (nullable = true)
 |-- _CommunityOwnedDate: timestamp (nullable = true)
 |-- _CreationDate: timestamp (nullable = true)
 |-- _FavoriteCount: long (nullable = true)
 |-- _Id: long (nullable = true)
 |-- _LastActivityDate: timestamp (nullable = true)
 |-- _LastEditDate: timestamp (nullable = true)
 |-- _LastEditorDisplayName: string (nullable = true)
 |-- _LastEditorUserId: long (nullable = true)
 |-- _OwnerDisplayName: string (nullable = true)
 |-- _OwnerUserId: long (nullable = true)
 |-- _ParentId: long (nullable = true)
 |-- _PostTypeId: long (nullable = true)
 |-- _Score: long (nullable = true)
 |-- _Tags: string (nullable = true)
 |-- _Title: string (nullable = true)
 |-- _ViewCount: long (nullable = true)

Кол

In [7]:
# Оставляем посты созданные в нужный промежуток времени
dates = ("2010-01-01", "2020-12-31")
posts = all_posts.where(all_posts._CreationDate.between(*dates))

In [8]:
print("Количество элементов-----------------------------------------------------")
print(posts.count())

Количество элементов-----------------------------------------------------
44419


In [9]:
# Загружаем языки
langs = spark.read.format("csv").option("header", "true").load("programming-languages.csv")

In [10]:
print("Схема--------------------------------------------------------------------")
langs.printSchema()
print("Количество элементов-----------------------------------------------------")
langs.count()

Схема--------------------------------------------------------------------
root
 |-- name: string (nullable = true)
 |-- wikipedia_url: string (nullable = true)

Количество элементов-----------------------------------------------------


700

In [11]:
# Определяем функцию конвертации названия в тег
spark.udf.register("to_tag", lambda s: f"<{s}>", t.StringType())

<function __main__.<lambda>(s)>

In [12]:
# Вычисляем число упоминаний языков в разные года
lang_posts = posts.select(
    F.year(posts._CreationDate).alias("year"),
    posts._Tags.alias("tags")
).join(
    langs.select(langs.name.alias("lang"))
).where(
    F.col("tags").contains(F.call_function("to_tag", F.lower(F.col("lang"))))
).groupBy(
    F.col("year"), F.col("lang")
).agg(
    F.count(F.col("lang")).alias("ref_count")
).orderBy(F.col("year"), F.col("ref_count").desc())

In [13]:
# Создаем отчет из 10 наиболее популярных языков в определенный год
report = lang_posts.limit(10)
for year in range(2011, 2021):
  report = report.union(lang_posts.where(lang_posts.year == year).limit(10))

report.show(100)

+----+-----------+---------+
|year|       lang|ref_count|
+----+-----------+---------+
|2010|       Java|       52|
|2010|        PHP|       46|
|2010| JavaScript|       44|
|2010|     Python|       26|
|2010|Objective-C|       23|
|2010|          C|       20|
|2010|       Ruby|       12|
|2010|     Delphi|        8|
|2010|          R|        3|
|2010|AppleScript|        3|
|2011|        PHP|      102|
|2011|       Java|       93|
|2011| JavaScript|       83|
|2011|     Python|       37|
|2011|Objective-C|       34|
|2011|          C|       24|
|2011|       Ruby|       20|
|2011|       Perl|        9|
|2011|     Delphi|        8|
|2011|       Bash|        7|
|2012|        PHP|      154|
|2012| JavaScript|      132|
|2012|       Java|      124|
|2012|     Python|       69|
|2012|Objective-C|       45|
|2012|          C|       27|
|2012|       Ruby|       27|
|2012|       Bash|       10|
|2012|          R|        9|
|2012|     MATLAB|        6|
|2013|        PHP|      198|
|2013| JavaScr

In [14]:
# Сохраняем отчет
report.write.parquet("report.parquet")