In [26]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, to_date

# Создание SparkSession
spark = SparkSession.builder.appName("Read CSV Example").getOrCreate()

# Чтение CSV-файла
df = spark.read.csv("web_server_logs.csv", header=True, inferSchema=True)

# 1
ip_grouped = df.groupBy("ip").agg({"method": "count"}).withColumnRenamed("count(method)", "request_count")
print("Top 10 active IP-addresses:")
ip_grouped.show(10)

# 2
method_grouped = df.groupBy("method").agg({"method": "count"}).withColumnRenamed("count(method)", "method_count")
print("Request count by HTTP method:")
method_grouped.show()

# 3
errors = df.filter(df["response_code"] == "404").count()
print("404 responses: ", errors, "\n")

# 4
df = df.withColumn("timestamp", to_date(col("timestamp")))
date_grouped = df.groupBy("timestamp").agg({"response_size": "sum"}).withColumnRenamed(
    "sum(response_size)", "total_response_size").orderBy(col("timestamp"))
print("Total response size by day:")
date_grouped.show()

spark.stop()

Top 10 active IP-addresses:
+--------------+-------------+
|            ip|request_count|
+--------------+-------------+
|136.19.117.236|            1|
| 48.134.93.249|            1|
| 42.66.100.116|            1|
|   205.9.226.8|            1|
|25.189.170.104|            1|
|117.132.141.27|            1|
|  168.55.48.81|            1|
|   41.27.30.81|            1|
|116.52.163.180|            1|
| 152.247.37.68|            1|
+--------------+-------------+
only showing top 10 rows

Request count by HTTP method:
+------+------------+
|method|method_count|
+------+------------+
|  POST|       24941|
|DELETE|       25037|
|   PUT|       24922|
|   GET|       25100|
+------+------------+

404 responses:  24870 

Total response size by day:
+----------+-------------------+
| timestamp|total_response_size|
+----------+-------------------+
|2024-01-01|            1445099|
|2024-01-02|            1581598|
|2024-01-03|            1534305|
|2024-01-04|            1565388|
|2024-01-05|          