In [2]:
!pip install faker



In [3]:
import csv
from faker import Faker
import random

fake = Faker()

num_records = 100000

http_methods = ['GET', 'POST', 'PUT', 'DELETE']
response_codes = [200, 301, 404, 500]

file_path = "web_server_logs.csv"

with open(file_path, mode='w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['ip', 'timestamp', 'method', 'url', 'response_code', 'response_size'])

    for _ in range(num_records):
        ip = fake.ipv4()
        timestamp = fake.date_time_this_year().isoformat()
        method = random.choice(http_methods)
        url = fake.uri_path()
        response_code = random.choice(response_codes)
        response_size = random.randint(100, 10000)

        writer.writerow([ip, timestamp, method, url, response_code, response_size])

print(f"Сгенерировано {num_records} записей и сохранено в {file_path}")

Сгенерировано 100000 записей и сохранено в web_server_logs.csv


In [28]:
from types import coroutine
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, sum as _sum, count, date_format
from pyspark.sql.functions import to_timestamp

spark = SparkSession.builder.appName("final_job_spark").getOrCreate()

df = spark.read.csv("web_server_logs.csv", header=True, inferSchema=True)
count_url_df = df.groupBy(col("ip")).agg(count(col("url")).alias("request_count")).orderBy(col("request_count").desc()).limit(10)
count_method_df = df.groupBy(col("method")).agg(count(col("method")).alias("method_count"))
method_404_df = df.filter(col("response_code") == 404)
response_size_df = df.withColumn("date", date_format(col("timestamp"), "yyyy-MM-dd")).groupBy(col("date")).agg(count(col("response_size")).alias("total_response_size")).orderBy(col("date"))

print("Top 10 active IP addresses:")
count_url_df.show()
print("Request count by HTTP method:")
count_method_df.show()
print(f"Number of 404 response codes: {method_404_df.count()}\n")
print("Total response size by day:")
response_size_df.show()




spark.stop()

Top 10 active IP addresses:
+---------------+-------------+
|             ip|request_count|
+---------------+-------------+
|  164.74.253.98|            2|
|   43.127.11.54|            1|
|  9.128.226.181|            1|
|  100.62.59.178|            1|
|164.190.156.234|            1|
|   202.2.221.68|            1|
|   78.2.215.141|            1|
| 103.131.61.226|            1|
|  223.152.34.61|            1|
| 192.158.79.187|            1|
+---------------+-------------+

Request count by HTTP method:
+------+------------+
|method|method_count|
+------+------------+
|  POST|       25102|
|DELETE|       24929|
|   PUT|       25073|
|   GET|       24896|
+------+------------+

Number of 404 response codes: 24933

Total response size by day:
+----------+-------------------+
|      date|total_response_size|
+----------+-------------------+
|2025-01-01|               1137|
|2025-01-02|               1189|
|2025-01-03|               1103|
|2025-01-04|               1165|
|2025-01-05|        