In [17]:
import os
import logging
from pyspark.sql import SparkSession, functions as F
from pyspark.sql.types import FloatType
from logging import Logger

import pandas as pd
from pyspark.sql import DataFrame, SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import pandas_udf
from pyspark.sql.types import DoubleType, StringType
from timezonefinder import TimezoneFinder
from pyspark.sql.window import Window
import pandas as pd
import numpy as np

In [18]:
# DEV constants: locations inside a personal user directory.
GEO_DIR = "/user/solovyovyu/geo.csv"  # city reference CSV passed to read_geo
# EVENTS_DIR = "/user/solovyovyu/data/geo/events"  # presumably a DEV copy of the events; disabled in favour of PROD below
OUT_PATH = "/user/solovyovyu/analytics"  # root directory the resulting mart is written under

# PROD constants
EVENTS_DIR = "/user/master/data/geo/events"  # parquet events passed to read_events

In [19]:
def setup_logging():
    """
    Configure the root logger through ``logging.basicConfig`` (INFO level,
    "time - logger name - message" format) and return this module's logger.
    """
    logging.basicConfig(
        format="%(asctime)s - %(name)s - %(message)s",
        level=logging.INFO,
    )
    module_logger = logging.getLogger(__name__)
    return module_logger


logger = setup_logging()

In [20]:
def get_timezone(lat, lng):
    """
    Look up the timezone name for a coordinate pair.

    Returns ``None`` (after printing a notice) when either coordinate is
    missing; otherwise returns whatever ``TimezoneFinder.timezone_at`` yields.
    A fresh ``TimezoneFinder`` is built on every call.
    """
    coordinates_known = lat is not None and lng is not None
    if coordinates_known:
        finder = TimezoneFinder()
        return finder.timezone_at(lat=lat, lng=lng)

    print("lat or lng is not defined")
    return None

# Row-wise Spark UDF wrapper around get_timezone; the result column is a string.
get_timezone_udf = F.udf(get_timezone, StringType())

def read_events(event_type: str, events_dir: str, spark: SparkSession, logger: Logger) -> DataFrame:
    """
    Load parquet events from ``events_dir`` and keep only rows of ``event_type``.

    The returned frame has the columns:
      * ``message_id`` - taken from ``event.message_id``;
      * ``user_id``    - first non-null of ``event.message_from``,
        ``event.reaction_from`` and ``event.user``;
      * ``datetime``   - first non-null of ``event.message_ts`` and
        ``event.datetime``;
      * ``lat`` and ``lon`` - passed through unchanged.

    Any exception is reported through ``logger`` and then re-raised.
    """
    try:
        all_events = spark.read.parquet(events_dir)
        wanted_events = all_events.where(F.col("event_type") == event_type)

        # The acting user is stored in a different field depending on the event.
        acting_user = F.coalesce(
            F.col("event.message_from"),
            F.col("event.reaction_from"),
            F.col("event.user"),
        ).alias("user_id")
        event_time = F.coalesce(
            F.col("event.message_ts"),
            F.col("event.datetime"),
        ).alias("datetime")

        selected_events = wanted_events.select(
            F.col("event.message_id"),
            acting_user,
            event_time,
            "lat",
            "lon",
        )
        logger.info(f"Events {event_type} are read from {events_dir}.")
        return selected_events

    except Exception as e:
        logger.error(f"Error while reading events: {e}")
        raise


def read_geo(geo_dir: str, spark: SparkSession, logger: Logger) -> DataFrame:
    """
    Load the ``;``-delimited city CSV (with header) from ``geo_dir``.

    ``lat`` and ``lng`` have decimal commas replaced by dots and are cast to
    double, a ``timezone`` column is derived from them with
    ``get_timezone_udf``, and the coordinates are finally renamed to
    ``geo_lat`` / ``geo_lon``.

    Any exception is reported through ``logger`` and then re-raised.
    """
    try:
        cities = spark.read.options(delimiter=";", header=True).csv(geo_dir)

        # Normalise "12,34" -> "12.34" so the cast to double succeeds.
        for coordinate in ("lat", "lng"):
            as_double = F.regexp_replace(coordinate, ",", ".").cast(DoubleType())
            cities = cities.withColumn(coordinate, as_double)

        cities = cities.withColumn("timezone", get_timezone_udf(F.col("lat"), F.col("lng")))
        cities = cities.withColumnRenamed("lat", "geo_lat").withColumnRenamed("lng", "geo_lon")

        logger.info(f"Geo data is read from {geo_dir}.")
        return cities

    except Exception as e:
        logger.error(f"Error while reading geo data: {e}")
        raise


def haversine(lat1, lon1, lat2, lon2, radius=6371):
    """
    Great-circle distance between two points given in decimal degrees.

    Works element-wise on scalars, NumPy arrays and pandas Series.

    Args:
        lat1, lon1: latitude / longitude of the first point, in degrees.
        lat2, lon2: latitude / longitude of the second point, in degrees.
        radius: sphere radius; the result is expressed in the same unit.
            Defaults to 6371, the Earth's radius in kilometres.

    Returns:
        The distance(s) along the sphere surface, in the unit of ``radius``.
    """
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1

    a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2) ** 2
    # Floating-point rounding can push `a` marginally above 1 for (near-)antipodal
    # points, which would make sqrt(1 - a) NaN; clamp it to the valid range.
    a = np.clip(a, 0.0, 1.0)
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))

    return radius * c

@pandas_udf(DoubleType())
def haversine_udf(lat1: pd.Series, lon1: pd.Series, lat2: pd.Series, lon2: pd.Series) -> pd.Series:
    """Pandas UDF that applies ``haversine`` to whole column batches and returns doubles."""
    distances = haversine(lat1, lon1, lat2, lon2)
    return distances


def add_distance(events_df: DataFrame, geo_df: DataFrame, logger: Logger) -> DataFrame:
    """
    Pair every event with every city and add the distance between them.

    ``geo_df`` is marked for broadcast and cross-joined with ``events_df``, so
    the result holds one row per (event, city) combination. The ``distance``
    column is computed by ``haversine_udf`` from the event coordinates
    (``lat``/``lon``) and the city coordinates (``geo_lat``/``geo_lon``).

    ``logger`` is accepted but not used here.
    """
    output_columns = [
        "user_id",
        "message_id",
        "datetime",
        "lat",
        "lon",
        "geo_lat",
        "geo_lon",
        "distance",
        "city",
        "timezone",
    ]

    event_to_city_distance = haversine_udf(
        F.col("lat"), F.col("lon"), F.col("geo_lat"), F.col("geo_lon")
    )

    every_event_city_pair = events_df.join(F.broadcast(geo_df), how="cross")

    return (
        every_event_city_pair
        .withColumn("distance", event_to_city_distance)
        .select(*output_columns)
    )


def add_closest_city(df: DataFrame, logger: Logger) -> DataFrame:
    """
    Keep, for every event, only the city with the smallest distance.

    Args:
        df: one row per (event, city) pair with a ``distance`` column, as
            produced by ``add_distance``.
        logger: accepted for signature consistency; not used here.

    Returns:
        One row per (``message_id``, ``user_id``) with the columns
        ``message_id``, ``user_id``, ``datetime``, ``city`` and ``timezone``.
        Events whose distance could not be computed are dropped.
    """
    # "city" is a secondary sort key so that exact distance ties resolve
    # deterministically instead of depending on partition order.
    window = Window.partitionBy("message_id", "user_id").orderBy("distance", "city")

    result_df = (
        df
        # A null distance means the event had no usable coordinates. Nulls sort
        # first in ascending order, so without this filter such events would be
        # assigned an arbitrary city.
        .filter(F.col("distance").isNotNull())
        .withColumn("rank", F.row_number().over(window))
        .filter(F.col("rank") == 1)
        .select(
            "message_id",
            "user_id",
            "datetime",
            "city",
            "timezone"
        )
    )

    return result_df



In [21]:
# Create (or reuse) the Spark session for this notebook.
# NOTE(review): master "local" runs Spark with a single worker thread - confirm
# this is intended for the cross join over the PROD events.
spark = (
    SparkSession.builder
    .appName("Mart User")
    .master("local")
    .getOrCreate()
)

In [22]:
# Load the city reference table (coordinates as doubles plus a timezone column).
geo_df = read_geo(GEO_DIR, spark, logger)

2025-02-12 16:05:32,657 - __main__ - Geo data is read from /user/solovyovyu/geo.csv.


In [23]:
# Load only the events whose event_type is "message".
message_df = read_events("message", EVENTS_DIR, spark, logger)

25/02/12 16:05:37 WARN SharedInMemoryCache: Evicting cached table partition metadata from memory due to size constraints (spark.sql.hive.filesourcePartitionFileCacheSize = 262144000 bytes). This may impact query planning performance.
2025-02-12 16:05:37,971 - __main__ - Events message are read from /user/master/data/geo/events.


In [24]:
# Cross-join every message with every city and compute the distance of each pair.
messages_with_distance = add_distance(message_df, geo_df, logger)

In [25]:
# Reduce to one row per (message_id, user_id): the city with the smallest distance.
messages_with_city = add_closest_city(messages_with_distance, logger)

In [26]:
# Home city: the city of the user's most recent continuous stay that lasted at
# least MIN_HOME_STAY_DAYS days. A "continuous stay" is a run of consecutive
# messages (in time order) sent from the same city; its duration is the number
# of days between the first and the last message of the run.
# NOTE(review): confirm this definition of a stay against the mart specification.
MIN_HOME_STAY_DAYS = 27

# The window must span the whole user timeline. Partitioning by city as well
# would make lag("city") always equal to the current city, so a city change
# could never be detected. message_id only breaks timestamp ties.
window = Window.partitionBy("user_id").orderBy("datetime", "message_id")
# Explicit row frame: the default range frame would merge rows with equal
# timestamps when computing the running sum.
running_window = window.rowsBetween(Window.unboundedPreceding, Window.currentRow)

stays_df = (
    messages_with_city
    .withColumn("prev_city", F.lag("city").over(window))
    # 1 on the first message of a user and whenever the city changes, else 0.
    .withColumn(
        "is_new_city",
        F.when(
            F.col("prev_city").isNull() | (F.col("city") != F.col("prev_city")), 1
        ).otherwise(0),
    )
    # The running sum of the change flags numbers the consecutive stays.
    .withColumn("group_id", F.sum("is_new_city").over(running_window))
    .groupBy("user_id", "group_id", "city")
    .agg(
        F.min("datetime").alias("stay_start"),
        F.max("datetime").alias("stay_end"),
    )
    .withColumn("stay_duration", F.datediff(F.col("stay_end"), F.col("stay_start")))
)

home_city_df = (
    stays_df
    .filter(F.col("stay_duration") >= MIN_HOME_STAY_DAYS)
    .groupBy("user_id")
    # max over a struct compares field by field, so this deterministically
    # picks the qualifying stay with the latest end.
    .agg(F.max(F.struct("stay_end", "city")).alias("last_long_stay"))
    .select("user_id", F.col("last_long_stay.city").alias("home_city"))
)

In [27]:
# act_city: the city of each user's most recent message.
# F.last() inside an unordered groupBy returns an arbitrary row, so the latest
# event is picked explicitly: max over a struct compares "datetime" first.
act_city_df = (
    messages_with_city
    .groupBy("user_id")
    .agg(F.max(F.struct("datetime", "city")).alias("last_event"))
    .select("user_id", F.col("last_event.city").alias("act_city"))
)

# Left join: users without a qualifying long stay get a null home_city.
mart_user = act_city_df.join(home_city_df, on="user_id", how="left")

In [28]:
# travel_array: cities in the order the user visited them; consecutive messages
# from the same city count as one visit, a later return counts as a new visit.
# travel_count: number of such visits.
# NOTE(review): confirm the "one entry per visit" rule against the mart specification.
visit_order = Window.partitionBy("user_id").orderBy("datetime", "message_id")

visits_df = (
    messages_with_city
    .withColumn("prev_city", F.lag("city").over(visit_order))
    # Keep only the first message of every visit.
    .filter(F.col("prev_city").isNull() | (F.col("city") != F.col("prev_city")))
)

travel_features = (
    visits_df
    .groupBy("user_id")
    # collect_list gives no ordering guarantee; sorting structs that start with
    # the timestamp restores chronological order.
    .agg(
        F.sort_array(
            F.collect_list(F.struct("datetime", "message_id", "city"))
        ).alias("visits")
    )
    .select(
        "user_id",
        F.col("visits.city").alias("travel_array"),
        F.size("visits").cast("long").alias("travel_count"),
    )
)

# Drop previously joined columns first so re-running this cell stays idempotent.
mart_user = (
    mart_user
    .drop("travel_array", "travel_count")
    .join(travel_features, on="user_id", how="left")
)

In [29]:
# Local time of each user's LAST message: order newest-first so that row 1 is
# the most recent event (ascending order would select the first message).
window = Window.partitionBy("user_id").orderBy(F.col("datetime").desc())

user_last_message_local_time = (
    messages_with_city
    # row_number (not rank) guarantees exactly one row per user even when
    # several messages share the same timestamp.
    .withColumn("rank", F.row_number().over(window))
    .filter(F.col("rank") == 1)
    # NOTE(review): from_utc_timestamp treats "datetime" as UTC - confirm for the source data.
    .withColumn("local_time", F.from_utc_timestamp(F.col("datetime"), F.col("timezone")))
    .select("user_id", "local_time")
)

In [30]:
# Drop a previously joined local_time first so re-running this cell does not
# produce a duplicate column (drop is a no-op when the column is absent).
mart_user = (
    mart_user
    .drop("local_time")
    .join(user_last_message_local_time, on="user_id", how="left")
)

In [None]:
# Persist the mart as parquet, replacing any previous output.
# The mart is written unpartitioned; the former empty partitionBy() call named
# no columns and has been removed.
mart_user_path = f"{OUT_PATH}/mart_user"
(
    mart_user.write
    .mode("overwrite")
    .format("parquet")
    .save(mart_user_path)
)

2025-02-12 16:06:02,852 - root - KeyboardInterrupt while sending command.1468]8]
Traceback (most recent call last):
  File "/home/samogonn/projects/de-project-sprint-7/.venv/lib/python3.12/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/samogonn/projects/de-project-sprint-7/.venv/lib/python3.12/site-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
                          ^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.12/socket.py", line 707, in readinto
    return self._sock.recv_into(b)
           ^^^^^^^^^^^^^^^^^^^^^^^
KeyboardInterrupt
2025-02-12 16:06:02,853 - py4j.clientserver - Closing down clientserver connection
25/02/12 16:06:02 ERROR Executor: Exception in task 0.0 in stage 18.0 (TID 446)
org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/tmp/ipyke

KeyboardInterrupt: 