In [117]:
import os

from logger import LoggerConfig
from logging import Logger
from pyspark.sql import SparkSession, DataFrame, functions as F
from pyspark.sql.window import Window

from pyspark.sql.functions import pandas_udf
import pandas as pd
import numpy as np
from pyspark.sql.types import DoubleType, StringType

In [2]:
os.environ["PYSPARK_PYTHON"] = "/usr/bin/python3"
os.environ["YARN_CONF_DIR"] = "/etc/hadoop/conf"
os.environ["PYSPARK_DRIVER_PYTHON"] = "/usr/bin/python3"
os.environ["HADOOP_CONF_DIR"] = "/etc/hadoop/conf"

In [169]:
# DEV Constants
GEO_DIR = "/user/solovyovyu/geo.csv"
EVENTS_DIR = "/user/master/data/geo/events"
OUT_PATH = "/user/solovyovyu/analytics"
# CUR_DATE = "2022-01-02"

In [4]:
logger = LoggerConfig.get_logger("Friend Recommendations")

In [119]:
def read_events(event_type: str, events_dir: str, spark: SparkSession, logger: Logger, date: str = "*") -> DataFrame:
    """
    Read events from parquet file by partitions
    """
    try:
        df = (
            spark.read.parquet(f"{events_dir}/date={date}/event_type={event_type}")
            .where(F.col("lat").isNotNull() & F.col("lon").isNotNull())
            .select(
                F.col("event.message_id"),
                F.coalesce(F.col("event.message_from"), F.col("event.reaction_from"), F.col("event.user")).alias("user_id"),
                F.col("event.message_to"),
                F.coalesce(F.col("event.message_ts"), F.col("event.datetime")).alias("datetime"),
                "lat",
                "lon",
                "event.subscription_channel",
            )
        )
        logger.info(f"Events {event_type} are read from {events_dir}.")
        return df

    except Exception as e:
        logger.error(f"Error while reading events: {e}")
        raise

        
def haversine(lat1, lon1, lat2, lon2):
    R = 6371  # Радиус Земли в километрах
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1

    a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2) ** 2
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))

    return R * c


@pandas_udf(DoubleType())
def haversine_udf(lat1: pd.Series, lon1: pd.Series, lat2: pd.Series, lon2: pd.Series) -> pd.Series:
    return haversine(lat1, lon1, lat2, lon2)

In [9]:
# Create Spark session
try:
    spark = SparkSession.builder.appName("Friend Recommendations").getOrCreate()
    logger.info("SparkSession successfully created.")
except Exception as e:
    logger.error(f"Error creating SparkSession: {e}", exc_info=True)
    sys.exit(1)

25/02/17 12:07:42 WARN Utils: Your hostname, fhmtnvvm71dpggrdk8kg resolves to a loopback address: 127.0.1.1; using 172.16.0.22 instead (on interface eth0)
25/02/17 12:07:42 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/02/17 12:07:43 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


2025-02-17 12:07:45,277 - Mart Zones - INFO - SparkSession successfully created.


In [None]:
subscriptions_df = read_events("subscription", EVENTS_DIR, spark, logger)

subscriptions_df.count()

25/02/17 14:53:31 WARN SharedInMemoryCache: Evicting cached table partition metadata from memory due to size constraints (spark.sql.hive.filesourcePartitionFileCacheSize = 262144000 bytes). This may impact query planning performance.

In [160]:
messages_df = spark.read.parquet(f"{EVENTS_DIR}/date={CUR_DATE}")

messages_df.printSchema()

root
 |-- event: struct (nullable = true)
 |    |-- admins: array (nullable = true)
 |    |    |-- element: long (containsNull = true)
 |    |-- channel_id: long (nullable = true)
 |    |-- datetime: string (nullable = true)
 |    |-- media: struct (nullable = true)
 |    |    |-- media_type: string (nullable = true)
 |    |    |-- src: string (nullable = true)
 |    |-- message: string (nullable = true)
 |    |-- message_channel_to: long (nullable = true)
 |    |-- message_from: long (nullable = true)
 |    |-- message_group: long (nullable = true)
 |    |-- message_id: long (nullable = true)
 |    |-- message_to: long (nullable = true)
 |    |-- message_ts: string (nullable = true)
 |    |-- reaction_from: string (nullable = true)
 |    |-- reaction_type: string (nullable = true)
 |    |-- subscription_channel: long (nullable = true)
 |    |-- subscription_user: string (nullable = true)
 |    |-- tags: array (nullable = true)
 |    |    |-- element: string (containsNull = true)
 |   

In [161]:
user_subs_pairs = (
    subscriptions_df.alias("a")
        .join(subscriptions_df.alias("b"), on="subscription_channel")
        .filter(F.col("a.user_id") < F.col("b.user_id"))
        .select(
            F.col("a.user_id").alias("user_left"),
            F.col("b.user_id").alias("user_right"),
            F.col("a.subscription_channel")
        )
)

In [162]:
user_chats = (
    messages_df.select(
        F.col("event.message_from").alias("user_a"),
        F.col("event.message_to").alias("user_b")
    ).union(messages_df.select(
        F.col("event.message_to").alias("user_a"),
        F.col("event.message_from").alias("user_a")
    )).distinct()
)

In [163]:
friend_recommendations = (
    user_subs_pairs
    .join(user_chats, (user_subs_pairs.user_left == user_chats.user_a) & (user_subs_pairs.user_right == user_chats.user_b), "left_anti")
)

In [164]:
friend_recommendations.show()

                                                                                

+---------+----------+--------------------+
|user_left|user_right|subscription_channel|
+---------+----------+--------------------+
|    52501|     92869|              663092|
|    52501|     62096|              663092|
|    52501|     57399|              663092|
|    52957|     90674|              441039|
|    52957|      8695|              441039|
|    53221|     97429|               43023|
|    53221|     71537|               43023|
|    69558|     78890|              248680|
|    69558|      9657|              248680|
|    69558|     89537|              248680|
|    54523|     58926|              112204|
|    54523|      8465|              112204|
|    54523|     81787|              112204|
|    55894|     69575|              763412|
|    55894|      9466|              763412|
|    55894|     67179|              763412|
|    55894|     94469|              763412|
|    55894|      7969|              763412|
|    55894|      8024|              763412|
|    55894|     63505|          

In [165]:
window = Window.partitionBy("event.message_from").orderBy(F.col("event.message_ts").desc())

user_location = (
    messages_df.withColumn("rank", F.row_number().over(window))
    .filter(F.col("rank") == 1)
    .select(
        F.col("event.message_from").alias("user_id"),
        F.col("lat"),
        F.col("lon")
    )
)

In [166]:
user_location.show()



+-------+-------------------+------------------+
|user_id|                lat|               lon|
+-------+-------------------+------------------+
|   NULL|-26.472576372776835|153.33902097729592|
|    287| -33.53099196771466| 151.3645309475204|
|    301|-11.833472609334132|131.66764401033888|
|    311| -37.58271830542874|144.99881332011873|
|    319| -37.27830535736941|144.57763024012436|
|    330| -34.92303009477633|138.60480380997166|
|    348| -42.46203022753594|147.74526183973532|
|    358|-22.901596801404743|151.43839308895915|
|    405| -32.55539550611397|116.05677089761832|
|    424|-37.671066196368784| 145.9049365138819|
|    441| -42.42589851097819|147.32535603694978|
|    641| -32.28606920299048|151.75222641010603|
|    662|-27.436733573971328|152.19560825478487|
|    704|-31.030667395631074|116.11547342467769|
|    729| -37.41720166047213| 145.4600139563115|
|    787|-27.534953730073994| 153.7166065592272|
|    822|-27.413513562198915|154.13706727185019|
|    954|-33.4526251

                                                                                

In [167]:
friend_recommendations = (
    friend_recommendations
    .join(user_location, (friend_recommendations.user_left == user_location.user_id))
    .select(
        "user_left",
        F.col("lat").alias("left_lat"),
        F.col("lon").alias("left_lon"),
        "user_right"
    )
    .join(user_location, (friend_recommendations.user_right == user_location.user_id))
    .select(
        "user_left",
        F.col("left_lat"),
        F.col("left_lon"),
        "user_right",
        F.col("lat").alias("right_lat"),
        F.col("lon").alias("right_lon"),
    )
    .withColumn(
        "distance",
        haversine_udf(F.col("left_lat"), F.col("left_lon"), F.col("right_lat"), F.col("right_lon"))
    )
    .filter(F.col("distance") <= 1)
    .withColumn("processed_dttm", F.lit(CUR_DATE))
#     .select(
#         "user_left",
#         "user_right",
#         "zone_id",
#         "local_time"
#     )
)

In [168]:
friend_recommendations_w_distance.show()

                                                                                

+---------+--------+--------+----------+---------+---------+--------+
|user_left|left_lat|left_lon|user_right|right_lat|right_lon|distance|
+---------+--------+--------+----------+---------+---------+--------+
+---------+--------+--------+----------+---------+---------+--------+

