In [1]:
import os

from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import DoubleType
from pyspark.sql.window import Window

In [2]:
# Interpreter paths for PySpark and the Hadoop/YARN client configuration
# directories. These are set here, ahead of the cell that builds the
# SparkSession.
os.environ.update(
    {
        "PYSPARK_PYTHON": "/usr/bin/python3",
        "PYSPARK_DRIVER_PYTHON": "/usr/bin/python3",
        "YARN_CONF_DIR": "/etc/hadoop/conf",
        "HADOOP_CONF_DIR": "/etc/hadoop/conf",
    }
)

In [3]:
# Input locations used by the read cells below.
# NOTE: despite the _DIR suffix, GEO_DIR points at a single CSV file.
GEO_DIR = "/user/solovyovyu/geo.csv"
# Location of the events dataset, read as parquet.
EVENTS_DIR = "/user/solovyovyu/data/geo/events"

In [4]:
# Obtain (or reuse) a SparkSession that submits work to YARN.
spark = (
    SparkSession.builder
    .master("yarn")
    .appName("Test")
    .getOrCreate()
)

25/01/13 13:46:39 WARN Utils: Your hostname, fhm8qh14n4dtclu7ncp2 resolves to a loopback address: 127.0.1.1; using 172.16.0.27 instead (on interface eth0)
25/01/13 13:46:39 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/01/13 13:46:40 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/01/13 13:46:42 WARN Client: Neither spark.yarn.jars nor spark.yarn.archive is set, falling back to uploading libraries under SPARK_HOME.


In [5]:
# Load the city reference table: semicolon-separated CSV whose first row
# holds the column names.
geo_df = (
    spark.read
    .option("delimiter", ";")
    .option("header", True)
    .csv(GEO_DIR)
)

                                                                                

In [6]:
# Inspect the column types of the freshly loaded city table; the output
# below shows that id, city, lat and lng all arrive as strings.
geo_df.printSchema()

root
 |-- id: string (nullable = true)
 |-- city: string (nullable = true)
 |-- lat: string (nullable = true)
 |-- lng: string (nullable = true)



In [7]:
# Load the events dataset and keep only rows whose event_type is 'message'.
events_df = (
    spark.read
    .parquet(EVENTS_DIR)
    .filter("event_type = 'message'")
)

                                                                                

In [8]:
# Inspect the (nested) schema of the filtered events DataFrame.
events_df.printSchema()

root
 |-- event: struct (nullable = true)
 |    |-- admins: array (nullable = true)
 |    |    |-- element: long (containsNull = true)
 |    |-- channel_id: long (nullable = true)
 |    |-- datetime: string (nullable = true)
 |    |-- media: struct (nullable = true)
 |    |    |-- media_type: string (nullable = true)
 |    |    |-- src: string (nullable = true)
 |    |-- message: string (nullable = true)
 |    |-- message_channel_to: long (nullable = true)
 |    |-- message_from: long (nullable = true)
 |    |-- message_group: long (nullable = true)
 |    |-- message_id: long (nullable = true)
 |    |-- message_to: long (nullable = true)
 |    |-- message_ts: string (nullable = true)
 |    |-- reaction_from: string (nullable = true)
 |    |-- reaction_type: string (nullable = true)
 |    |-- subscription_channel: long (nullable = true)
 |    |-- subscription_user: string (nullable = true)
 |    |-- tags: array (nullable = true)
 |    |    |-- element: string (containsNull = true)
 |   

In [9]:
# Sanity check: number of rows in events_df (this runs a Spark job).
events_df.count()

                                                                                

2646

In [10]:
def get_distance(lat1, lon1, lat2, lon2, radius=6371.0):
    """Build a Spark column expression for the great-circle distance.

    Uses the haversine formula on a sphere of the given radius.

    Args:
        lat1: Latitude of the first point, in degrees (Spark Column).
        lon1: Longitude of the first point, in degrees (Spark Column).
        lat2: Latitude of the second point, in degrees (Spark Column).
        lon2: Longitude of the second point, in degrees (Spark Column).
        radius: Sphere radius. The result is expressed in the same unit.
            The default, 6371.0, is the mean Earth radius in kilometres,
            so the default result is in kilometres.

    Returns:
        A Column holding the distance between the two points. It evaluates
        to NULL for rows where any coordinate is NULL.
    """
    half_d_lat = F.radians(lat2 - lat1) / 2
    half_d_lon = F.radians(lon2 - lon1) / 2

    # Haversine term: sin^2(dlat/2) + cos(lat1) * cos(lat2) * sin^2(dlon/2).
    a = (
        F.sin(half_d_lat) * F.sin(half_d_lat)
        + F.cos(F.radians(lat1)) * F.cos(F.radians(lat2))
        * F.sin(half_d_lon) * F.sin(half_d_lon)
    )

    # Floating-point rounding can push `a` marginally above 1 for
    # near-antipodal points, which would make sqrt(1 - a) NaN. Clamp the
    # argument at 0. A NULL `a` still yields NULL because sqrt(a) is NULL.
    c = 2 * F.atan2(F.sqrt(a), F.sqrt(F.greatest(F.lit(0.0), 1 - a)))
    return radius * c

In [11]:
# The coordinates are stored with a decimal comma; swap it for a dot and
# cast both columns to double.
geo_df = (
    geo_df
    .withColumn("lat", F.regexp_replace(F.col("lat"), ",", ".").cast(DoubleType()))
    .withColumn("lng", F.regexp_replace(F.col("lng"), ",", ".").cast(DoubleType()))
)

In [12]:
# Prefix every city column with "geo_" so the names cannot clash with the
# event columns once the two DataFrames are joined.
geo_df = geo_df.select(
    *(
        F.col(old_name).alias(new_name)
        for old_name, new_name in (
            ("id", "geo_id"),
            ("city", "geo_city"),
            ("lat", "geo_lat"),
            ("lng", "geo_lng"),
        )
    )
)

In [13]:
# Cartesian product: pair every event row with every city row so that a
# distance can be computed for each (event, city) combination.
cross_joined = events_df.crossJoin(geo_df)

In [14]:
# Preview the event x city pairs (show() prints the first 20 rows by default).
cross_joined.show()

[Stage 6:>                                                          (0 + 1) / 1]

+--------------------+------------------+------------------+----------+----------+------+----------+--------+--------+
|               event|               lat|               lon|      date|event_type|geo_id|  geo_city| geo_lat| geo_lng|
+--------------------+------------------+------------------+----------+----------+------+----------+--------+--------+
|{NULL, NULL, NULL...|-34.36705259718115|149.37469432633242|2022-05-03|   message|     1|    Sydney| -33.865|151.2094|
|{NULL, NULL, NULL...|-34.36705259718115|149.37469432633242|2022-05-03|   message|     2| Melbourne|-37.8136|144.9631|
|{NULL, NULL, NULL...|-34.36705259718115|149.37469432633242|2022-05-03|   message|     3|  Brisbane|-27.4678|153.0281|
|{NULL, NULL, NULL...|-34.36705259718115|149.37469432633242|2022-05-03|   message|     4|     Perth|-31.9522|115.8589|
|{NULL, NULL, NULL...|-34.36705259718115|149.37469432633242|2022-05-03|   message|     5|  Adelaide|-34.9289|138.6011|
|{NULL, NULL, NULL...|-34.36705259718115|149.374

                                                                                

In [15]:
# Add a "distance" column: get_distance() between the event's (lat, lon)
# and the candidate city's (geo_lat, geo_lng).
with_distances = cross_joined.withColumn(
    "distance",
    get_distance(
        lat1=F.col("lat"),
        lon1=F.col("lon"),
        lat2=F.col("geo_lat"),
        lon2=F.col("geo_lng"),
    ),
)

In [16]:
# Preview the pairs together with the computed "distance" column.
with_distances.show()

+--------------------+------------------+------------------+----------+----------+------+----------+--------+--------+------------------+
|               event|               lat|               lon|      date|event_type|geo_id|  geo_city| geo_lat| geo_lng|          distance|
+--------------------+------------------+------------------+----------+----------+------+----------+--------+--------+------------------+
|{NULL, NULL, NULL...|-34.36705259718115|149.37469432633242|2022-05-03|   message|     1|    Sydney| -33.865|151.2094|177883.97100018326|
|{NULL, NULL, NULL...|-34.36705259718115|149.37469432633242|2022-05-03|   message|     2| Melbourne|-37.8136|144.9631|  551234.453292873|
|{NULL, NULL, NULL...|-34.36705259718115|149.37469432633242|2022-05-03|   message|     3|  Brisbane|-27.4678|153.0281| 842428.4139122481|
|{NULL, NULL, NULL...|-34.36705259718115|149.37469432633242|2022-05-03|   message|     4|     Perth|-31.9522|115.8589|3117318.1992193977|
|{NULL, NULL, NULL...|-34.36705259

In [17]:
# Window used to rank the candidate cities of each event from nearest to
# farthest.
# - NULL distances (missing coordinates) are sorted last; the default
#   ascending order puts NULLs first, which would let a city with unknown
#   coordinates be ranked as the "closest".
# - geo_id breaks ties between equally distant cities so the ranking is
#   deterministic across runs.
# NOTE(review): partitioning by the whole `event` struct assumes every event
# row has a distinct struct value; rows with identical structs would share
# one partition — confirm, or partition by a dedicated unique row id.
window = Window.partitionBy("event").orderBy(
    F.col("distance").asc_nulls_last(),
    F.col("geo_id"),
)

In [18]:
# Number the candidate cities inside each window partition and keep only
# the first-ranked row per partition.
closest_cities = (
    with_distances
    .withColumn("rank", F.row_number().over(window))
    .where(F.col("rank") == 1)
)

In [19]:
# Flatten the event struct into top-level columns and attach the matched
# city together with its distance.
result = closest_cities.select(
    "event.*",  # event payload fields
    F.col("geo_city").alias("closest_city"),  # nearest city
    "distance",  # distance to that city
)

# Display the result without truncating cell contents
result.show(truncate=False)



+------+----------+--------+-----+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------+------------+-------------+----------+----------+-----------------------------+-------------+-------------+--------------------+-----------------+----+----+------------+------------------+
|admins|channel_id|datetime|media|message                                                                                                                                                                                                                                                           |message_channel_to|message_from|message_group|message_id|message_to|message_ts                   |reaction_from|reaction_type|subscription_channel|subscription_user|tags|user|closest_city|distance          

                                                                                