In [6]:
from pyspark.sql import SparkSession
from pyspark.context import SparkContext
from pyspark.sql.functions import * 
from pyspark.sql.types import * 
import pandas as pd
import os

In [7]:
# Report which PySpark release this kernel is running.
import pyspark

_pyspark_version = pyspark.__version__
print(_pyspark_version)

3.0.1


In [8]:
# Build (or reuse) the Spark session used by every later cell.
# NOTE: despite its name, `sc` is a SparkSession (not a SparkContext); the
# name is kept because the following cells call sc.read / sc.sql on it.
sc = (
    SparkSession.builder
    .appName("PysparkLab1")
    # Number of partitions Spark SQL uses when shuffling data.
    .config("spark.sql.shuffle.partitions", "50")
    # Cap on the total size of results collected back to the driver.
    .config("spark.driver.maxResultSize", "5g")
    # Arrow-based DataFrame <-> pandas conversion. Spark 3.0 deprecated the
    # old "spark.sql.execution.arrow.enabled" key in favour of this one.
    .config("spark.sql.execution.arrow.pyspark.enabled", "true")
    .getOrCreate()
)

In [9]:
# Input folder holding the two JSON part files, and the folder for task-1 results.
data_dir = r"G:\Data\spark_labs\bigdata20\bigdata20\followers_posts_api_final.json"
result_dir = r"G:\Data\spark_labs\bigdata20\bigdata20\results\task1"

# Full paths of the two part files inside data_dir.
posts1_path, posts2_path = (
    os.path.join(data_dir, part_name)
    for part_name in (
        '0_39773a62_followers_posts_api_final.json',
        '1_7a8a2098_followers_posts_api_final.json',
    )
)

# Read each part into its own DataFrame.
posts1_df, posts2_df = (
    sc.read.json(part_path) for part_path in (posts1_path, posts2_path)
)

In [10]:
# Peek at the first 10 rows of the first dataset.
posts1_df.show(n=10)

+--------------------+------------+------------+---------+----------+----------+-------+----+-----+---------+-----------+-------------+--------+-----------------+---------+-------+---------+--------------------+-----------+-----+
|         attachments|    comments|copy_history|copyright|      date|final_post|from_id| geo|   id|is_pinned|        key|        likes|owner_id|      post_source|post_type|reposts|signer_id|                text|unavailable|views|
+--------------------+------------+------------+---------+----------+----------+-------+----+-----+---------+-----------+-------------+--------+-----------------+---------+-------+---------+--------------------+-----------+-----+
|[[,,,,,,,,,, vide...|[0, 0, true]|        null|     null|1550165023|      null|  87449|null| 3316|     null| 87449_3316| [1, 1, 6, 0]|   87449|[,, iphone, api,]|     post| [0, 0]|     null|Я люблю Вас. Я вч...|       null|[428]|
|[[,,,,,, [fe02668...|[0, 0, true]|        null|     null|1553774858|      null|

In [11]:
### First dataset: top-20 posts by comments, likes and reposts
# Make posts1_df queryable from Spark SQL under the name "df1_view".
posts1_df.createOrReplaceTempView("df1_view")

# The three queries share one projection and differ only in the ORDER BY key.
_top20_sql_df1 = (
    "select id, comments, likes, text, views, reposts from df1_view "
    "ORDER BY {}.count desc LIMIT 20"
)
top_commented1, top_liked1, top_reposted1 = (
    sc.sql(_top20_sql_df1.format(metric))
    for metric in ("comments", "likes", "reposts")
)

In [12]:
# Preview the most-liked posts of the first dataset (20 rows, long cells truncated).
# Disabled alternative: untruncated view of the most-commented posts.
#top_commented1.show(truncate=False)
top_liked1.show(n=20, truncate=True)

+------+--------------+---------------+--------------------+-------+--------+
|    id|      comments|          likes|                text|  views| reposts|
+------+--------------+---------------+--------------------+-------+--------+
|425873|[1, 271, true]|[1, 1, 3271, 0]|Лайкнувшим отправ...|[30376]|  [6, 0]|
|311504|[1, 516, true]|[1, 1, 2160, 0]|                    | [7479]|  [3, 0]|
|340354|[1, 429, true]|[1, 1, 2148, 0]|                    |[11353]|  [1, 0]|
| 31472|  [0, 0, true]|[1, 1, 2099, 0]|                    | [3271]|  [0, 0]|
|428224| [1, 78, true]|[1, 1, 1914, 0]|лайк кого пролайк...|[14640]|  [5, 0]|
|322736| [1, 76, true]|[1, 1, 1665, 0]|                🙈🙉|[19461]|  [2, 0]|
|  4004|  [0, 0, true]|[1, 1, 1593, 0]|Нашли корги. Бега...|[59135]|[573, 0]|
| 32164|  [0, 0, true]|[1, 1, 1574, 0]|                    | [2481]|  [2, 0]|
|  5161|  [0, 0, true]|[1, 1, 1494, 0]|Друзья, пора нача...| [2166]|  [0, 0]|
|322699| [1, 31, true]|[1, 1, 1410, 0]|Мы всегда остаемс...|[12495

In [13]:
### Second dataset: top-20 posts by comments, likes and reposts
# Make posts2_df queryable from Spark SQL under the name "df2_view".
posts2_df.createOrReplaceTempView("df2_view")

# The three queries share one projection and differ only in the ORDER BY key.
_top20_sql_df2 = (
    "select id, comments, likes, text, views, reposts from df2_view "
    "ORDER BY {}.count desc LIMIT 20"
)
top_commented2, top_liked2, top_reposted2 = (
    sc.sql(_top20_sql_df2.format(metric))
    for metric in ("comments", "likes", "reposts")
)

In [14]:
# Optional preview of the second dataset's most-commented posts (disabled).
#top_commented2.show()

In [15]:
# Merge the per-dataset top-20 lists and keep the overall top 20 per metric.
top_commented = (
    top_commented1.union(top_commented2)
    .sort('comments.count', ascending=False)
    .limit(20)
)
top_liked = (
    top_liked1.union(top_liked2)
    .sort('likes.count', ascending=False)
    .limit(20)
)
top_reposted = (
    top_reposted1.union(top_reposted2)
    .sort('reposts.count', ascending=False)
    .limit(20)
)

In [16]:
# Display the overall most-commented posts, then save them as JSON.
top_commented.show()
# Alternative kept for reference: Spark's own JSON writer.
# top_commented.write.json(os.path.join(result_dir, 'top_commented.json'))
# pandas does not create missing parent folders, so make sure the results
# directory exists before writing (no-op when it is already there).
os.makedirs(result_dir, exist_ok=True)
top_commented.toPandas().to_json(os.path.join(result_dir, 'top_commented.json'))

+------+----------------+--------------+--------------------+-------+-------+
|    id|        comments|         likes|                text|  views|reposts|
+------+----------------+--------------+--------------------+-------+-------+
|215530|[1, 18189, true]|[1, 1, 163, 0]|   Гости у bera vine|[10285]| [0, 0]|
|118126|[1, 17041, true]|[1, 1, 227, 0]|                Live|[12404]| [1, 0]|
|234835|[1, 12841, true]|[1, 1, 231, 0]|       Погнали💥💥💥|[11477]| [2, 0]|
|169456|[1, 12240, true]|[1, 1, 135, 0]|          Звоночки💥| [6551]| [0, 0]|
| 57446|[1, 11596, true]|[1, 1, 176, 0]|Ребят,как и говор...|[12219]| [1, 0]|
|372853|[1, 10812, true]|[1, 1, 384, 0]|   Всем приветик💋🖤|[14391]| [6, 0]|
|255172|[1, 10571, true]|[1, 1, 177, 0]|           Погнали☄️| [9523]| [1, 0]|
|142040|[1, 10155, true]|[1, 1, 194, 0]| Сегодня будет мощно|[12356]| [0, 0]|
|361645| [1, 7982, true]|[1, 1, 300, 0]|В конце будут сиг...| [9352]| [5, 0]|
|274765| [1, 7940, true]|[1, 1, 191, 0]|Bera vine в сборе...|[10715]| 

  PyArrow >= 0.15.1 must be installed; however, it was not found.
Attempting non-optimization as 'spark.sql.execution.arrow.pyspark.fallback.enabled' is set to true.


In [None]:
# Display the overall most-liked posts, then save them as JSON.
top_liked.show()
# pandas does not create missing parent folders, so make sure the results
# directory exists before writing (no-op when it is already there).
os.makedirs(result_dir, exist_ok=True)
top_liked.toPandas().to_json(os.path.join(result_dir, 'top_liked.json'))

In [None]:
# Display the overall most-reposted posts, then save them as JSON.
top_reposted.show()
# pandas does not create missing parent folders, so make sure the results
# directory exists before writing (no-op when it is already there).
os.makedirs(result_dir, exist_ok=True)
top_reposted.toPandas().to_json(os.path.join(result_dir, 'top_reposted.json'))

In [None]:
# Disabled experiment: add a 'comments_count' column to posts1_df through a
# Python UDF that reads the 'count' field of the 'comments' struct.
# NOTE(review): if re-enabled, this rebinds posts1_df in place, and the
# lit(...) wrapper around the UDF call looks unnecessary — confirm before use.
# from pyspark.sql.types import IntegerType
# comments_num_udf = udf(lambda row: int(row['count']), IntegerType())
# posts1_df = posts1_df.withColumn('comments_count', lit(comments_num_udf('comments')))
# posts1_df['comments_count']

In [None]:
# Disabled experiment: register posts1_df as the temp view "temp" and list the
# 'comments' column sorted in descending order. Note that it orders by the
# whole 'comments' value rather than by comments.count.
#posts1_df.createOrReplaceTempView("temp")
#sc.sql("select comments from temp ORDER BY comments desc").show(truncate=False)

In [None]:
# Disabled scratch code: build a DataFrame from an in-memory JSON value via an RDD.
# NOTE(review): neither `simple_json` nor `sqlContext` is defined in the cells
# shown here, and `sc` in this notebook is a SparkSession (parallelize lives on
# its sparkContext) — presumably this would not run as written; verify before
# re-enabling.
#rddjson = sc.parallelize([simple_json])
#df = sqlContext.read.json(rddjson)