In [19]:
from pyspark.sql import SparkSession
from pyspark.context import SparkContext
from pyspark.sql.functions import * 
from pyspark.sql.types import * 
import pandas as pd
import os

In [20]:
sc = SparkSession.builder.appName("PysparkLab4") \
    .config ("spark.sql.shuffle.partitions", "20") \
    .config ("spark.default.parallelism", "20") \
    .config("spark.driver.maxResultSize","5g") \
    .config ("spark.sql.execution.arrow.enabled", "true") \
    .getOrCreate()

In [21]:
posts_data_dir = r"G:\Data\spark_labs\bigdata20\bigdata20\followers_posts_api_final.json"
result_dir = r"G:\Data\spark_labs\bigdata20\bigdata20\results\task4"

posts1_path = os.path.join(posts_data_dir, '0_39773a62_followers_posts_api_final.json')
posts2_path = os.path.join(posts_data_dir, '1_7a8a2098_followers_posts_api_final.json')

posts1_df = sc.read.json(posts1_path)
posts2_df = sc.read.json(posts2_path)

posts1_df = posts1_df.filter(posts1_df.post_type == "post")
posts2_df = posts2_df.filter(posts2_df.post_type == "post")

In [22]:
posts1_df.columns

['attachments',
 'comments',
 'copy_history',
 'copyright',
 'date',
 'final_post',
 'from_id',
 'geo',
 'id',
 'is_pinned',
 'key',
 'likes',
 'owner_id',
 'post_source',
 'post_type',
 'reposts',
 'signer_id',
 'text',
 'unavailable',
 'views']

In [23]:
# https://www.kaggle.com/thomasseleck/emoji-sentiment-data
emoji_data = pd.read_csv(r'G:/Data/Emoji_Sentiment_Data_v1.0.csv')
emoji_data.head()

Unnamed: 0,Emoji,Unicode codepoint,Occurrences,Position,Negative,Neutral,Positive,Unicode name,Unicode block
0,😂,0x1f602,14622,0.805101,3614,4163,6845,FACE WITH TEARS OF JOY,Emoticons
1,❤,0x2764,8050,0.746943,355,1334,6361,HEAVY BLACK HEART,Dingbats
2,♥,0x2665,7144,0.753806,252,1942,4950,BLACK HEART SUIT,Miscellaneous Symbols
3,😍,0x1f60d,6359,0.765292,329,1390,4640,SMILING FACE WITH HEART-SHAPED EYES,Emoticons
4,😭,0x1f62d,5526,0.803352,2412,1218,1896,LOUDLY CRYING FACE,Emoticons


In [24]:
# top 20 "positive" emojis
positive_emojis = emoji_data[(emoji_data['Positive'] > emoji_data['Neutral']) & (emoji_data['Positive'] > emoji_data['Negative'])].head(20)['Emoji'].tolist()

# top 20 "negative" emojis
negative_emojis = emoji_data[(emoji_data['Negative'] > emoji_data['Neutral']) & (emoji_data['Negative'] > emoji_data['Positive'])].head(20)['Emoji'].tolist()

In [35]:
positive_emojis

['😂',
 '❤',
 '♥',
 '😍',
 '😘',
 '😊',
 '👌',
 '💕',
 '👏',
 '😁',
 '☺',
 '♡',
 '👍',
 '🙏',
 '✌',
 '😉',
 '🙌',
 '🙈',
 '💪',
 '😄']

In [36]:
negative_emojis

['😭',
 '😩',
 '😒',
 '😔',
 '😡',
 '😴',
 '🔫',
 '😞',
 '😪',
 '😫',
 '💀',
 '😕',
 '💔',
 '😤',
 '😰',
 '😑',
 '😠',
 '😓',
 '😣',
 '😐']

In [25]:
from pyspark.sql.types import StringType
import emoji
# posts1_df = posts1_df.withColumn('comments_count', lit(comments_num_udf('comments')))
# posts1_df['comments_count']

def count_emojis_positive(text):
    emojis = [c for c in text if c in positive_emojis]
    return len(emojis)
        
def count_emojis_negative(text):
    emojis = [c for c in text if c in negative_emojis]
    return len(emojis)
    
positive_num_udf = udf(count_emojis_positive, IntegerType())
negative_num_udf = udf(count_emojis_negative, IntegerType())

In [26]:
### 
posts1_df = posts1_df.withColumn('positive_emojis_count', positive_num_udf('text'))
posts1_df = posts1_df.withColumn('negative_emojis_count', negative_num_udf('text'))

posts2_df = posts2_df.withColumn('positive_emojis_count', positive_num_udf('text'))
posts2_df = posts2_df.withColumn('negative_emojis_count', negative_num_udf('text'))

In [27]:
### apply functions 
posts1_df.createOrReplaceTempView("df1_view")
posts2_df.createOrReplaceTempView("df2_view")
positive_count = sc.sql("""select text, id, positive_emojis_count, negative_emojis_count 
                            from df1_view 
                            UNION ALL 
                            select text, id, positive_emojis_count, negative_emojis_count 
                            from df2_view
                            ORDER BY positive_emojis_count desc
                            LIMIT 20
                            """)

In [28]:
positive_count.show()
positive_count.toPandas().to_json(os.path.join(result_dir, 'positive_count.json'))

+--------------------+------+---------------------+---------------------+
|                text|    id|positive_emojis_count|negative_emojis_count|
+--------------------+------+---------------------+---------------------+
|❤️❤️❤️❤️❤️❤️❤️❤️❤...| 12192|                  124|                    0|
|аск
1).На два😂 М...|137162|                  121|                    8|
|аск
1).Пюрешкой, ...|137951|                  116|                    5|
|аск
1).Я про то, ...|140335|                  113|                    2|
|аск
1).Весь инбок...|136714|                  110|                    0|
|аск
1).Ты любишь ...|138511|                  107|                    0|
|Я хочу чтобы ты т...| 52635|                  103|                    0|
|аск
1).😱😱😱😱
б...|145395|                  103|                    0|
|аск
1).когда лети...|146864|                   99|                   14|
|аск
1).В коллаже?...|145529|                   95|                    4|
|аск
1).А в скольк...|142020|              

  PyArrow >= 0.15.1 must be installed; however, it was not found.
Attempting non-optimization as 'spark.sql.execution.arrow.pyspark.fallback.enabled' is set to true.


In [29]:
negative_count = sc.sql("""select text, id, positive_emojis_count, negative_emojis_count 
                            from df1_view 
                            UNION ALL 
                            select text, id, positive_emojis_count, negative_emojis_count 
                            from df2_view
                            ORDER BY negative_emojis_count desc
                            LIMIT 20
                            """)

In [30]:
negative_count.show()
negative_count.toPandas().to_json(os.path.join(result_dir, 'negative_count.json'))

+--------------------+------+---------------------+---------------------+
|                text|    id|positive_emojis_count|negative_emojis_count|
+--------------------+------+---------------------+---------------------+
|Завтра утром эта ...|  1068|                    1|                   31|
|аск
1).Так завидо...|141573|                   66|                   20|
|РОВНО МЕСЯЦ ДО ЭК...|  2276|                    0|                   16|
|аск
1).когда лети...|146864|                   99|                   14|
|Ребят, вы совсем ...|  2477|                   16|                   13|
|каждый день в шко...|  2613|                    0|                   12|
|Некоторые люди не...| 12085|                    6|                   12|
|Вот реально, что ...|   572|                    0|                   12|
|Znachit tak! 😝😝...| 21731|                    5|                   11|
|😱😭😭😭😭😭😭😭?...| 43355|                    0|                   11|
|МАМ СМОТРИ, ЭТО С...|    40|                   

  PyArrow >= 0.15.1 must be installed; however, it was not found.
Attempting non-optimization as 'spark.sql.execution.arrow.pyspark.fallback.enabled' is set to true.
