Name: Shyju Kozhisseri<br/>
ID: 309572<br/>
Group: J41323c

In [1]:
import pyspark
import pandas as pd
import numpy as np
import emoji
import re
import pyspark.sql.functions as fn
from pyspark.sql.types import ArrayType, StructType, StructField, StringType, IntegerType
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, rank, collect_list
from pyspark.sql.window import Window

In [2]:
spark = SparkSession\
    .builder \
    .appName("Task1") \
    .config('spark.ui.port', "4040")\
    .getOrCreate()

## Import Data

In [3]:
df = spark.read.json("shared_data/bigdata20/posts_api.json/*.json")

In [4]:
df.schema.names

['attachments',
 'comments',
 'copy_history',
 'copyright',
 'date',
 'from_id',
 'geo',
 'id',
 'key',
 'likes',
 'marked_as_ads',
 'owner_id',
 'post_source',
 'post_type',
 'reposts',
 'signer_id',
 'text',
 'unavailable',
 'views']

In [5]:
df.printSchema()

root
 |-- attachments: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- album: struct (nullable = true)
 |    |    |    |-- created: long (nullable = true)
 |    |    |    |-- description: string (nullable = true)
 |    |    |    |-- id: string (nullable = true)
 |    |    |    |-- owner_id: long (nullable = true)
 |    |    |    |-- size: long (nullable = true)
 |    |    |    |-- thumb: struct (nullable = true)
 |    |    |    |    |-- access_key: string (nullable = true)
 |    |    |    |    |-- album_id: long (nullable = true)
 |    |    |    |    |-- date: long (nullable = true)
 |    |    |    |    |-- id: long (nullable = true)
 |    |    |    |    |-- owner_id: long (nullable = true)
 |    |    |    |    |-- sizes: array (nullable = true)
 |    |    |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |    |    |-- height: long (nullable = true)
 |    |    |    |    |    |    |-- type: string (nullable = true)
 |   

## Top 20 Posts

### Likes

In [6]:
df.select("id", "likes.count").orderBy("likes.count", ascending=False).head(20)

[Row(id=32022, count=1637),
 Row(id=35068, count=1629),
 Row(id=17492, count=1516),
 Row(id=18526, count=1026),
 Row(id=19552, count=955),
 Row(id=41468, count=952),
 Row(id=19419, count=868),
 Row(id=29046, count=824),
 Row(id=32546, count=786),
 Row(id=24085, count=765),
 Row(id=40180, count=759),
 Row(id=33658, count=708),
 Row(id=13532, count=633),
 Row(id=40842, count=631),
 Row(id=35117, count=588),
 Row(id=17014, count=581),
 Row(id=19583, count=553),
 Row(id=19809, count=552),
 Row(id=27455, count=550),
 Row(id=11999, count=549)]

### Comments

In [7]:
df.select("id", "comments.count").orderBy("comments.count", ascending=False).head(20)

[Row(id=24085, count=850),
 Row(id=22540, count=250),
 Row(id=27722, count=192),
 Row(id=8285, count=148),
 Row(id=26860, count=113),
 Row(id=13571, count=107),
 Row(id=39294, count=104),
 Row(id=36680, count=96),
 Row(id=41739, count=92),
 Row(id=26006, count=92),
 Row(id=12426, count=91),
 Row(id=21499, count=88),
 Row(id=39163, count=83),
 Row(id=39407, count=83),
 Row(id=11267, count=81),
 Row(id=31548, count=80),
 Row(id=11158, count=70),
 Row(id=39082, count=67),
 Row(id=12687, count=61),
 Row(id=14602, count=61)]

### Reposts

In [8]:
df.select("id", "reposts.count").orderBy("reposts.count", ascending=False).head(20)

[Row(id=17492, count=334),
 Row(id=19552, count=246),
 Row(id=32022, count=210),
 Row(id=11842, count=129),
 Row(id=19419, count=126),
 Row(id=13532, count=110),
 Row(id=17014, count=105),
 Row(id=35068, count=101),
 Row(id=41266, count=92),
 Row(id=12593, count=90),
 Row(id=29046, count=87),
 Row(id=41468, count=85),
 Row(id=11999, count=85),
 Row(id=19809, count=84),
 Row(id=17167, count=81),
 Row(id=10833, count=78),
 Row(id=18543, count=77),
 Row(id=16596, count=76),
 Row(id=18156, count=74),
 Row(id=37262, count=71)]

## Top 20 Users

### Likes

In [9]:
df2 = spark.read.parquet("shared_data/bigdata20/posts_likes.parquet/*.parquet")
df2.printSchema()

root
 |-- itemType: string (nullable = true)
 |-- ownerId: integer (nullable = true)
 |-- itemId: integer (nullable = true)
 |-- likerId: integer (nullable = true)



In [10]:
df2.groupBy("likerId").count().orderBy("count", ascending=False).head(20)

[Row(likerId=2070090, count=4801),
 Row(likerId=2397858, count=2055),
 Row(likerId=1475301, count=1829),
 Row(likerId=18239, count=1569),
 Row(likerId=546612, count=1245),
 Row(likerId=6371, count=907),
 Row(likerId=1841959, count=746),
 Row(likerId=78440957, count=709),
 Row(likerId=120248, count=699),
 Row(likerId=40981497, count=611),
 Row(likerId=22158, count=553),
 Row(likerId=207628162, count=548),
 Row(likerId=329377723, count=504),
 Row(likerId=76071304, count=474),
 Row(likerId=14805173, count=440),
 Row(likerId=317799, count=385),
 Row(likerId=56355640, count=375),
 Row(likerId=52042971, count=338),
 Row(likerId=7437271, count=336),
 Row(likerId=136506644, count=335)]

### Reposts

In [11]:
reposts = df.select(col("id").alias("posts_id"), col("copy_history.id").alias("copy_history_id"), col("copy_history.owner_id").alias("owner_id")).where(df.copy_history.isNotNull())
reposts.select("owner_id","copy_history_id").groupBy("owner_id").count().orderBy("count", ascending=False).head(20)

[Row(owner_id=[-33773], count=186),
 Row(owner_id=[-76139618], count=144),
 Row(owner_id=[-45636106], count=95),
 Row(owner_id=[-53958282], count=70),
 Row(owner_id=[-45660640], count=52),
 Row(owner_id=[-97819925], count=46),
 Row(owner_id=[-2499902], count=44),
 Row(owner_id=[-103229636], count=28),
 Row(owner_id=[-94359346], count=26),
 Row(owner_id=[-51664920], count=24),
 Row(owner_id=[-46907025], count=23),
 Row(owner_id=[-122783310], count=22),
 Row(owner_id=[-78459300], count=21),
 Row(owner_id=[18239], count=20),
 Row(owner_id=[-57339370], count=18),
 Row(owner_id=[-45375087], count=17),
 Row(owner_id=[-23303030], count=16),
 Row(owner_id=[-644236], count=16),
 Row(owner_id=[-39268951], count=14),
 Row(owner_id=[-3900734], count=14)]

## Emoticons

In [12]:
emojis_list = map(lambda x: ''.join(x.split()), emoji.UNICODE_EMOJI['en'].keys())
emoji_keys = '|'.join(re.escape(p) for p in emojis_list)

In [13]:
def find_all_emo(plain_text):
    if plain_text is None:
        return None
    else:
        emo_list = re.findall(emoji_keys, plain_text)
        return emo_list

search_all_emojis = fn.udf(lambda y: find_all_emo(y), ArrayType(StringType()))

df_with_emojis = df.withColumn("emojis", search_all_emojis(fn.col("text")))

In [14]:
df_with_emojis.select("id","emojis").orderBy(fn.size(col("emojis")), ascending=False).head(20)

[Row(id=30016, emojis=['‼', '❗', '❗', '‼', '🇫🇷', '🇫🇷', '🔴', '🔴', '🇨🇱', '🇨🇱', '🔵', '🇨🇳', '🇨🇳', '🔴', '🔷', '🔷', '🔷', '🔷']),
 Row(id=33749, emojis=['🌎', '👩', '🏼', '💻', '👨', '🏻', '💻', '🖋', '✔', '✔', '✔', '✔', '🔵', '🔴', '🔴', '🔷', '⁉']),
 Row(id=33011, emojis=['❗', '❗', '❗', '🔆', '📸', '⛱', '❤', '👨', '👩', '👧', '👦', '🍦', '🎹', '🎷']),
 Row(id=33557, emojis=['❗', '❗', '🔹', '🔹', '🔹', '🔹', '▪', '▪', '▪', '▪', '▪']),
 Row(id=15697, emojis=['✔', '✔', '✔', '✔', '✔', '✔', '✔', '✔', '✔', '✔', '✔']),
 Row(id=34251, emojis=['📽', '🎞', '🎬', '⬇', '⬇', '⬇', '🕖', '📍', '🎬', '🕖', '📍']),
 Row(id=34575, emojis=['🇩🇪', '🇩🇪', '🇩🇪', '🔎', '💿', '⏰', '👌', '🏻', '👍', '🏼']),
 Row(id=39682, emojis=['🤔', '📌', '🔹', '🔹', '🔹', '🔹', '🔹', '🔹', '🔹', '🔹']),
 Row(id=34896, emojis=['✒', '📉', '🎓', '🔴', '🔴', '📩', '✏', '📈', '🎓', '🔵']),
 Row(id=15550, emojis=['✔', '✔', '✔', '✔', '✔', '✔', '✔', '✔', '✔', '✔']),
 Row(id=34359, emojis=['🔴', '🔴', '🔴', '🔵', '🔵', '🔵', '🔴', '🔴', '🔴']),
 Row(id=37463, emojis=['🏁', '🔵', '🔴', '🔵', '🔴', '🔵', '📍', '🕐'

## Emoji Sentiment Analysis

In [15]:
from emosent import get_emoji_sentiment_rank
from statistics import mean

def calculate_senti_score(plain_text):
    if plain_text is None or plain_text == '':
        return 0
    else:
        sentiment = []
        for item in plain_text:
            try:
                sentiment.append(np.float(get_emoji_sentiment_rank(item)['sentiment_score']))
            except:
                continue
        L = [float(n) for n in sentiment if n]
        score = sum(L)/len(L) if L else 0
        return score

find_sentiments = fn.udf(lambda y: calculate_senti_score(y), StringType())
df_with_sentiment = df_with_emojis.withColumn("sentiment_score", find_sentiments(col("emojis")))

In [16]:
df_with_sentiment.select("id","sentiment_score")\
                    .where(col('text') != '')\
                    .withColumn('emotion', fn.when(col('sentiment_score').cast('float')>0, 'Positive')\
                    .when(col('sentiment_score').cast('float')<0, 'Negative')\
                    .otherwise('Neutral'))\
                    .orderBy('sentiment_score', ascending=True)\
                    .head(10)

[Row(id=17584, sentiment_score='-0.018', emotion='Negative'),
 Row(id=36360, sentiment_score='-0.111', emotion='Negative'),
 Row(id=41316, sentiment_score='-0.122', emotion='Negative'),
 Row(id=35652, sentiment_score='-0.16449999999999998', emotion='Negative'),
 Row(id=41039, sentiment_score='-0.211', emotion='Negative'),
 Row(id=35865, sentiment_score='-0.214', emotion='Negative'),
 Row(id=41266, sentiment_score='-0.24524999999999997', emotion='Negative'),
 Row(id=40071, sentiment_score='-0.368', emotion='Negative'),
 Row(id=40301, sentiment_score='-0.4695', emotion='Negative'),
 Row(id=40090, sentiment_score='-0.571', emotion='Negative')]

## Probable Fans

In [17]:
fdf = spark.read.parquet("shared_data/bigdata20/followers_posts_likes.parquet/*.parquet")
fdf.printSchema()

root
 |-- itemType: string (nullable = true)
 |-- ownerId: integer (nullable = true)
 |-- itemId: integer (nullable = true)
 |-- likerId: integer (nullable = true)



In [18]:
def top_fans(likerId, topfdf):
    return topfdf.where(col("likerId")==likerId).orderBy("count", ascending=False).select(col("ownerId").alias("UserId"), col("count").alias("NoOfLikes")).head(10)

In [19]:
topfdf = fdf.where(col("likerId") != col("ownerId")).groupBy("likerId", "ownerId").count().orderBy(["count"], ascending=False)
print("User-14 is a probable fan of:\n")
print(top_fans('14', topfdf))

User-14 is a probable fan of:

[Row(UserId=1986125, NoOfLikes=2), Row(UserId=3680017, NoOfLikes=1), Row(UserId=35524, NoOfLikes=1), Row(UserId=3420, NoOfLikes=1)]


## Probable Friends

In [20]:
window = Window.partitionBy(topfdf['likerId']).orderBy(topfdf['count'].desc())
top_friend = topfdf.select('*', rank().over(window).alias('rank')).filter(col('rank') <= 1)
top_friend.head(10)

[Row(likerId=496, ownerId=59139083, count=1, rank=1),
 Row(likerId=2142, ownerId=5411213, count=5, rank=1),
 Row(likerId=3918, ownerId=145254284, count=1, rank=1),
 Row(likerId=7880, ownerId=2812004, count=2, rank=1),
 Row(likerId=9376, ownerId=111195, count=1, rank=1),
 Row(likerId=12046, ownerId=3824163, count=2, rank=1),
 Row(likerId=13832, ownerId=15221, count=1, rank=1),
 Row(likerId=18944, ownerId=15221, count=1, rank=1),
 Row(likerId=20135, ownerId=75791, count=3, rank=1),
 Row(likerId=20683, ownerId=591512, count=1, rank=1)]

In [21]:
temp = top_friend.select("likerId", "ownerId")
friends_list = temp.alias('left').join(temp.alias('right'), col('right.likerId') == col('left.ownerId'), how="inner").filter(col('left.likerId') == col('right.ownerId')).select(col('left.likerId'),col('left.ownerId'))
friends_list.head(10)

[Row(likerId=370123, ownerId=588552),
 Row(likerId=80734900, ownerId=306995029),
 Row(likerId=182703092, ownerId=158548069),
 Row(likerId=217400123, ownerId=272787573),
 Row(likerId=240137155, ownerId=327458),
 Row(likerId=272076217, ownerId=209077977),
 Row(likerId=3670245, ownerId=203962500),
 Row(likerId=3670245, ownerId=79962265),
 Row(likerId=4428906, ownerId=126259762),
 Row(likerId=230853332, ownerId=282798485)]

## Stop Spark

In [22]:
spark.stop()