In [None]:
import findspark
import pyspark
from pyspark.sql import SparkSession, Row, Window
import pandas as pd
from collections import OrderedDict
import json, xmltodict
import pyspark.sql.functions as F
import os

In [None]:
# Start (or reuse) the local Spark session used by every cell below.
# Building through SparkSession.builder...getOrCreate() keeps this cell safe to
# re-run: constructing pyspark.SparkContext(...) directly raises
# "Cannot run multiple SparkContexts at once" when a context already exists.
findspark.init()
spark = (
    SparkSession.builder
    .appName("FB_Messenger_Analysis")
    .master('local[*]')
    .getOrCreate()
)
# Keep the `sc` handle available for any cell that uses the SparkContext directly.
sc = spark.sparkContext

In [None]:
# Load the parsed Messenger export. The first CSV row is used as the header; no
# schema is supplied or inferred here, so every column is read as a string.
# The location can be overridden with the FB_PARSED_MESSAGES_CSV environment
# variable so the notebook is not tied to a single machine; the default is the
# original local path.
fb_messages_csv_path = os.environ.get(
    "FB_PARSED_MESSAGES_CSV",
    r"C:\Users\RajNa\Documents\GitHub\Pyspark_Message_Analysis_v4\Pyspark_Message_Analysis_v4\FB_Parsed_Messages.csv",
)
fb_messages_all_df = spark.read.option("header", True).csv(fb_messages_csv_path)

In [None]:
# Derive date, datetime and year columns from the epoch-millisecond `timestamp_ms` column.
# TODO: move this into the load step once the OOM errors there are resolved.

# from_unixtime renders timestamps in the session time zone, so pin it to
# America/New_York while the columns are defined. The try/finally guarantees the
# override is removed even if one of the column definitions raises.
# NOTE(review): the override is removed before any action runs on the frame;
# confirm the derived columns really come out in America/New_York.
spark.conf.set("spark.sql.session.timeZone", "America/New_York")
try:
    # Milliseconds -> seconds, computed once and shared by both conversions.
    epoch_seconds = F.col('timestamp_ms') / 1000
    fb_messages_all_df = (
        fb_messages_all_df
        .withColumn('timestamp_date', F.to_date(F.from_unixtime(epoch_seconds)))
        .withColumn('timestamp_datetime', F.from_unixtime(epoch_seconds))
        .withColumn('timestamp_year', F.year(F.col('timestamp_date')))
    )
finally:
    spark.conf.unset("spark.sql.session.timeZone")

In [None]:
# Who sends the most messages overall?
# Counts messages per sender across every thread (my own messages included) and
# ranks senders by that count, highest first.

fb_messages_all_count_df = fb_messages_all_df.groupBy('sender_name').count()

# Empty partitionBy() => a single global window, so row_number() is a rank over
# all senders. Senders with equal counts receive distinct, arbitrarily ordered ranks.
countWindowSpec = Window.partitionBy().orderBy(F.col('count').desc())

# DataFrame.show() prints and returns None, so bind the ranked frame first and
# display it separately; otherwise the variable would hold None.
fb_window_sender_count_df = fb_messages_all_count_df.withColumn('rank_sender', F.row_number().over(countWindowSpec))
fb_window_sender_count_df.show(25)


In [None]:
# Who messages me the most, per year, across all chats?
# Counts messages per (sender, year), excluding messages I sent.

my_name = 'Raj Narayan'

fb_messages_all_count_year_df = (
    fb_messages_all_df
    .filter(fb_messages_all_df.sender_name != my_name)
    .groupBy('sender_name', 'timestamp_year')
    .count()
)

# The window is not partitioned by year, so rank_sender is one global ranking
# over every (sender, year) row rather than a ranking that restarts each year.
countWindowSpec = Window.partitionBy().orderBy(F.col('count').desc())

# DataFrame.show() prints and returns None, so bind the ranked frame first and
# display it separately; otherwise the variable would hold None.
fb_window_sender_count_year_df = fb_messages_all_count_year_df.withColumn('rank_sender', F.row_number().over(countWindowSpec))
fb_window_sender_count_year_df.show(50)


In [None]:
# How many messages do I send by year?
# Keeps only messages whose sender is me and counts them per year.

my_name = 'Raj Narayan'

fb_messages_me_count_year_df = (
    fb_messages_all_df
    .filter(fb_messages_all_df.sender_name == my_name)
    .groupBy('sender_name', 'timestamp_year')
    .count()
)

# Single global window: years are ranked by message count, busiest year first.
countWindowSpec = Window.partitionBy().orderBy(F.col('count').desc())

# DataFrame.show() prints and returns None, so bind the ranked frame first and
# display it separately; otherwise the variable would hold None.
fb_window_me_count_year_df = fb_messages_me_count_year_df.withColumn('rank_sender', F.row_number().over(countWindowSpec))
fb_window_me_count_year_df.show(50)


In [None]:
# How many messages do I send by year and by thread type?
# Keeps only messages whose sender is me and counts them per (year, thread_type).

my_name = 'Raj Narayan'

fb_messages_me_type_count_year_df = (
    fb_messages_all_df
    .filter(fb_messages_all_df.sender_name == my_name)
    .groupBy('sender_name', 'timestamp_year', 'thread_type')
    .count()
)

# Single global window: every (year, thread_type) row is ranked by its count,
# highest first.
countWindowSpec = Window.partitionBy().orderBy(F.col('count').desc())

# DataFrame.show() prints and returns None, so bind the ranked frame first and
# display it separately; otherwise the variable would hold None.
fb_window_me_type_count_year_df = fb_messages_me_type_count_year_df.withColumn('rank_sender', F.row_number().over(countWindowSpec))
fb_window_me_type_count_year_df.show(50)


In [None]:
# Who messages me the most, per year and thread type, across all chats?
# Counts messages per (sender, year, thread_type), excluding messages I sent.

my_name = 'Raj Narayan'

fb_messages_all_type_count_year_df = (
    fb_messages_all_df
    .filter(fb_messages_all_df.sender_name != my_name)
    .groupBy('sender_name', 'timestamp_year', 'thread_type')
    .count()
)

# The window is not partitioned, so rank_sender is one global ranking over
# every (sender, year, thread_type) row.
countWindowSpec = Window.partitionBy().orderBy(F.col('count').desc())

# DataFrame.show() prints and returns None, so bind the ranked frame first and
# display it separately; otherwise the variable would hold None.
fb_window_sender_type_count_year_df = fb_messages_all_type_count_year_df.withColumn('rank_sender', F.row_number().over(countWindowSpec))
fb_window_sender_type_count_year_df.show(50)


In [None]:
# Who messages me the most, per year, outside of group chats?
# Excludes messages I sent and every thread whose type is 'RegularGroup', then
# counts messages per (sender, year, thread_type).

my_name = 'Raj Narayan'

fb_messages_regular_count_year_df = (
    fb_messages_all_df
    .filter(fb_messages_all_df.sender_name != my_name)
    .filter(fb_messages_all_df.thread_type != 'RegularGroup')
    .groupBy('sender_name', 'timestamp_year', 'thread_type')
    .count()
)

# The window is not partitioned, so rank_sender is one global ranking over
# every remaining (sender, year, thread_type) row.
countWindowSpec = Window.partitionBy().orderBy(F.col('count').desc())

# DataFrame.show() prints and returns None, so bind the ranked frame first and
# display it separately; otherwise the variable would hold None.
fb_window_regular_count_year_df = fb_messages_regular_count_year_df.withColumn('rank_sender', F.row_number().over(countWindowSpec))
fb_window_regular_count_year_df.show(50)


In [None]:
# Who messages me the most, per year, outside of one-to-one chats?
# Excludes messages I sent and every thread whose type is 'Regular', then
# counts messages per (sender, year, thread_type).

my_name = 'Raj Narayan'

fb_messages_regular_group_count_year_df = (
    fb_messages_all_df
    .filter(fb_messages_all_df.sender_name != my_name)
    .filter(fb_messages_all_df.thread_type != 'Regular')
    .groupBy('sender_name', 'timestamp_year', 'thread_type')
    .count()
)

# The window is not partitioned, so rank_sender is one global ranking over
# every remaining (sender, year, thread_type) row.
countWindowSpec = Window.partitionBy().orderBy(F.col('count').desc())

# DataFrame.show() prints and returns None, so bind the ranked frame first and
# display it separately; otherwise the variable would hold None.
fb_window_regular_group_count_year_df = fb_messages_regular_group_count_year_df.withColumn('rank_sender', F.row_number().over(countWindowSpec))
fb_window_regular_group_count_year_df.show(50)


In [None]:
# Per-sender message counts with one column per year (all chats, my own messages
# and rows without a year excluded), plus a rank column for each of 2019, 2020, 2021.

my_name = 'Raj Narayan'

fb_messages_all_count_year_pivot_df = (
    fb_messages_all_df
    .filter(fb_messages_all_df.sender_name != my_name)
    .filter(fb_messages_all_df.timestamp_year.isNotNull())
    .groupBy('sender_name')
    .pivot('timestamp_year')
    .count()
)

# One un-partitioned window per ranked year, ordered by that year's count, highest first.
orderBy2021WindowSpec = Window.partitionBy().orderBy(F.desc('2021'))
orderBy2020WindowSpec = Window.partitionBy().orderBy(F.desc('2020'))
orderBy2019WindowSpec = Window.partitionBy().orderBy(F.desc('2019'))

# Append the rank columns in the order 2019, 2020, 2021, then print up to 1000 rows.
pivot_with_ranks_df = fb_messages_all_count_year_pivot_df
for rank_column, window_spec in (
    ('2019_rank', orderBy2019WindowSpec),
    ('2020_rank', orderBy2020WindowSpec),
    ('2021_rank', orderBy2021WindowSpec),
):
    pivot_with_ranks_df = pivot_with_ranks_df.withColumn(rank_column, F.row_number().over(window_spec))

pivot_with_ranks_df.show(1000)

In [None]:
# Per-sender message counts with one column per year, restricted to threads of
# type 'Regular' (my own messages and rows without a year excluded), plus a rank
# column for each of 2019, 2020, 2021.

my_name = 'Raj Narayan'

fb_messages_all_count_year_pivot_df = (
    fb_messages_all_df
    .filter(fb_messages_all_df.sender_name != my_name)
    .filter(fb_messages_all_df.timestamp_year.isNotNull())
    .filter(fb_messages_all_df.thread_type == 'Regular')
    .groupBy('sender_name')
    .pivot('timestamp_year')
    .count()
)

# One un-partitioned window per ranked year, ordered by that year's count, highest first.
orderBy2021WindowSpec = Window.partitionBy().orderBy(F.desc('2021'))
orderBy2020WindowSpec = Window.partitionBy().orderBy(F.desc('2020'))
orderBy2019WindowSpec = Window.partitionBy().orderBy(F.desc('2019'))

# Append the rank columns in the order 2019, 2020, 2021, then print up to 1000 rows.
regular_pivot_with_ranks_df = fb_messages_all_count_year_pivot_df
for rank_column, window_spec in (
    ('2019_rank', orderBy2019WindowSpec),
    ('2020_rank', orderBy2020WindowSpec),
    ('2021_rank', orderBy2021WindowSpec),
):
    regular_pivot_with_ranks_df = regular_pivot_with_ranks_df.withColumn(rank_column, F.row_number().over(window_spec))

regular_pivot_with_ranks_df.show(1000)