In [None]:
import findspark
import pyspark
from pyspark.sql import SparkSession, Row, Window
import pandas as pd
from collections import OrderedDict
import json, xmltodict
import pyspark.sql.functions as F
import os

In [None]:
# Create (or reuse) the Spark session and context used for the analysis.
# findspark.init() makes the local Spark installation importable.
findspark.init()

# Build through SparkSession.builder so the cell is safe to re-run:
# getOrCreate() returns the already-running session instead of failing with
# "Cannot run multiple SparkContexts at once", which is what constructing
# pyspark.SparkContext(...) directly does on a second execution.
spark = SparkSession.builder.appName("FB_Messenger").getOrCreate()

# Keep `sc` available for any cell that uses the low-level context.
sc = spark.sparkContext

In [None]:
# Master dataframe that all FB message rows get appended to.
# Column order matters: later cells union rows onto this frame by position.
fb_message_cols = ["title", "thread_type", "sender_name", "content", "timestamp_ms"]

# No placeholder row any more. The old ('Title', 'Type', 'Sender', 'Message', 1234)
# tuple existed only so Spark could infer column types, but it was exported
# to the final CSV as if it were a real message. The name is kept (empty) so
# anything that still references it keeps working.
fb_message_default_vals = []

# Explicit schema in DDL form, using the same types Spark inferred from the
# old placeholder row (four strings and a 64-bit integer timestamp).
fb_message_col_types = ["string", "string", "string", "string", "long"]
fb_message_schema = ", ".join(
    f"{col_name} {col_type}"
    for col_name, col_type in zip(fb_message_cols, fb_message_col_types)
)

fb_messages_df = spark.createDataFrame(fb_message_default_vals, fb_message_schema)

In [None]:
# Parse through each FB chat folder and compile its messages into fb_messages_df,
# then export the result to fb_message_df.csv.
#
# Layout read by this cell: <inbox>/<chat folder>/message_<n>.json

# Path pointing at FB messages main directory. The FB_INBOX_DIR environment
# variable, when set, overrides the built-in default so the notebook can run
# on another machine without editing this cell.
path_of_the_fb_message_inbox = os.environ.get(
    'FB_INBOX_DIR',
    r'C:\Users\RajNa\Downloads\FB_Raj_12212021\messages\inbox',
)

# Rows are gathered as plain tuples and handed to Spark once at the end.
# Creating and unioning a one-row DataFrame per message makes the query plan
# grow with every message and is dramatically slower.
fb_message_rows = []

# Parse through each folder in the Facebook Inbox folder
for fb_chat in sorted(os.listdir(path_of_the_fb_message_inbox)):
    fb_chat_dir = os.path.join(path_of_the_fb_message_inbox, fb_chat)

    # Stray files at the inbox level are not chats; os.listdir would fail on them
    if not os.path.isdir(fb_chat_dir):
        continue

    # Parse through each subfolder entry (representing each individual or group chat)
    for fb_chat_details in sorted(os.listdir(fb_chat_dir)):

        # Only process message_<n>.json files (not photos, videos, etc).
        # The previous 'message_8' filter matched a single file number and
        # silently skipped every other message file.
        if not (fb_chat_details.startswith('message_') and fb_chat_details.endswith('.json')):
            continue

        # Read in entire JSON file (multiline)
        fb_chat_df = spark.read.option("multiline", "true").json(os.path.join(fb_chat_dir, fb_chat_details))

        # Drop the columns we don't need
        fb_chat_df = fb_chat_df.drop('is_still_participant', 'magic_words', 'thread_path')

        # first() is a Spark action, so fetch the single row once and reuse it
        fb_chat_row = fb_chat_df.first()
        if fb_chat_row is None:
            continue
        fb_chat_fields = fb_chat_row.asDict()

        # Chat title (username or groupchat name) and thread type (regular, regulargroup, etc)
        chat_title = fb_chat_fields.get('title')
        chat_thread_type = fb_chat_fields.get('thread_type')

        # For each message in that file (individual JSON row)
        # Fields in Messages Node: content, sender_name, timestamp_ms + is_unsent, photos, reactions, share, sticker, type, users
        for message in fb_chat_fields.get('messages') or []:
            # Row -> dict so a field missing from this file's schema reads as
            # None instead of raising (e.g. a chat that contains only photos)
            message_fields = message.asDict()

            fb_message_rows.append((
                chat_title,
                chat_thread_type,
                message_fields.get('sender_name'),
                # Empty (ex: a photo) or missing content is stored as an empty string
                message_fields.get('content') or '',
                # Timestamp stays in ms, to be converted later
                message_fields.get('timestamp_ms'),
            ))

# Add all collected rows to the master df in a single union
if fb_message_rows:
    fb_messages_df = fb_messages_df.union(spark.createDataFrame(fb_message_rows, fb_message_cols))

# Preview once; show() prints the table itself and returns None, so it is not wrapped in print()
fb_messages_df.show()

# Save df to csv file
fb_messages_df.toPandas().to_csv('fb_message_df.csv')

In [None]:
# Test code to understand schema, using the last chat file loaded into fb_chat_df.
# first() is a Spark action, so fetch the row once and reuse it.
sample_chat_row = fb_chat_df.first()
sample_messages = (sample_chat_row.asDict().get("messages") if sample_chat_row is not None else None) or []

# Print the first message explicitly: a bare expression is only displayed
# when it is the last statement of a cell, so it was previously discarded.
if sample_messages:
    print(sample_messages[0])

# Other fields that can be probed the same way:
#fb_messages_df.first()['title']
#fb_messages_df.first()['thread_type']

# Print every non-empty message body; messages without a content field are skipped
for sample_message in sample_messages:
    sample_content = sample_message.asDict().get('content')
    if sample_content:
        print(sample_content)