In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import os
import json
from matplotlib.dates import date2num
from datetime import datetime
from matplotlib.dates import DateFormatter
from matplotlib.ticker import FuncFormatter
from collections import Counter
import re

In [None]:
# Account details: fill both in before running the rest of the notebook.
username = ""  # Facebook username; it forms part of the export folder name below
name = ""  # your own Facebook display name
_message_dir = f"facebook-{username}/messages/inbox/"

# Axis labels for an hour-of-day plot, indexed by hour (0-23). The two trailing
# None entries leave positions 24 and 25 without a label.
TIME_OF_DAY = (
    ["midnight"]
    + [f"{hour}am" for hour in range(1, 12)]
    + ["noon"]
    + [f"{hour}pm" for hour in range(1, 12)]
    + [None, None]
)

In [None]:
# Build `people`: a display label for every conversation -> the inbox folder
# that stores it. The label is the comma-joined names of the other
# participants; when that label (lower-cased, spaces removed) does not match the
# part of the folder name before the first "_", the folder prefix is used
# instead (presumably the group-chat case - verify against your export).
people = {}
# sorted() makes the result deterministic when two folders share a label
# (the later folder wins); os.listdir returns entries in arbitrary order.
for folder in sorted(os.listdir(_message_dir)):
    message_path = os.path.join(_message_dir, folder, "message.json")
    if not os.path.isfile(message_path):
        # Skip stray files (e.g. .DS_Store) and folders without a transcript
        # instead of crashing the whole cell.
        continue
    folder_name = folder.split("_")[0]
    with open(message_path) as f:
        j = json.load(f)
    g_name = ", ".join(
        participant["name"] for participant in j["participants"] if participant["name"] != name
    )
    if g_name.lower().replace(" ", "") != folder_name:
        g_name = folder_name
    people[g_name] = folder

In [None]:
def retrieve_timestamps(person):
    """Return the (sender, sent-at) pair of every message in one conversation.

    `person` must be a key of the module-level `people` mapping. Each message's
    `timestamp_ms` is converted with `datetime.fromtimestamp`, which yields a
    naive datetime in the machine's local timezone. The returned list is
    ordered from the oldest message to the newest.
    """
    transcript_path = _message_dir + people[person] + "/message.json"
    with open(transcript_path) as handle:
        transcript = json.load(handle)

    pairs = []
    for entry in transcript['messages']:
        sender = entry['sender_name']
        sent_at = datetime.fromtimestamp(entry['timestamp_ms'] / 1000)
        pairs.append((sender, sent_at))
    pairs.sort(key=lambda pair: pair[1])
    return pairs

def retrieve_messages(person):
    """Return (sender, sent-at, text) for every message in one conversation.

    `person` must be a key of the module-level `people` mapping. Messages that
    carry no 'content' field get an empty string as their text. Times are
    produced by `datetime.fromtimestamp` (naive, local timezone) and the list
    is ordered from the oldest message to the newest.
    """
    transcript_path = _message_dir + people[person] + "/message.json"
    with open(transcript_path) as handle:
        transcript = json.load(handle)

    def as_record(entry):
        sender = entry['sender_name']
        sent_at = datetime.fromtimestamp(entry['timestamp_ms'] / 1000)
        return (sender, sent_at, entry.get('content', ''))

    return sorted(map(as_record, transcript['messages']), key=lambda record: record[1])

In [None]:
def cumulative_timestamp(timestamps, start = datetime.fromtimestamp(100000), end = None):
    """Plot the running total of messages against the time they were sent.

    Parameters
    ----------
    timestamps : str or list of (sender, datetime)
        Either a conversation key, which is loaded with `retrieve_timestamps`,
        or an already loaded list of (sender, datetime) pairs.
    start, end : datetime
        Only messages with ``start <= time < end`` are plotted. `end` defaults
        to the moment of the call.

    Returns
    -------
    The value returned by `ax.plot` for the cumulative curve.
    """
    if isinstance(timestamps, str):
        timestamps = retrieve_timestamps(timestamps)
    if end is None:
        # Resolve "now" per call; a datetime.now() default in the signature is
        # evaluated once, when the cell defining the function runs.
        end = datetime.now()
    # Sort so the running count is meaningful even for an unsorted list.
    kept = sorted(time for (_, time) in timestamps if start <= time < end)
    times = date2num(kept)
    count = list(range(1, len(times) + 1))
    fig, ax = plt.subplots()
    plot = ax.plot(times, count)
    ax.xaxis.set_major_formatter(DateFormatter("%m/%y"))
    return plot

def YOY(timestamps, start = datetime.fromtimestamp(100000), end = None):
    """Plot one cumulative message curve per calendar year (year over year).

    For every year present in the data, the x axis is the day of the year and
    the y axis is the running number of messages sent so far in that year.

    Parameters
    ----------
    timestamps : str or list of (sender, datetime)
        Either a conversation key, which is loaded with `retrieve_timestamps`,
        or an already loaded list of (sender, datetime) pairs.
    start, end : datetime
        Only messages with ``start <= time < end`` are used. `end` defaults to
        the moment of the call.

    Returns
    -------
    (fig, ax, lines) where `lines` holds one plotted line per year, in
    ascending year order.
    """
    if isinstance(timestamps, str):
        timestamps = retrieve_timestamps(timestamps)
    if end is None:
        # Resolve "now" per call; a datetime.now() default in the signature is
        # evaluated once, when the cell defining the function runs.
        end = datetime.now()
    # Bucket the day-of-year of each message by year in a single pass.
    days_by_year = {}
    for (_, time) in timestamps:
        if start <= time < end:
            days_by_year.setdefault(time.year, []).append(time.timetuple().tm_yday)
    fig, ax = plt.subplots()
    labels = []
    lines = []
    for year in sorted(days_by_year):
        # Sorted so the running count rises left to right even for unsorted input.
        year_days = sorted(days_by_year[year])
        line, = ax.plot(year_days, np.arange(1, len(year_days) + 1))
        lines.append(line)
        labels.append(str(year))
    ax.legend(lines, labels)
    return (fig, ax, lines)

In [None]:
# selected_months is a list of (month, year) tuples. The month is 1-indexed
# (January == 1) because it is compared directly with datetime.month.
def month_compare(timestamps, selected_months = ()):
    """Scatter the cumulative message count within each selected month.

    Parameters
    ----------
    timestamps : str or list of (sender, datetime)
        Either a conversation key, which is loaded with `retrieve_timestamps`,
        or an already loaded list of (sender, datetime) pairs.
    selected_months : iterable of (month, year)
        Months to compare; month is 1-12. A month listed twice is drawn once.

    Returns
    -------
    (fig, ax, lines) where `lines` holds one scatter collection per month.

    Only days that have at least one message get a point, and each point is
    placed at its real day of the month, so different months line up on the
    same calendar day.
    """
    if isinstance(timestamps, str):
        timestamps = retrieve_timestamps(timestamps)
    dates = [time for (_, time) in timestamps]
    data = {}
    for (month, year) in selected_months:
        per_day = Counter(date.day for date in dates if date.year == year and date.month == month)
        active_days = sorted(per_day)
        running_total = np.cumsum([per_day[day] for day in active_days])
        data[str(month) + "/" + str(year)] = (active_days, running_total)
    fig, ax = plt.subplots()
    labels = []
    lines = []
    for label, (active_days, running_total) in data.items():
        line = ax.scatter(active_days, running_total)
        lines.append(line)
        labels.append(label)
    ax.legend(lines, labels)
    return (fig, ax, lines)

In [None]:
def histograph(timestamps, func = lambda x: x, start = datetime.fromtimestamp(100000), end = None):
    """Plot how many messages fall on each value of `func(time)`.

    Parameters
    ----------
    timestamps : str or list of (sender, datetime)
        Either a conversation key, which is loaded with `retrieve_timestamps`,
        or an already loaded list of (sender, datetime) pairs.
    func : callable
        Maps each message datetime to the value it is grouped by (for example
        ``lambda t: t.hour``). The resulting values must be sortable.
    start, end : datetime
        Only messages with ``start <= time < end`` are counted. `end` defaults
        to the moment of the call.

    Returns
    -------
    (fig, ax, plot) where `plot` is the value returned by `ax.plot`.
    """
    if isinstance(timestamps, str):
        timestamps = retrieve_timestamps(timestamps)
    if end is None:
        # Resolve "now" per call; a datetime.now() default in the signature is
        # evaluated once, when the cell defining the function runs.
        end = datetime.now()
    data = [func(time) for (_, time) in timestamps if start <= time < end]
    # Sort by grouping value so the line is drawn left to right.
    count = dict(sorted(Counter(data).items()))
    fig, ax = plt.subplots()
    plot = ax.plot(list(count.keys()), list(count.values()))
    return (fig, ax, plot)

def format_for_hour(fig, ax, plot):
    """Label the x axis of an hour-of-day plot with names from TIME_OF_DAY.

    Takes and returns the same (fig, ax, plot) triple that `histograph`
    produces, so the two can be chained with ``format_for_hour(*histograph(...))``.
    Ticks that are not a whole hour in 0-23 get an empty label.
    """
    def hour_label(x, pos):
        hour = int(x)
        # Indexing TIME_OF_DAY blindly would wrap negative ticks around to the
        # end of the list, raise IndexError past its end, and give fractional
        # ticks the label of the hour below them.
        if hour != x or not 0 <= hour < 24:
            return ""
        return TIME_OF_DAY[hour]

    ax.xaxis.set_major_formatter(FuncFormatter(hour_label))
    return (fig, ax, plot)

In [None]:
# Conversation to analyse, given as a key of `people`: the other person's
# name, or for a group chat the chat name with every space removed.
person = ''

### all time

In [None]:
# Whole conversation, no date bounds: messages per hour of day, then the
# running message total.
all_time_stamps = retrieve_timestamps(person)  # load and parse the transcript once for both plots
format_for_hour(*histograph(all_time_stamps, func=lambda t: t.hour))
cumulative_timestamp(all_time_stamps)
# For a per-day histogram instead, call histograph with func=lambda t: t.date()
# and skip format_for_hour.
plt.show()

### this semester

In [None]:
# Current semester: everything from 22 Jan 2019 onward.
# The bound is defined once so the two plots cannot drift apart, and the
# transcript is loaded once instead of once per plot.
this_semester_start = datetime(year=2019, month=1, day=22)
this_semester_stamps = retrieve_timestamps(person)
format_for_hour(*histograph(this_semester_stamps, start=this_semester_start, func=lambda t: t.hour))
cumulative_timestamp(this_semester_stamps, start=this_semester_start)
plt.show()

### last semester

In [None]:
# Previous semester: 27 Aug 2018 up to (not including) 18 Dec 2018.
# The bounds are defined once so the two plots cannot drift apart, and the
# transcript is loaded once instead of once per plot.
last_semester_start = datetime(year=2018, month=8, day=27)
last_semester_end = datetime(year=2018, month=12, day=18)
last_semester_stamps = retrieve_timestamps(person)
format_for_hour(*histograph(last_semester_stamps, start=last_semester_start, end=last_semester_end, func=lambda t: t.hour))
cumulative_timestamp(last_semester_stamps, start=last_semester_start, end=last_semester_end)
plt.show()

### summer

In [None]:
# Summer break: 29 May 2018 up to (not including) 17 Aug 2018.
# The bounds are defined once so the two plots cannot drift apart, and the
# transcript is loaded once instead of once per plot.
summer_start = datetime(year=2018, month=5, day=29)
summer_end = datetime(year=2018, month=8, day=17)
summer_stamps = retrieve_timestamps(person)
format_for_hour(*histograph(summer_stamps, start=summer_start, end=summer_end, func=lambda t: t.hour))
cumulative_timestamp(summer_stamps, start=summer_start, end=summer_end)
plt.show()

In [None]:
# Year-over-year cumulative curves, followed by a comparison of month 3 in
# each year from 2015 through 2019.
YOY(retrieve_timestamps(person))
month_three_each_year = [(3, year) for year in range(2015, 2020)]
month_compare(person, selected_months=month_three_each_year)
plt.show()

### haha distribution

In [None]:
# Collect every run of three or more a/A/h/H characters from the conversation,
# discarding runs in which any single character appears three times in a row.
laugh_run = re.compile("[aAHh]{3,}")
triple_repeat = re.compile('(.)\\1\\1')
haha_messages = [
    run
    for (_sender, _sent_at, text) in retrieve_messages(person)
    for run in laugh_run.findall(text)
    if triple_repeat.search(run) is None
]

In [None]:
# Character length of every collected laugh, as a numpy array.
len_haha = np.array(list(map(len, haha_messages)))

In [None]:
# Histogram of laugh lengths with one bin per integer length.
if len(len_haha) > 0:
    # The edges must run one past the maximum: the last histogram bin is closed
    # on both sides, so edges stopping at max would merge the counts for
    # max - 1 and max (and leave a single edge when every length is equal).
    length_edges = np.arange(min(len_haha), max(len_haha) + 2)
    plt.hist(len_haha, bins=length_edges)
    plt.xlabel("laugh length (characters)")
    plt.ylabel("occurrences")
else:
    print("No laughs found; nothing to plot.")
plt.show()

In [None]:
def remove_outliers_std(data, m=3):
    """Return the values of `data` lying strictly within `m` standard deviations of the mean.

    Parameters
    ----------
    data : array-like of numbers
        Converted with `np.asarray`, so lists are accepted as well as arrays.
    m : number
        Width of the accepted band, in standard deviations.

    Returns
    -------
    A new numpy array holding the retained values in their original order.
    Empty input and input with zero spread are returned unchanged (as a copy).
    """
    values = np.asarray(data)
    if values.size == 0:
        return values.copy()
    spread = np.std(values)
    if spread == 0:
        # Every value equals the mean; the strict comparison below would
        # otherwise discard all of them as outliers.
        return values.copy()
    return values[np.abs(values - np.mean(values)) < m * spread]

In [None]:
# Drop lengths flagged by remove_outliers_std and redraw with 2-wide bins.
# NOTE: this overwrites len_haha, so re-running the cell trims the data again.
len_haha = remove_outliers_std(len_haha)
if len(len_haha) > 0:
    # Stop at max + 3 so the final edge always lies above the maximum. With a
    # stop of max + 1 the last edge could fall below max, and the largest
    # lengths were silently left out of the histogram.
    length_edges = np.arange(min(len_haha), max(len_haha) + 3, 2)
    plt.hist(len_haha, bins=length_edges)
    plt.xlabel("laugh length (characters)")
    plt.ylabel("occurrences")
else:
    print("No laughs left after outlier removal; nothing to plot.")
plt.show()

### message response distribution

In [None]:
# Seconds between each message and the one sent just before it, regardless of
# sender, drawn as a histogram with a log-scaled count axis.
ts = [sent_at for (_sender, sent_at) in retrieve_timestamps(person)]
dist = []
for earlier, later in zip(ts, ts[1:]):
    dist.append((later - earlier).total_seconds())
plt.hist(dist, log=True)
plt.show()

In [None]:
# Reply gaps: seconds between consecutive messages that come from different
# senders, keeping only gaps shorter than 12 hours. The cell's output is the
# number of such gaps divided by the total number of messages.
ts = retrieve_timestamps(person)
true_dist = []
for (prev_sender, prev_time), (next_sender, next_time) in zip(ts, ts[1:]):
    if prev_sender == next_sender:
        continue
    gap_seconds = (next_time - prev_time).total_seconds()
    if gap_seconds / 60 / 60 < 12:
        true_dist.append(gap_seconds)
len(true_dist) / len(ts)

In [None]:
# Histogram of the reply gaps (in seconds) computed in the previous cell.
plt.hist(x=true_dist)
plt.show()