In [None]:
import pandas as pd

import datetime
import json
import os
import re

from tqdm import tqdm_notebook as tqdm
from functools import partial
from pprint import pprint

In [None]:
def load_messages(path, fix_mojibake_escapes, from_time=None):
    """Load a single conversation JSON file into a DataFrame.

    Parameters
    ----------
    path : str or path-like
        JSON file whose top-level object has a ``'messages'`` list.
    fix_mojibake_escapes : callable
        Called with the raw bytes of the file; its return value is what gets
        parsed as JSON.
    from_time : datetime.datetime, optional
        Naive cutoff compared against each message's UTC timestamp. Reading
        stops at the first message older than the cutoff.

    Returns
    -------
    pandas.DataFrame
        One row per kept message with columns ``sender_name``, ``time``
        (UTC, formatted ``'%Y-%m-%d %H:%M:%S'``) and ``content``.

    Raises
    ------
    KeyError
        If the parsed JSON has no top-level ``'messages'`` key.

    Notes
    -----
    Messages missing ``timestamp_ms``, ``sender_name`` or ``content`` are
    skipped.
    """
    columns = ['sender_name', 'time', 'content']

    # Read everything up front so the file handle is released before parsing.
    with open(path, 'rb') as bdata:
        raw = fix_mojibake_escapes(bdata.read())

    # json.loads() accepts UTF-8 bytes directly. The old `encoding=` keyword was
    # ignored since Python 3.1 and raises TypeError from Python 3.9 on, so it is
    # dropped. strict=False permits raw control characters inside JSON strings.
    payload = json.loads(raw, strict=False)

    records = []
    for msg in payload['messages']:
        try:
            seconds = int(msg['timestamp_ms'] / 1000)
            # Naive UTC datetime; same value utcfromtimestamp() produced, without
            # relying on that deprecated (Python 3.12+) helper.
            message_date = datetime.datetime.fromtimestamp(
                seconds, tz=datetime.timezone.utc).replace(tzinfo=None)
            if from_time and message_date < from_time:
                # NOTE(review): stopping here assumes the file lists messages
                # newest-first - confirm for your export format.
                break

            sender_name = msg['sender_name']
            content = msg['content']
        except KeyError:
            # Entry lacks one of the required fields; skip it.
            continue

        records.append({
            'sender_name': sender_name,
            'time': message_date.strftime('%Y-%m-%d %H:%M:%S'),
            'content': content,
        })

    # Passing `columns` keeps the schema stable even when no message was kept.
    return pd.DataFrame(records, columns=columns)


def load(inbox_path, from_time=None, message_filename='message.json'):
    """Load every conversation found under ``inbox_path`` into one DataFrame.

    Each entry of ``inbox_path`` is treated as a conversation folder; folders
    that do not contain ``message_filename`` are skipped.

    Parameters
    ----------
    inbox_path : str or path-like
        Directory whose sub-directories each hold one conversation.
    from_time : datetime.datetime, optional
        Cutoff forwarded unchanged to ``load_messages``.
    message_filename : str, optional
        Name of the JSON file to read inside each conversation folder.
        Defaults to ``'message.json'``; override it if your export uses a
        different file name.

    Returns
    -------
    pandas.DataFrame
        All conversations concatenated with a fresh integer index. When no
        conversation file is found, an empty frame with the columns
        ``sender_name``, ``time`` and ``content`` is returned.
    """
    # Built once, outside the loop: rewrites every `\u00XX` escape (two lowercase
    # hex digits) in the raw file into the single byte 0xXX. Presumably the export
    # escapes each UTF-8 byte separately, so this restores real UTF-8 before
    # parsing - confirm against your data.
    fix_mojibake_escapes = partial(
        re.compile(rb'\\u00([\da-f]{2})').sub,
        lambda m: bytes.fromhex(m.group(1).decode()))

    conversations = []
    # os.listdir() order is arbitrary; sorting makes the row order reproducible.
    for dirname in tqdm(sorted(os.listdir(inbox_path))):
        full_path = os.path.join(inbox_path, dirname, message_filename)
        if not os.path.isfile(full_path):
            continue
        conversations.append(
            load_messages(full_path, fix_mojibake_escapes, from_time))

    if not conversations:
        # pd.concat() raises ValueError on an empty list; return an empty frame
        # with the expected schema instead.
        return pd.DataFrame(columns=['sender_name', 'time', 'content'])

    return pd.concat(conversations, ignore_index=True)

In [None]:
# Optional: to load only messages at or after a cutoff, uncomment the three
# lines below instead of the plain load() call. The cutoff is a naive datetime
# and is compared against message timestamps converted to UTC.
# fromt = '2019-03-10 18:42:00'
# fromt = datetime.datetime.strptime(fromt, '%Y-%m-%d %H:%M:%S')
# df = load('./messages/inbox', fromt)

# Load every conversation folder under ./messages/inbox into one DataFrame.
df = load('./messages/inbox')

In [None]:
# Total number of rows loaded (one row per kept message).
len(df)

In [None]:
# Preview the first rows to sanity-check the sender_name / time / content columns.
df.head()

In [None]:
# Distinct sender names that appear across all loaded conversations.
df['sender_name'].unique()

In [None]:
# Write the combined frame to messages.csv in the working directory, without the
# index column. NOTE: the file contains message content - treat it as private data.
df.to_csv('messages.csv', index=False)