In [188]:
import numpy as np
import json
import re
from collections import defaultdict
from matplotlib import pyplot as plt
%matplotlib inline

In [20]:
# Path (relative to the notebook's working directory) of the JSON file that
# load_messages() opens; the file must hold a top-level 'messages' list.
MESSAGES_JSON_PATH = 'data/message_1.json'

In [267]:
def load_messages(high_unicode=False):
    """
    Iterate over the text messages stored in ``MESSAGES_JSON_PATH``.

    The file is parsed as JSON and its top-level ``'messages'`` list is
    scanned. Only entries whose ``'type'`` equals ``'Generic'`` and that
    carry both a ``'sender_name'`` and a string ``'content'`` are used;
    everything else is skipped silently.

    Parameters
    ----------
    high_unicode : bool, optional
        If False (default), every non-ASCII character (emoji etc.) is
        dropped and the content is yielded as an ASCII-only ``str``.
        If True, only the characters whose code point is above 126 are
        kept and the content is yielded as a ``list`` of one-character
        strings (not a ``str``).

    Yields
    ------
    tuple
        ``(sender_name, content)`` for each message whose filtered
        content is non-empty.
    """
    # JSON text is UTF-8 by specification, so do not depend on the platform
    # default encoding. The file is fully parsed and closed before the first
    # item is yielded, so no handle stays open while the generator is idle.
    with open(MESSAGES_JSON_PATH, 'r', encoding='utf-8') as json_file:
        messages = json.load(json_file)['messages']

    for message in messages:
        # .get(): an entry without a 'type' key is skipped rather than
        # aborting the whole iteration with a KeyError.
        if message.get('type') != 'Generic':
            continue
        try:
            sender_name = message['sender_name']
            content = message['content']
        except KeyError:
            continue
        if not isinstance(content, str):
            # e.g. JSON null: there is no text to filter
            continue
        # clobber the emojis; they're hard to work with
        if high_unicode:
            content = [c for c in content if ord(c) > 126]
        else:
            content = content.encode('ascii', errors='ignore').decode('ascii')
        if content:
            yield sender_name, content

In [152]:
def get_all_messages():
    """Yield the content of every message from load_messages(), dropping
    the sender name."""
    for _sender, text in load_messages():
        yield text

In [230]:
def get_corpus_by_sender(high_unicode=False):
    """Group message contents by sender name.

    NOTE: this notebook defines ``get_corpus_by_sender`` a second time in a
    later cell. The signature here matches that later definition so that
    re-running cells in any order still lets ``get_total_characters`` pass
    ``high_unicode`` through. Consider deleting one of the two cells.

    Parameters
    ----------
    high_unicode : bool, optional
        Forwarded unchanged to ``load_messages``. Defaults to False.

    Returns
    -------
    collections.defaultdict
        Maps each sender name to the list of that sender's message
        contents, in the order ``load_messages`` yields them.
    """
    corpus_by_sender = defaultdict(list)
    for sender, msg in load_messages(high_unicode):
        corpus_by_sender[sender].append(msg)
    return corpus_by_sender

In [192]:
def get_tags():
    """Tally every ``@name`` mention found in the messages.

    A tag is an ``@`` immediately followed by one or more ASCII letters;
    matching stops at the first non-letter character.

    Returns
    -------
    collections.defaultdict
        Maps each tag (including its leading ``@``) to the number of times
        it occurs across all messages from ``get_all_messages``. Tags that
        were never seen read as 0.
    """
    # Raw string: '\@' in an ordinary literal is an invalid escape sequence
    # and triggers a warning on current Python versions. '@' needs no
    # escaping in a regular expression, so the pattern is equivalent.
    tag_matcher = re.compile(r'@[A-Za-z]+')
    tags = defaultdict(int)
    for msg in get_all_messages():
        for tag in tag_matcher.findall(msg):
            tags[tag] += 1
    return tags

In [241]:
def ordered_dict(d):
    """Return the items of ``d`` as a list of ``(value, key)`` tuples,
    sorted in descending order (by value first, then by key)."""
    flipped = [(value, key) for key, value in d.items()]
    flipped.sort(reverse=True)
    return flipped

In [234]:
def get_lemma_frequencies():
    """Return a dict mapping each lower-cased word to its relative frequency.

    Words are runs of ASCII letters and apostrophes found in the messages
    from ``get_all_messages``. A word's frequency is its count divided by
    the total number of word occurrences (plain ratio, no logarithm).
    Words seen only once are left out of the result, although they still
    count towards the total.
    """
    token_pattern = re.compile("[A-Za-z']+")

    # Collect every token first; the overall total is then just the length.
    tokens = [raw.lower()
              for text in get_all_messages()
              for raw in token_pattern.findall(text)]
    n_tokens = len(tokens)

    counts = defaultdict(int)
    for token in tokens:
        counts[token] += 1

    frequencies = {}
    for token, occurrences in counts.items():
        if occurrences > 1:
            frequencies[token] = occurrences / n_tokens
    return frequencies

In [237]:
def get_English_frequencies(csv_path='data/english_counts.csv'):
    """Return a dict mapping each lower-cased word to its relative frequency.

    The file is read as text and every line of the form ``word,count`` is
    used. Lines whose count is not a number (for example a header row
    such as ``word,count``) are skipped.

    Parameters
    ----------
    csv_path : str, optional
        Location of the counts file. Defaults to the path this notebook
        has always used.

    Returns
    -------
    dict
        ``word -> count / total``, where ``total`` is the sum of all counts
        read. Words whose accumulated count is not greater than 1 are
        left out, although they still contribute to ``total``.
    """
    kv_line_matcher = re.compile(r"^\s*(\w+),(\w+)$",
                                 re.MULTILINE)
    # The whole text is needed for the multi-line regex; close the file
    # as soon as it has been read.
    with open(csv_path, 'r') as f:
        csv_text = f.read()

    tally = dict()
    total = 0
    for match in kv_line_matcher.finditer(csv_text):
        try:
            count = float(match[2])
        except ValueError:
            # second field is not numeric (e.g. a header row)
            continue
        lemma = match[1].lower()
        total += count
        # Lower-casing can merge rows ("The" / "the"): accumulate rather
        # than overwrite so the tally stays consistent with the total.
        tally[lemma] = tally.get(lemma, 0) + count
    return {lemma: count / total
            for lemma, count
            in tally.items()
            if count > 1}

In [243]:
# The idea was to compare each word's KL-divergence (or cross-entropy)
# summand against standard English, to see which words we use differently
# from the general English corpus. Overall I think the result reflects the
# differences between chat English and regular English more than anything
# about this particular chat.
def get_KL_divergences():
    """Return a dict mapping each word to its KL-divergence summand.

    For every word present in both ``get_lemma_frequencies()`` (frequency
    ``p``) and ``get_English_frequencies()`` (frequency ``q``) the value is
    ``p * (log(p) - log(q))`` using the natural logarithm. Words missing
    from the English table are skipped.
    """
    reference = get_English_frequencies()
    observed = get_lemma_frequencies()
    return {
        word: p * (np.log(p) - np.log(reference[word]))
        for word, p in observed.items()
        if word in reference
    }

In [269]:
def get_corpus_by_sender(high_unicode=False):
    """Collect message contents into per-sender lists.

    ``high_unicode`` is handed straight to ``load_messages``. The result
    is a ``defaultdict(list)`` keyed by sender name, each list holding
    that sender's contents in the order they were yielded.
    """
    grouped = defaultdict(list)
    for pair in load_messages(high_unicode):
        name, text = pair
        grouped[name].append(text)
    return grouped

In [270]:
def get_total_characters(high_unicode=False):
    """Return a dict mapping each sender name to the summed length of all
    of that sender's message contents, as grouped by
    ``get_corpus_by_sender(high_unicode)``."""
    totals = {}
    for sender, contents in get_corpus_by_sender(high_unicode).items():
        totals[sender] = sum(len(item) for item in contents)
    return totals