In [44]:
#help from https://codereview.stackexchange.com/questions/183668/parse-complex-text-files-using-python

import re
import pandas as pd
from collections import defaultdict, Counter

In [41]:
class _RegExLib:
    """Classify one line of text by running every known pattern against it.

    After construction, ``player`` and ``diceroll`` each hold the
    ``re.Match`` produced by the corresponding pattern, or ``None`` when the
    pattern did not match at the start of the line.
    """
    # https://regexper.com can be used to visualise these patterns.
    # Greedy: matches from the start of the line up to its LAST colon.
    # NOTE(review): any line containing ':' matches, including URLs such as
    # 'http://...' -- confirm this is acceptable for the input data.
    _reg_player = re.compile('^.*:')
    # 'rolling', a space, then any text that contains a 'd' (e.g. a dice spec).
    _reg_diceroll = re.compile('rolling .*d.*')

    def __init__(self, line):
        # .match() anchors at the beginning of the line, so both patterns
        # are tested against the line's prefix only.
        self.player, self.diceroll = (
            pattern.match(line)
            for pattern in (self._reg_player, self._reg_diceroll)
        )

In [42]:
def parse(filepath):
    """
    Extract per-player dice rolls and word counts from a chat-log text file.

    The file is read line by line and every line is classified with
    ``_RegExLib``:

    * a line matching the player pattern sets the current player;
    * a line matching the dice-roll pattern opens a roll block;
    * inside a roll block, a line made only of digits is recorded as a roll
      of the current player, a line containing ``)`` closes the block, and
      any other line is ignored;
    * every other line is split on whitespace and each word is counted for
      the current player.

    Lines that appear before the first player line cannot be attributed to
    anybody and are skipped.

    Parameters
    ----------
    filepath : str
        Filepath for file to be parsed

    Returns
    -------
    roll_data : defaultdict of list
        Maps the matched player text (for example ``'Jason M.:'``) to that
        player's rolls, kept as digit strings in the order they occur in
        the file.
    words : defaultdict of dict
        Maps the matched player text to a ``{word: count}`` dictionary of
        the words found on that player's non-roll lines.
    """

    roll_data = defaultdict(list)
    words = defaultdict(dict)

    # Text of the most recent player line. Both result dictionaries are keyed
    # by this string (not by the match object, which hashes by identity and
    # would open a new bucket every time the same player speaks again).
    player = None
    in_roll = False

    with open(filepath, 'r') as file:
        for line in file:
            reg_match = _RegExLib(line)

            if reg_match.player:
                # players are on their own line
                player = reg_match.player.group(0)

            elif reg_match.diceroll:
                # dice rolls often come up like (\n##\n)+##, where ## is a
                # number, so remember that a roll is being processed
                in_roll = True

            elif in_roll:
                text = line.strip()
                if text.isdigit():
                    # a player roll: record it when the roller is known
                    if player is not None:
                        roll_data[player].append(text)
                elif ")" in text:
                    # we're at the end of the roll
                    in_roll = False

            elif player is not None:
                # ordinary chat text: tally each word for the current player
                for word in line.split():
                    counts = words[player]
                    counts[word] = counts.get(word, 0) + 1

    return roll_data, words

In [48]:
# Chat log to analyse.
filepath = 'IXChats_final.txt'
# parse() returns two mappings: the recorded dice rolls and the word counts.
rolldata, worddata = parse(filepath)
# Keep the roll data in its dictionary-of-lists form so that the rolls
# can later be analysed in chronological order.
#print(rolldata)

defaultdict(<class 'list'>, {'JonnyBadger H. (GM):': ['12', '13', '2', '2', '1', '15', '19', '3', '11', '10', '17', '6', '3', '5', '3', '1', '3', '5'], 'face t.:': ['5'], 'Jason M.:': ['17', '17', '17', '8', '15', '15', '13', '14', '16', '15', '18', '13', '17', '19', '5', '13', '5', '20', '5', '1', '15', '12', '7', '14', '16', '16', '14', '1', '15', '7', '18', '14', '9', '14', '2', '8', '1', '3', '8', '7', '7', '6', '11', '9', '8', '11', '17', '15', '17', '11', '11', '5', '20', '8', '19', '12', '3', '13', '6', '14', '6', '3', '9', '13', '7', '17', '2', '15', '16', '10', '15', '19', '19', '2', '7', '7', '5', '14', '1', '9', '8', '17', '19', '19', '5', '17', '11', '17', '12', '10', '3', '3', '2', '6', '2', '2', '16', '7', '8', '11', '20', '10', '16', '11', '9', '8', '2', '1', '18', '1', '3', '17', '1', '16', '17', '13', '14', '10', '2', '18', '17', '6', '9', '17', '16', '5', '16', '10', '17', '13', '12', '7', '10', '8', '1', '5', '19', '1', '15', '14', '3', '1', '17', '18', '10', '16', '

In [47]:
# Tally how often each roll value occurs for every player in rolldata.
# A comprehension keeps the loop variables out of the notebook namespace.
rolldata_count = {
    player_name: Counter(player_rolls)
    for player_name, player_rolls in rolldata.items()
}

{'JonnyBadger H. (GM):': Counter({'3': 4, '2': 2, '1': 2, '5': 2, '12': 1, '13': 1, '15': 1, '19': 1, '11': 1, '10': 1, '17': 1, '6': 1}), 'face t.:': Counter({'5': 1}), 'Jason M.:': Counter({'18': 62, '14': 59, '15': 58, '10': 58, '19': 54, '7': 54, '17': 51, '20': 51, '8': 50, '5': 50, '1': 50, '2': 50, '16': 49, '9': 46, '6': 46, '13': 45, '3': 44, '4': 43, '11': 42, '12': 41}), 'Mitch:': Counter({'2': 16, '3': 13, '1': 13, '6': 12, '16': 11, '5': 10, '15': 9, '7': 9, '12': 8, '4': 8, '19': 7, '8': 7, '13': 7, '11': 6, '14': 6, '10': 5, '18': 4, '20': 4, '17': 3, '45': 1, '82': 1, '33': 1, '65': 1, '21': 1, '99': 1, '9': 1}), 'Rob C.:': Counter({'4': 2, '3': 1, '6': 1, '1': 1}), 'Snickers:': Counter({'38': 1, '46': 1, '73': 1, '14': 1}), 'Merkatroid Skittle:': Counter({'8': 47, '4': 43, '2': 42, '9': 36, '5': 35, '6': 33, '19': 33, '3': 33, '14': 33, '1': 32, '7': 31, '16': 30, '18': 29, '13': 28, '20': 28, '11': 27, '17': 24, '12': 22, '15': 21, '10': 19, '46': 1, '73': 1, '47': 1}