In [16]:
import torch
from torch.utils.data import DataLoader, Dataset
import nltk
from tqdm import tqdm
import re
from pathlib import Path

In [15]:
line = "10/20/19, 14:50 - Robin: Gleich erstmal bei dem Wetter zum Bus laufen 😂"
match = re.findall(r".+? - (.+?): (.+)", line)
match

[('Robin', 'Gleich erstmal bei dem Wetter zum Bus laufen 😂')]

In [46]:
class ChatDataset(Dataset):
    def __init__(self, directory = "chats"):
        self.data = self.get_data(directory)
    
    def get_data(self, directory: str):
        sequences = []
        for filename in tqdm(Path(directory).glob("*.txt")):
            sequences.append(self.get_sequences(filename))
        return sequences


    def get_sequences(self, filename):
        f = open(filename, "r", encoding = "utf-8")
        sequences = {}

        for i, line in enumerate(tqdm(f)):
            # matches the author and the message
            match = re.findall(r".+? - (.+?): (.+)", line)
            if not match:
                continue      
            # get author and message
            author, msg = match[0]
            author = "Other" if author != "Robin" else "Robin"
            # skip media files
            if msg in ["<Media omitted>", "Missed voice call", "Missed video call"]:
                continue
            # get tokens
            sequence = nltk.word_tokenize(msg)
            # add entry for author in dictionary
            if not author in sequences:
                sequences[author] = {}
            # add message to dict
            sequences[author][i] = sequence

        return sequences


    def __len__(self):
        pass

    def __getitem__(self):
        pass

In [47]:
dataset = ChatDataset()

0it [00:00, ?it/s]
0it [00:00, ?it/s][A
1165it [00:00, 11639.44it/s][A
2329it [00:00, 11173.46it/s][A
3448it [00:00, 10980.12it/s][A
4547it [00:00, 10854.59it/s][A
5635it [00:00, 10860.08it/s][A
6722it [00:00, 10646.19it/s][A
7788it [00:00, 10513.24it/s][A
8855it [00:00, 10558.91it/s][A
9971it [00:00, 10740.46it/s][A
11082it [00:01, 10850.12it/s][A
12168it [00:01, 10850.09it/s][A
13254it [00:01, 10817.41it/s][A
14351it [00:01, 10860.33it/s][A
15460it [00:01, 10926.39it/s][A
16553it [00:01, 10826.87it/s][A
17636it [00:01, 10824.87it/s][A
18765it [00:01, 10960.85it/s][A
19870it [00:01, 10984.51it/s][A
20999it [00:01, 11072.92it/s][A
22107it [00:02, 11038.95it/s][A
23212it [00:02, 10908.72it/s][A
24534it [00:02, 10908.62it/s]
1it [00:02,  2.25s/it]


In [48]:
dataset.data[0]["Other"]

{2: ['Oh', 'man'],
 8: ['Wie', 'läufts'],
 11: ['Morgen', 'oder', 'Dienstag'],
 14: ['Meine', 'Eltern', 'kommen', 'gegen', '2'],
 16: ['Si'],
 23: ['Da', 'wurden', 'ak', 'und', 'm', 'auch', 'mitkommen'],
 27: ['Ja'],
 29: ['Robin'],
 32: ['Meine', 'Eltern', 'haben', 'dir', 'was', 'mitgebracht'],
 33: ['Nimm', 'dir', 'doch', 'vom', 'personalraum', 'einen', 'Schirm', 'mit'],
 42: ['Besoffen', 'spürt', 'man', 'keine', 'Kälte'],
 43: ['Mein', 'Schritt', 'ist', 'komplett', 'nass'],
 46: ['Also', 'bist', 'du', 'ehr', 'dagegen', '?'],
 47: ['Ich', 'hab', 'ein', 'Bild', 'von', 'dir', 'gesehen'],
 52: ['Ja',
  'das',
  'hatte',
  'ich',
  'heute',
  'eh',
  'vor',
  'damit',
  'heute',
  'nicht',
  'nur',
  'dein',
  'Penis',
  'in',
  'mich',
  'rein',
  'darf'],
 58: ['Ich', 'bin', 'sauer'],
 61: ['Ja', '.'],
 65: ['Ja'],
 66: ['Netto', 'ist', 'wieder', 'besonders', 'schnell'],
 68: ['Ja'],
 70: ['Ja', 'bin', 'in', 'der', 'Bahn'],
 72: ['Bahn', 'fahren', 'macht', 'mich', 'immer', 'müde'],
 76