<a href="https://colab.research.google.com/github/pszemraj/ai-msgbot/blob/main/colab-notebooks/convert_whatsapp_export_to_GPT2_script.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Converting WhatsApp Exports to GPT-2 Training Data

- Illustrates how to convert a chat text file exported from WhatsApp into the general `script` format used by `aitextgen` to train a GPT-2 chatbot model.
- Note that all names and other identifying details have been replaced in the example text file used here.


In [None]:
## formatting

from IPython.display import HTML, display
# colab formatting
def set_css(info=None):
    """Emit a <style> element that makes <pre> output wrap long lines.

    Registered below as an IPython ``pre_run_cell`` callback, so it is invoked
    before every cell execution.

    Args:
        info: the execution-info object IPython passes to ``pre_run_cell``
            callbacks. Unused; it has a default so the function also still
            works when called directly with no arguments.
    """
    display(
        HTML(
            """
  <style>
    pre {
        white-space: pre-wrap;
    }
  </style>
  """
        )
    )


# Run set_css ahead of every cell execution in this session.
get_ipython().events.register("pre_run_cell", set_css)

In [None]:
# Direct-download URL of the chat text file to convert; editable through the
# Colab form field generated by the `#@param` annotation.
dl_link = "https://www.dropbox.com/s/loe823gu7pdra2i/_chat.txt?dl=1" #@param {type:"string"}


In [None]:
import os 

# Current working directory of the runtime; the downloaded file is saved here.
vm_wd = os.getcwd()

In [None]:
from urllib import request
from os.path import join

# Fetch the file behind dl_link and store it in the working directory under a
# fixed name. The call is left as the cell's last expression so its return
# value (saved path, response headers) is displayed.
local_name = join(vm_wd, "test-text-file.txt")
request.urlretrieve(url=dl_link, filename=local_name)


('/content/test-text-file.txt', <http.client.HTTPMessage at 0x7f8459779fd0>)

In [None]:
# Load the downloaded file as a list of lines; bytes that are not valid UTF-8
# are dropped rather than raising.
with open(local_name, mode="r", encoding="utf-8", errors="ignore") as chat_file:
    textlines = list(chat_file)

# Quick sanity check: number of lines read and the container type.
line_count, container_type = len(textlines), type(textlines)
print(line_count, container_type)

624 <class 'list'>


Clean the text: install the `clean-text` package and run every line through its `clean` function.

In [None]:
%%capture
!pip install -U clean-text
from cleantext import clean

In [None]:
# Pass each raw line through clean-text's `clean`, called with only the line
# itself (no extra options).
textlines = list(map(clean, textlines))

In [None]:
import pprint as pp

# Preview the first ten cleaned lines.
pp.pprint(textlines[:10])


['[25.02.21, 23:57:58] alexander das great: hi kids',
 '[26.02.21, 06:54:14] jorge: hello friends',
 '[26.02.21, 08:13:32] olga: hi',
 '[26.02.21, 14:18:31] alexander das great: image omitted',
 '[26.02.21, 14:18:33] alexander das great: ayyy',
 '[26.02.21, 14:18:51] alexander das great: when is the first thing due? maybe '
 'i missed it but not announced right',
 "[26.02.21, 14:23:47] jorge: task 0 opens monday. i don't think it's for "
 'marks tho. task 1a opens monday the week after',
 '[26.02.21, 14:23:55] jorge: looks like 2 weeks to get it done',
 '[26.02.21, 14:34:05] olga: i am following the recordings btw, not the live '
 'session, because of double sheduling. i will be at the q&a sessions though',
 '[26.02.21, 14:34:45] jorge: also not attending tutorials due to double '
 "scheduling. i'm sure it'll be fine"]


In [None]:
import re

# Pattern for the bracketed timestamp that precedes each message, e.g.
# "[25.02.21, 23:57:58] ": dot-separated digits, a comma, colon-separated
# digits, a closing bracket and one space. The pattern is not anchored, so a
# matching stamp anywhere in a line is removed as well.
# Written as a raw string so the backslashes reach the regex engine verbatim
# instead of being parsed as (invalid) string escape sequences.
re_string = r"\[([0-9]+(\.[0-9]+)+), ([0-9]+(:[0-9]+)+)\] "
timestamp_pattern = re.compile(re_string)

# Strip the timestamp from every line, leaving "speaker: message".
sub_textlines = [timestamp_pattern.sub("", line) for line in textlines]


pp.pprint(sub_textlines[:10])


['alexander das great: hi kids',
 'jorge: hello friends',
 'olga: hi',
 'alexander das great: image omitted',
 'alexander das great: ayyy',
 'alexander das great: when is the first thing due? maybe i missed it but not '
 'announced right',
 "jorge: task 0 opens monday. i don't think it's for marks tho. task 1a opens "
 'monday the week after',
 'jorge: looks like 2 weeks to get it done',
 'olga: i am following the recordings btw, not the live session, because of '
 'double sheduling. i will be at the q&a sessions though',
 "jorge: also not attending tutorials due to double scheduling. i'm sure it'll "
 'be fine']


In [None]:
# Convert "speaker: message" lines into script format: the speaker's name
# followed by a colon on one line, the message on the next, then a blank line.
fin_text = []

for line in sub_textlines:
    # Split at the FIRST ": " only. A message that itself contains ": " keeps
    # its text intact; previously every ": " inside the message was replaced
    # by a single space when the pieces were re-joined.
    speaker, separator, message = str(line).partition(": ")
    if not separator:
        # Lines without a "speaker: " prefix are skipped, as before.
        continue
    fin_text.extend((speaker + ":\n", message + "\n", "\n"))



pp.pprint(fin_text[:20])

['alexander das great:\n',
 'hi kids\n',
 '\n',
 'jorge:\n',
 'hello friends\n',
 '\n',
 'olga:\n',
 'hi\n',
 '\n',
 'alexander das great:\n',
 'image omitted\n',
 '\n',
 'alexander das great:\n',
 'ayyy\n',
 '\n',
 'alexander das great:\n',
 'when is the first thing due? maybe i missed it but not announced right\n',
 '\n',
 'jorge:\n',
 "task 0 opens monday. i don't think it's for marks tho. task 1a opens monday "
 'the week after\n']


# Save the reformatted text


In [None]:
# Write the script-formatted lines to a text file in the working directory,
# then trigger a browser download of that file through Colab.
outname = "reformatted_text.txt"
with open(outname, mode="w", encoding="utf-8", errors="ignore") as out_file:
    out_file.write("".join(fin_text))

from google.colab import files

files.download(outname)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>