## Imports

In [10]:
import json
from datetime import datetime
import random
import pandas as pd
from pprint import pprint

## Load Telegram Message History

In [12]:
#https://telegram.org/blog/export-and-more, select JSON

with open('result.json', encoding="utf8") as json_file:
    data = json.load(json_file)

messages = data["messages"] #messages is a list
print(len(messages)) #47902 messages


47902


## Segmenting "conversations"

In [13]:
new_convo_threshold_seconds = 3600 #a new conversation starts if 1 hr without further messages elapses after the last message from me
telegram_name = "Bing Wen"

#obtain date of messages
for message in messages:
    message["datetime"] = datetime.strptime(message['date'], '%Y-%m-%dT%H:%M:%S')

def check_new_convo(previous_message, current_message):
    return (previous_message["from"] == telegram_name and 
            (current_message["datetime"] - previous_message["datetime"]).seconds > 
            new_convo_threshold_seconds)

#this loop creates a list of conversations
conversations = []
for idx,message in enumerate(messages):
    if (idx == 0) or check_new_convo(messages[idx-1],messages[idx]):
        if idx > 0:
            conversations.append(new_conversation)
        new_conversation = []
    new_conversation.append(message)
    
conversations = list(filter(lambda x: len(x) > 1, conversations)) #one-message conversations are not conversations
print(len(conversations))

300


## Labelling messages within conversation

In [8]:
for conversation in conversations:
    for idx,message in enumerate(conversation):
        if idx == (len(conversation)-1):
            continue
        if idx == 0: #if start of convo
            message["prompt_start"] = True #this is a start of a prompt
            message["completion_start"] = False
        if message["from"] != telegram_name and conversation[idx+1]["from"] == telegram_name: #if this is the end of the other party's messages
            message["prompt_end"] = True #it's end the of the prompt
            conversation[idx+1]["completion_start"] = True #and the start of a completion
        else:
            message["prompt_end"] = False
            conversation[idx+1]["completion_start"] = False
        if message["from"] == telegram_name and conversation[idx+1]["from"] != telegram_name: #if this is the end of a string of my messages
            message["completion_end"] = True #it's the end of a completion
            conversation[idx+1]["prompt_start"] = True #and the next line is a start of a new prompt
        else:
            message["completion_end"] = False
            conversation[idx+1]["prompt_start"] = False

## Creating prompt-completion pairs

In [9]:
training_pairs = []

def get_line(message): #this function prepends 
    if message["from"] == telegram_name:
        name = "Me"
    else:
        name = "They"
    if 'photo' in message: #handle image messages
        text = '<IMAGE>'
    else:
        text = message["text"]
    if text:
        try: #handling some weird situations where there are urls/entities in the text
            if isinstance(text, list):
                textStr = ""
                for stuff in text:
                    if isinstance(stuff, dict):
                        textStr += stuff["text"]
                    else:
                        textStr += stuff
                text = textStr
        except:
            print(text)
        return f"{name}:{text}\n"
    else:
        return False
    
#this loop creates the multiple training examples from each example 
for conversation in conversations:
    seed_pair = {"prompt": "", "completion":""}
    for message in conversation:
        if message["prompt_start"]:
            key = "prompt"
        elif message["completion_start"]:
            key = "completion"
        new_line = get_line(message)
        if new_line:
            seed_pair[key] += get_line(message)
        if message.get("completion_end",True):
            training_pairs.append(seed_pair.copy())
            seed_pair["prompt"] += seed_pair["completion"]
            seed_pair["completion"] = ""

#strip those pairs with no completions
training_pairs = [pair for pair in training_pairs if len((pair["completion"].rstrip())) > 0]

print(f"{len(training_pairs)} training pairs") #9865 training pairs

9865 training pairs


## Postprocessing

In [13]:
stop_sequence = "<END>"
me_token = "Me:"
acceptable_char_length = 1400
min_prompt_length = 1400

def truncate_prompt(prompt, completion):
    if (len(prompt) + len(completion)) > acceptable_char_length:
        length_for_prompt = max(acceptable_char_length - len(completion), min_prompt_length)
        new_prompt = prompt[-length_for_prompt:]
        lower = min(new_prompt.find("\nMe:"),new_prompt.find("\nThey:"))
        new_prompt = new_prompt[lower+1:]
        return new_prompt
    else:
        return prompt
    
char_counter = 0

for pair in training_pairs:
    # next two lines gets rid of the first me in the completion, and appends it to the prompt instead
    pair['prompt'] += me_token
    pair['completion'] = " "+me_token.join(pair['completion'].split(me_token)[1:])+stop_sequence
    if len(pair['prompt']) + len(pair['completion']) > acceptable_char_length:
        pair['prompt'] = truncate_prompt(pair['prompt'],pair['completion']) #truncates prompt if conversation too long, retaining the more recent messages
    char_counter += (len(pair['prompt']) + len(pair['completion']))
        
        

## Prep training file

In [15]:
df = pd.DataFrame(training_pairs)
df.to_json("fine_tuning.jsonl", orient='records', lines=True)
!openai tools fine_tunes.prepare_data -f fine_tuning.jsonl -q

Analyzing...

- Your file contains 9865 prompt-completion pairs
- All prompts end with suffix `Me:`
- All completions end with suffix `\n<END>`

No remediations found.

You can use your file for fine-tuning:
> openai api fine_tunes.create -t "fine_tuning.jsonl"

After you’ve fine-tuned a model, remember that your prompt has to end with the indicator string `Me:` for the model to start generating completions, rather than continuing with the prompt. Make sure to include `stop=["\n<END>"]` so that the generated texts ends at the expected place.
Once your model starts training, it'll approximately take 2.3 hours to train a `curie` model, and less for `ada` and `babbage`. Queue will approximately take half an hour per job ahead of you.


## Fine-tuning

In [34]:
import os
os.environ["OPENAI_API_KEY"] = "<YOUR KEY>"
!openai api fine_tunes.create -t "fine_tuning.jsonl"

## Testing

In [16]:
import os
import openai
openai.organization = "<YOUR ORGANISATION KEY>"
openai.api_key = "<YOUR API KEY>"

def get_chatbings_response(text):
    prompt = "They:" + text + "\nMe:"
    stop_sequence = "<END>"
    response = openai.Completion.create(
        model="curie:ft-central-provident-fund-board-2023-01-23-07-14-12", 
        prompt=prompt, 
        temperature=0.2, 
        max_tokens=100, 
        frequency_penalty=0.6,
        presence_penalty=0.6,
        stop = stop_sequence
    )
    return response.choices[0].text

print(get_chatbings_response("What is the meaning of life?"))

 HAHAHAHA
Me:I'm not sure if I have a definitive answer to that
Me:But I think it's important to live life with purpose and meaning

