## Imports and helper functions

In [23]:
import pandas as pd
import re
import numpy as np
import json

def merge_utts(utts):
  """Collapse consecutive utterances by the same speaker into single turns.

  Every utterance is read as ``"<digit>:<text>"``: the first character is
  converted with ``int`` to get the speaker, and the text is taken from the
  third character onwards.

  Args:
    utts: list of utterance strings in dialogue order.

  Returns:
    A list of turns in which neighbouring entries belong to different
    speakers. A follow-up utterance by the same speaker is appended to the
    current turn on a new line, without its speaker prefix and with its first
    character upper-cased. An empty input yields an empty list.

  Raises:
    ValueError: if the first character of an utterance is not a digit.
    IndexError: if an utterance is an empty string.
  """
  if not utts:
    return []

  merged = []

  last_speaker = int(utts[0][0])
  cur_utt = utts[0]

  for utt in utts[1:]:
    speaker = int(utt[0])
    if speaker != last_speaker:
      merged.append(cur_utt)
      cur_utt = utt
    else:
      text = utt[2:].strip()
      # Upper-case only the first character: str.capitalize() would also
      # lower-case everything after it (names, later sentence starts).
      cur_utt += f"\n{text[:1].upper()}{text[1:]}"

    last_speaker = speaker

  # The turn under construction has never been stored yet, so it is always
  # flushed here. Looking at merged[-1] first would fail when the whole
  # dialogue is a single turn and `merged` is still empty.
  merged.append(cur_utt)

  return merged

def clean_dialogue(dialogue):
  """Strip markup from a raw dialogue string and split it into utterances.

  Every ``<...>`` tag is replaced by a space, the text is split on the
  speaker-label literal, and each fragment is trimmed; empty fragments are
  dropped.

  Args:
    dialogue: raw dialogue string, possibly containing HTML-like tags.

  Returns:
    A list of non-empty fragments in dialogue order. The list is empty when
    nothing is left after removing tags and whitespace.
  """
  without_tags = re.sub(r"<[^>]*>", ' ', dialogue)
  fragments = [part.strip() for part in without_tags.split("–ü–æ–ª—å–∑–æ–≤–∞—Ç–µ–ª—å")]
  cleaned = [part for part in fragments if part]

  # A first fragment that does not start with a letter or digit is treated as
  # leading junk and discarded. Only in that case is the tail inspected too.
  # NOTE(review): the tail trim removes two characters and is nested under the
  # leading-junk check, exactly as before; confirm this is the intended rule.
  if cleaned and not cleaned[0][0].isalnum():
    cleaned.pop(0)
    # `cleaned` may have become empty after the pop, so check before indexing.
    if cleaned and not cleaned[-1][-1].isalnum():
      cleaned[-1] = cleaned[-1][:-2]

  return cleaned

def clean_persona(persona):
  """Turn a raw persona profile into a list of trait strings.

  The profile is cut at every ``<br />``. Inside each piece any remaining
  ``<...>`` tag is replaced by a space and the piece is trimmed; pieces that
  end up empty are left out.

  Args:
    persona: raw persona profile string.

  Returns:
    The non-empty, trimmed traits in their original order.
  """
  traits = []
  for raw_trait in persona.split("<br />"):
    trait = re.sub(r"<[^>]*>", ' ', raw_trait).strip()
    if trait:
      traits.append(trait)
  return traits

def preprocess(ds, bot_prefix="–ü–æ–ª—å–∑–æ–≤–∞—Ç–µ–ª—å 2",user_prefix="–ü–æ–ª—å–∑–æ–≤–∞—Ç–µ–ª—å 1"):
  """Build question/answer examples with persona contexts from a dialogue table.

  Each row's dialogue is cleaned and merged into alternating turns. Up to two
  passes are made over the turns, one per speaker of the first two turns. In
  a pass, every turn of that speaker becomes a target and the turn before it
  becomes the question (for the opening turn the question text is empty).
  Every trait of that speaker's persona becomes one context entry whose text
  is the question/answer history collected so far in the pass.

  Args:
    ds: table providing ``.shape`` and ``.iloc`` with the columns
      ``persona_1_profile``, ``persona_2_profile`` and ``dialogue``.
    bot_prefix: label placed in front of every answer.
    user_prefix: label placed in front of every question.

  Returns:
    A list of dicts with the keys ``id`` (row position inside ``ds``, shared
    by all examples from that row), ``question``, ``target``, ``answers``
    (one-element list holding the target) and ``ctxs`` (list of
    ``{"title": trait, "text": history}`` dicts).
  """
  fid_dialogue = []

  for i in range(ds.shape[0]):
    entry = ds.iloc[i]
    personas = [entry['persona_1_profile'], entry['persona_2_profile']]

    merged = merge_utts(clean_dialogue(entry['dialogue']))
    # Empty placeholder turn, so that the opening utterance can be paired
    # with an (empty) question in the first pass.
    merged.insert(0,"")

    # Speaker numbers of the first two real turns. Slicing instead of indexing
    # merged[1] / merged[2] keeps dialogues with fewer than two turns from
    # raising IndexError; they simply get fewer (or no) passes.
    # Presumably the speaker numbers are 1 or 2, since they index `personas`.
    personas_order = [int(turn[0]) for turn in merged[1:3]]

    for i_per, persona_num in enumerate(personas_order):
      bot_personality = clean_persona(personas[persona_num-1])

      dialogue_history = []

      # Step through (question, answer) pairs; the pass offset decides which
      # speaker's turns end up as answers.
      for j in range(i_per,len(merged)-1,2):
        question = f"{user_prefix}:{merged[j][2:]}"
        answer = f"{bot_prefix}:{merged[j+1][2:]}"

        # History as it was before the current pair is added.
        history_string = "\n".join(dialogue_history)

        fid_dialogue.append({
          "id": i,
          "question": question,
          "target": answer,
          "answers": [answer],
          "ctxs": [{"title":trait,"text":history_string} for trait in bot_personality],
        })

        dialogue_history.extend([question,answer])

  return fid_dialogue

## Load data

In [None]:
!unzip TlkPersonaChatRus.zip -d ./

In [None]:
# Load the tab-separated dialogue table from the extracted archive folder.
dialogues_path = './TlkPersonaChatRus/dialogues.tsv'
ru_dialogues = pd.read_csv(dialogues_path, sep='\t')

In [None]:
# Preview the first rows of the loaded table.
ru_dialogues.head()

Unnamed: 0,persona_1_profile,persona_2_profile,dialogue
0,<span class=participant_1>–£ –º–µ–Ω—è –ª—é–±–∏–º–∞—è —Ä–∞–±–æ—Ç...,<span class=participant_2>–ò—â—É –ø—Ä–∏–Ω—Ü–∞.<br />–í–µ–¥...,<span class=participant_2>–ü–æ–ª—å–∑–æ–≤–∞—Ç–µ–ª—å 2: –ü—Ä–∏–≤...
1,<span class=participant_1>–Ø —Ä–∞–±–æ—Ç–∞—é —É—á–∏—Ç–µ–ª–µ–º<b...,<span class=participant_2>–Ø –±–∏–∑–Ω–µ—Å–º–µ–Ω<br />–£ –º...,<span class=participant_1>–ü–æ–ª—å–∑–æ–≤–∞—Ç–µ–ª—å 1: –ü—Ä–∏–≤...
2,<span class=participant_1>–Ø –∫—É–ø–∏–ª–∞ –¥–æ–º<br />–Ø ...,<span class=participant_2>–Ø –ø–æ—é –≤ –∫–∞—Ä–∞–æ–∫–µ<br /...,<span class=participant_1>–ü–æ–ª—å–∑–æ–≤–∞—Ç–µ–ª—å 1: –ü—Ä–∏–≤...
3,<span class=participant_1>—è –≤—Ä–∞—á –∏ –∂–µ–Ω–∞—Ç<br />...,<span class=participant_2>–Ø –º–∞–ª—å—á–∏–∫<br />–Ø —É—á—É...,<span class=participant_2>–ü–æ–ª—å–∑–æ–≤–∞—Ç–µ–ª—å 2: –ó–¥—Ä–∞...
4,<span class=participant_1>–Ø —à–∫–æ–ª—å–Ω–∏—Ü–∞.<br />–Ø ...,<span class=participant_2>–Ø –ø—Ä–æ—Å—Ç–æ–≤–∞—Ç.<br />–õ—é...,<span class=participant_1>–ü–æ–ª—å–∑–æ–≤–∞—Ç–µ–ª—å 1: –ü—Ä–∏–≤...


In [None]:
# (rows, columns) of the loaded table.
ru_dialogues.shape

(10013, 3)

In [None]:
# Divide the data frame into train, dev and test.
# Dev and test each receive half of the rows not used for training.

train_size = 0.8

# Sample the training rows from a working copy of the full table.
df = ru_dialogues.copy()
ru_dialogues_train = df.sample(frac=train_size, random_state=42)

# Keep only the held-out rows; half of them go to dev, the remainder to test.
df = df.drop(index=ru_dialogues_train.index)
ru_dialogues_dev = df.sample(frac=0.5, random_state=42)
ru_dialogues_test = df.drop(index=ru_dialogues_dev.index)

In [None]:
# Report the shape of every split, one line per split.
for split_name, split_frame in (
    ("train", ru_dialogues_train),
    ("dev", ru_dialogues_dev),
    ("test", ru_dialogues_test),
):
  print(split_name, split_frame.shape)

train (8010, 3)
dev (1002, 3)
test (1001, 3)


In [14]:
# Print the first ten dialogues after cleaning and merging, together with
# both cleaned personas, to check the helper functions by eye.
for i in range(10):
  entry = ru_dialogues.iloc[i]

  dialogue = entry['dialogue']
  cleaned = clean_dialogue(dialogue)
  merged = merge_utts(cleaned)

  persona1 = clean_persona(entry['persona_1_profile'])
  persona2 = clean_persona(entry['persona_2_profile'])

  print(f"Persona 1: {persona1}")
  print(f"Persona 2: {persona2}")

  print("\nDialogue:")
  # One merged turn per line.
  print("\n".join(merged))
  print("\n\n")

Persona 1: ['–£ –º–µ–Ω—è –ª—é–±–∏–º–∞—è —Ä–∞–±–æ—Ç–∞.', '–Ø —É–≤–∞–∂–∞—é –ª—é–¥–µ–π.', '–£ –º–µ–Ω—è –µ—Å—Ç—å –∂–∏–≤–æ—Ç–Ω–æ–µ.', '–£ –º–µ–Ω—è —Ö–æ—Ä–æ—à–∏–π –¥—Ä—É–≥.', '–Ø –ª—é–±–ª—é –∫–æ—Ñ–µ.']
Persona 2: ['–ò—â—É –ø—Ä–∏–Ω—Ü–∞.', '–í–µ–¥—É –∞–∫—Ç–∏–≤–Ω—ã–π –æ–±—Ä–∞–∑ –∂–∏–∑–Ω–∏.', '–õ—é–±–ª—é —á–∏—Ç–∞—Ç—å –∫–ª–∞—Å—Å–∏–∫—É.', '–í—ã—Ä–∞—â–∏–≤–∞—é —Ñ–∏–∞–ª–∫–∏.', '–õ—é–±–ª—é –æ–±—â–µ–Ω–∏–µ.']

Dialogue:
2: –ü—Ä–∏–≤–µ—Ç) —Ä–∞—Å—Å–∫–∞–∂–∏ –æ —Å–µ–±–µ
1: –ü—Ä–∏–≤–µ—Ç) –ø–æ–¥ –≤–∫—É—Å–Ω—ã–π –∫–æ—Ñ–µ–µ–∫ –Ω–∞—Å—Ç—Ä–æ–µ–Ω–∏–µ –ø–æ–±–æ–ª—Ç–∞—Ç—å –ø–æ—è–≤–∏–ª–æ—Å—å )
2: –ß—Ç–æ —á–∏—Ç–∞–µ—à—å? –ú–Ω–µ –Ω—Ä–∞–≤–∏—Ç—Å—è –∫–ª–∞—Å—Å–∏–∫–∞
–Ø —Ç–æ–∂–µ –ª—é–±–ª—é –ø–æ–æ–±—â–∞—Ç—å—Å—è
1: –õ—é–±–ª—é –∂–∏–≤–æ—Ç–Ω—ã—Ö, –ø—Ä–æ—Å—Ç–æ –æ–±–æ–∂–∞—é, –∫–∞–∫ –∏ —Å–≤–æ—é —Ä–∞–±–æ—Ç—É)
–Ø —Ñ–∞–Ω—Ç–∞—Å—Ç–∏–∫—É –ª—é–±–ª—é
2: –ê —è –≤—ã—Ä–∞—â–∏–≤–∞—é —Ñ–∏–∞–ª–∫–∏
–ò –≤–µ–¥—É –∑–¥–æ—Ä–æ–≤—ã–π –∏ –∞–∫—Ç–∏–≤–Ω—ã–π –æ–±—Ä–∞–∑ –∂–∏–∑–Ω–∏!
1: –£—Ö —Ç—ã, –∏–Ω—Ç–µ—Ä–µ—Å–Ω–æ.
2: –¢—ã —Å–ª—É—á–∞–π–Ω–æ –Ω–µ –ø—Ä–∏–Ω—Ü –Ω–∞ –±–µ

## Process data

In [24]:
# Convert the training split into the list of example dicts.
train = preprocess(ds=ru_dialogues_train)

In [25]:
# Convert the dev split into the list of example dicts.
dev = preprocess(ds=ru_dialogues_dev)

In [26]:
# Convert the test split into the list of example dicts.
test = preprocess(ds=ru_dialogues_test)

In [27]:
# Number of examples produced from the training split.
len(train)

134917

In [28]:
# Number of examples produced from the dev split.
len(dev)

16970

In [29]:
# Number of examples produced from the test split.
len(test)

16896

## Save dataset

In [30]:
# Serialize the training examples to train.json in the working directory.
with open('train.json', mode='w') as train_file:
    json.dump(obj=train, fp=train_file)

In [31]:
# Serialize the dev examples to dev.json in the working directory.
with open('dev.json', mode='w') as dev_file:
    json.dump(obj=dev, fp=dev_file)

In [32]:
# Serialize the test examples to test.json in the working directory.
with open('test.json', mode='w') as test_file:
    json.dump(obj=test, fp=test_file)