In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
import pickle

%matplotlib inline

In [3]:
import csv

with open('datasets/train.tsv') as f:
    rows_train = [line[:-1].split('\t') for line in f] 
    
with open('datasets/public.tsv') as f:
    rows_public = [line[:-1].split('\t') for line in f] 

In [4]:
with open('datasets/en-ru.txt/OpenSubtitles2016.en-ru.ru') as f:
    base = [line[:-1].split('\t') for line in f] 

In [4]:
rows_train[200:250]

[['621441536517',
  '',
  '',
  'я с ним не разговаривал .',
  '4',
  'не виделся ну , раздевайтесь .',
  'bad',
  '0.96996169'],
 ['621441536517',
  '',
  '',
  'я с ним не разговаривал .',
  '5',
  'совсем ?',
  'good',
  '0.9682806455'],
 ['630918374463',
  '',
  'двое .',
  'о . вау !',
  '0',
  'что вау ?',
  'good',
  '0.6852331573'],
 ['630918374463',
  '',
  'двое .',
  'о . вау !',
  '1',
  'скажи ?',
  'neutral',
  '0.6339007578'],
 ['630918374463',
  '',
  'двое .',
  'о . вау !',
  '2',
  'чувствуешь себя униженным ? да . меня обогнал хороший человек .',
  'bad',
  '0.8809825665'],
 ['630918374463',
  '',
  'двое .',
  'о . вау !',
  '3',
  'удивлена ?',
  'good',
  '0.6959357362'],
 ['630918374463',
  '',
  'двое .',
  'о . вау !',
  '4',
  'это банально , говорить " вау " ?',
  'good',
  '0.8890238226'],
 ['630918374463',
  '',
  'двое .',
  'о . вау !',
  '5',
  'что " ого " ?',
  'bad',
  '0.6807389284'],
 ['643528390848',
  'мой пиджак еще там .',
  'оставь его .',
  '

In [4]:
import enchant
d = enchant.Dict("en_US")

def check_en(rows, maxind=2):
    global d
    for r in tqdm(rows, position=0):
        for k in r[:len(r)-maxind]:
            if k != '' and d.check(k) and not k.isnumeric():
                print(r, k)
                return
            
check_en(rows_train)
check_en(rows_public, 0)

100%|██████████| 97533/97533 [00:16<00:00, 6060.16it/s]
100%|██████████| 9968/9968 [00:01<00:00, 6243.17it/s]


видно, что нет реплик на английском.

In [6]:
def get_stats(rows):
    lens = set()
    for i,r in enumerate(rows):
        if len(r) not in lens:
            lens.add(len(r))
            print(i, len(r), r)
            
    summ = 0
    maxx = 0
    for i, r in enumerate(rows):
        summ += (len(r[1].split()) + len(r[2].split()) + len(r[3].split()) + len(r[5].split()))
        maxx = np.max([maxx] + [len(r[t].split()) for t in [1, 2, 3, 5]])

    print('mean token len: {}'.format(summ / len(rows)))
    print('max token replic {}'.format(maxx))
    

get_stats(rows_train)
print('**************')
get_stats(rows_public)

0 8 ['22579918886', 'кликни на меня а потом на надпись " видео - звонок " .', 'о , я тебя вижу .', 'ладно , повесь трубку .', '0', 'не могу .', 'good', '0.8753516175']
mean token len: 20.346170014251587
max token replic 65
**************
0 6 ['138920940977', 'знаешь , я иногда подумываю , что тебе надо принести сюда свою гитару и показать местным настоящую игру .', 'не - а .', 'нет ?', '0', 'неа .']
mean token len: 20.65730337078652
max token replic 37


в каждой строке одинаковое число \t, пустые реплики - ''. Возьмем максимум 30 токенов на реплику.

In [24]:
train_df = pd.DataFrame(rows_train)
public_df = pd.DataFrame(rows_public)

In [5]:
train_df = pd.read_csv('datasets/train_df', index_col='Unnamed: 0')

In [8]:
from collections import Counter

In [3]:
train_df[np.isnan(train_df['0'])]

Unnamed: 0,0,1,2,3,4,5,6,7


In [5]:
Counter(train_df[6])

Counter({'bad': 34770, 'good': 51509, 'neutral': 11254})

для моделей первого уровня лучше выбрать классы bad + neutral и good

In [6]:
train_df[50:51].values

array([[289676265247, nan, nan, 'мое имя дали мне родители .', 1,
        'полное имя ?', 'bad', 0.9387921268000001]], dtype=object)

In [10]:
public_df.head()

Unnamed: 0,0,1,2,3,4,5
0,138920940977,"знаешь , я иногда подумываю , что тебе надо пр...",не - а .,нет ?,0,неа .
1,138920940977,"знаешь , я иногда подумываю , что тебе надо пр...",не - а .,нет ?,1,"нет , не хочу ."
2,138920940977,"знаешь , я иногда подумываю , что тебе надо пр...",не - а .,нет ?,2,нет .
3,138920940977,"знаешь , я иногда подумываю , что тебе надо пр...",не - а .,нет ?,3,"конечно , нет ."
4,138920940977,"знаешь , я иногда подумываю , что тебе надо пр...",не - а .,нет ?,4,"разумеется , нет ."


In [104]:
train_df.to_csv('datasets/train_df')
public_df.to_csv('datasets/public_df')

In [5]:
import regex

from nltk.stem.snowball import RussianStemmer

In [6]:
def sent2vec(sent, emb, max_len=30, emb_size=300):
    vec = np.zeros((max_len, emb_size))
    for i, t in enumerate(sent[:min(max_len, len(sent))]):
        if t in emb:
            vec[i] = emb[t]
    return vec

def fix_text(text):
    text = regex.sub("(?s)<ref>.+?</ref>", "", text) # remove reference links
    text = regex.sub("(?s)<[^>]+>", "", text) # remove html tags
    text = regex.sub("&[a-z]+;", "", text) # remove html entities
    text = regex.sub("(?s){{.+?}}", "", text) # remove markup tags
    text = regex.sub("(?s){.+?}", "", text) # remove markup tags
    text = regex.sub("(?s)\[\[([^]]+\|)", "", text) # remove link target strings
    text = regex.sub("(?s)\[\[([^]]+\:.+?]])", "", text) # remove media links
    text = regex.sub("[']{5}", "", text) # remove italic+bold symbols
    text = regex.sub("[']{3}", "", text) # remove bold symbols
    text = regex.sub("[']{2}", "", text) # remove italic symbols
    text = regex.sub(u"[^ \r\n\p{Cyrillic}.?!\-]", " ", text)
    text = text.lower()
    return text

def tokenize_word(word, stemmer):
    stem =  stemmer.stem(word)
    affix = word[len(stem):]

    if affix:
        return (stem, '#' + affix)  #To make embeddings work, all suffixes start with artificial token '#'
    else:
        return (stem, )

def clear_sent(sent, stemmer=RussianStemmer):
    sent = fix_text(sent)
    phrases = regex.split("([.?!])?[\n]+|[.?!] ", sent)
    words = [s.split() for s in phrases if s is not None]
    
    sent = []
    for s in words:
        sent.append('<s>')
        for w in s:
            tokens = tokenize_word(w, stemmer)
            for t in tokens:
                sent.append(t)
        sent.append('<\s>')
    
    return sent
    
    

def context2vec(context, emb, max_len=30, emb_size=300, stemmer=RussianStemmer()):
    assert len(context) == 4
    vecs = None
    for sent in context:
        sent = clear_sent(sent, stemmer)
        vec = sent2vec(sent, emb, max_len, emb_size)
        if vecs is None:
            vecs = vec
        else:
            vecs = np.concatenate([vecs, vec], axis = 0)
    
    return vecs

In [7]:
ru_emb_csv = pd.read_csv('datasets/ru.csv', header=None).drop([301], axis=1)
ru_emb_csv.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,291,292,293,294,295,296,297,298,299,300
0,#а,-4.1014,-1.56263,-1.76345,1.29674,-1.60807,0.382005,1.56409,-2.60826,0.735975,...,0.53453,-0.579029,-0.843691,-0.962004,-2.1295,-1.0109,-0.653856,0.551031,-0.419016,2.9404
1,<s>,0.63146,-0.067456,-0.040757,-0.367607,-1.76219,0.096299,1.67382,-2.93316,0.443866,...,-0.11162,-2.48217,-0.099588,-0.834389,-0.428096,-1.99231,0.171781,2.47704,0.640765,-0.87263
2,<\s>,0.502858,0.02661,2.07638,0.728407,-0.898755,-0.618859,-0.316382,-2.29943,2.86637,...,-0.111344,1.09551,2.00433,-2.39825,0.218152,-3.5713,-0.339903,1.64919,0.634857,1.66519
3,в,-2.1532,1.36208,-0.337849,-0.202003,-0.766991,1.69276,-0.023651,0.876718,0.477964,...,-0.174846,2.55248,2.38899,2.42757,1.06091,-2.61281,1.36328,1.06837,-0.15369,1.78664
4,#е,0.594356,-1.22215,0.95723,4.08471,-4.11514,1.01676,5.92772,-4.74943,-3.38718,...,1.94373,-2.96099,-2.34049,1.33002,-1.15684,-0.837078,0.508406,3.58735,3.11674,-0.479126


In [8]:
ru_emb_csv.shape

(50053, 301)

In [9]:
ru_emb = {str(line[0]): np.array(line[1:]) for line in ru_emb_csv.values}

In [67]:
train_df[[1,2,3,5]][109:110].values[0]

array(['!',
       'ты единственный свидетель , и убийцы не знают , что ты видела .',
       'я смогу тебя защитить .', 'защитить меня ?'], dtype=object)

In [119]:
v = context2vec(['Я пришел домой. Перед этим был на работе.', '', '', ''], ru_emb)

In [120]:
v.shape

(120, 300)

In [30]:
def df2vec(df, emb):
    res = []
    for i in tqdm(range(df.shape[0]), position=0):
        res.append(context2vec(df[[1,2,3,5]][i:i+1].values[0], emb, max_len=40))
    return np.array(res)

In [12]:
train_df[[1,2,3,5]].iloc[0].values

array(['кликни на меня а потом на надпись " видео - звонок " .',
       'о , я тебя вижу .', 'ладно , повесь трубку .', 'не могу .'],
      dtype=object)

In [11]:
from sklearn.model_selection import train_test_split

In [13]:
def shuffle_by_groups(df, col, random_state=None):
    groups = [df for _, df in df.groupby(col)]
    np.random.seed(random_state)
    np.random.shuffle(groups)
    return pd.concat(groups).reset_index(drop=True)

In [15]:
stack_sents, test_sents = \
    train_test_split(shuffle_by_groups(train_df, '0', 42), test_size=0.05, shuffle=False)
    
train_val_sents, boost_sents = \
    train_test_split(shuffle_by_groups(stack_sents, '0', 13), test_size=0.15, shuffle=False)

train_sents, val_sents = \
    train_test_split(shuffle_by_groups(train_val_sents, '0', 25), test_size=0.1, shuffle=False)

In [16]:
train_sents.shape[0], val_sents.shape[0], boost_sents.shape[0], test_sents.shape[0]

(70881, 7876, 13899, 4877)

In [18]:
train_sents.to_csv('datasets/sents/train_sents', index=False)
val_sents.to_csv('datasets/sents/val_sents', index=False)
test_sents.to_csv('datasets/sents/test_sents', index=False)
boost_sents.to_csv('datasets/sents/boost_sents', index=False)

In [32]:
# val_vecs = df2vec(val_sents, ru_emb)
# np.save('datasets/vecs/val_vecs', val_vecs)
# test_vecs = df2vec(test_sents, ru_emb)
# np.save('datasets/vecs/test_vecs', test_vecs)
# boost_vecs = df2vec(boost_sents, ru_emb)
# np.save('datasets/vecs/boost_vecs', boost_vecs)
# train_vecs = df2vec(train_sents, ru_emb)
# np.save('datasets/vecs/train_vecs', train_vecs)

100%|██████████| 7876/7876 [00:48<00:00, 161.11it/s]
100%|██████████| 4877/4877 [00:28<00:00, 172.77it/s]
100%|██████████| 13899/13899 [02:15<00:00, 102.21it/s]


## mapping base

In [10]:
from joblib import Parallel, delayed

In [None]:
# next_map = np.zeros((len(base) // 10, 2, 300))
max_len = 40

inds = list(enumerate(np.random.choice(range(len(base) - 4), 
                                                size=len(base) // 10)))

def f(t, i):
    vec = context2vec([x[0] for x in base[i:i+4]], ru_emb)
    context = np.mean(vec[:3 * max_len], axis=0).reshape(-1)
    reply = np.mean(vec[3 * max_len:], axis=0).reshape(-1)
    return np.array([context, reply])

with Parallel(n_jobs=-1) as parallel:
    next_map = np.array(parallel(delayed(f)(t, i) for t, i in tqdm(inds,  position=0)))


# for t,i in tqdm(inds,  position=0):
#     vec = context2vec([x[0] for x in base[i:i+4]], ru_emb)
#     context = np.mean(vec[:3 * max_len], axis=0).reshape(-1)
#     reply = np.mean(vec[3 * max_len:], axis=0).reshape(-1)
#     next_map[t][0] = context
#     next_map[t][1] = reply

np.save('datasets/base_map_big', next_map)
    
# with open('base_map_big', 'wb') as f:
#     pickle.dump([next_map], f, -1)

  out=out, **kwargs)
  out=out, **kwargs)
  ret, rcount, out=ret, casting='unsafe', subok=False)
  ret, rcount, out=ret, casting='unsafe', subok=False)
  out=out, **kwargs)
  out=out, **kwargs)
  ret, rcount, out=ret, casting='unsafe', subok=False)
  ret, rcount, out=ret, casting='unsafe', subok=False)
  1%|          | 13540/1856248 [00:26<1:00:20, 508.96it/s]

In [19]:
with open('base_map', 'rb') as f:
    next_map = pickle.load(f)

  1%|          | 16233/1856248 [01:30<2:50:44, 179.61it/s]

In [18]:
subtitle_emb_space = np.array([np.array(k) for k in next_map.keys()])

In [20]:
np.save('subtitle_emb_space',subtitle_emb_space)