In [15]:
import os
import pandas as pd
import numpy as np
import re
import string
import itertools
import unicodedata

from scipy.spatial.distance import cdist
#from scipy.spatial import distance
import numpy as np
from sklearn.preprocessing import MinMaxScaler

from datasets import load_dataset

import nltk
nltk.download('stopwords')
nltk.download('omw-1.4')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize

from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from sentence_transformers import SentenceTransformer, util
from transformers import pipeline


from sklearn.feature_extraction.text import TfidfVectorizer

import warnings
pd.set_option('display.max_rows', None)
pd.set_option("display.max_colwidth", 500)
pd.options.mode.chained_assignment = None
warnings.filterwarnings(action="ignore", message="Mean of empty slice")
warnings.filterwarnings("ignore")

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/simami/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/simami/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to /Users/simami/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/simami/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /Users/simami/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [16]:
def get_info_on_dataset(_df):
    CONST_IN_PERCENTAGES = float(100.0)
    try:
        display(_df.head())
        print("")
        _df.info()
        df_review_info = pd.DataFrame({'Not_Valid_Values_in %': (_df.isna().sum()/len(_df))*CONST_IN_PERCENTAGES} )
        print("")
        display(df_review_info.sort_values(by='Not_Valid_Values_in %', ascending=False))
        print("")
        print("Number of duplicated rows in the dataset: {}".format(_df.duplicated().sum()))
        print("")
        print("Number of unique values in the dataset:\n{}".format(_df.nunique()))
    except:
        print("Can't provide review on given dataset")

def adjust_total_stop_words_list(_stop_words_list):
    total_stop_words_sample = ' '.join(_stop_words_list)
    total_stop_words_sample_processed = unicodedata.normalize('NFKD',
                                                              total_stop_words_sample).encode('ascii','ignore').decode()
    total_stop_words_sample_processed = punctuation_and_case(total_stop_words_sample_processed)
    total_stop_words_list_processed = total_stop_words_sample_processed.split(" ")
    return total_stop_words_list_processed

def punctuation_and_case(text_sample):
    string_punctuation = string.punctuation.replace('-','') + "’"
    punctuation_map = str.maketrans(string_punctuation, ' '*len(string_punctuation))
    return text_sample.translate(punctuation_map)

In [18]:
STOP_WORDS_LANGUAGES_LIST_CONST = ['english'] 
total_stop_words_lists = [stopwords.words(_lng) for _lng in STOP_WORDS_LANGUAGES_LIST_CONST]
total_stop_words_list = list(itertools.chain(*total_stop_words_lists))
TOTAL_STOP_WORDS_LIST_PROCESSED_CONST = adjust_total_stop_words_list(total_stop_words_list)
TF_IDF_MIN_DF_VALUE_CONST = int(3)
TF_IDF_MAX_DF_VALUE_CONST = float(0.5)
MAX_LENGTH = 100
NUM_BEAMS = 3
EARLY_STOP = True

tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-ru")
model_translate = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-en-ru")

model_translate_glaz = "glazzova/ml_translation_model1"
translator = pipeline("translation", model=model_translate_glaz)

In [20]:
def lowering(text_sample):
    processed_words_list = [w.lower() for w in text_sample.split()]
    return ' '.join(processed_words_list)

def stop_words(text_sample, _stop_words_ = TOTAL_STOP_WORDS_LIST_PROCESSED_CONST):
    text_sample_processed = [word for word in text_sample.split() if word not in _stop_words_]
    return ' '.join(text_sample_processed)

def strip_hashtags(text_sample):
    processed_words_list = []
    for word in text_sample.split(' '):
        word = word.strip()
        if word:
            if word.find('@') < 0 and word.find('#') < 0 :
                processed_words_list.append(word)
    return ' '.join(processed_words_list)

def alphanumeric_adjustment(text_sample):
    def if_word_valid(w):
        if w.isalnum() and not w.isalpha() and not w.isdigit():
            return False
        else:
            return True
    processed_words_list = [word for word in text_sample.split() if if_word_valid(word)]
    return ' '.join(processed_words_list)

def nltk_tag_to_wordnet_tag(nltk_tag):
    if nltk_tag.startswith('J'):
        return wordnet.ADJ
    elif nltk_tag.startswith('V'):
        return wordnet.VERB
    elif nltk_tag.startswith('N'):
        return wordnet.NOUN
    elif nltk_tag.startswith('R'):
        return wordnet.ADV
    else:
        return None

def lemm_sample(text_sample):
    wnl = nltk.stem.WordNetLemmatizer()
    nltk_tagged = nltk.pos_tag(nltk.word_tokenize(text_sample))
    wordnet_tagged = map(lambda x: (x[0], nltk_tag_to_wordnet_tag(x[1])), nltk_tagged)
    text_sample_processed = []
    for word, tag in wordnet_tagged:
        if tag is None:
            text_sample_processed.append(word)
        else:
            text_sample_processed.append(wnl.lemmatize(word, tag))
    return ' '.join(text_sample_processed)

def text_sample_lem_processing(text_sample, _stop_words_remove=False):
    text_sample_processed = unicodedata.normalize('NFKD', text_sample).encode('ascii','ignore').decode()
    text_sample_processed = strip_hashtags(text_sample_processed)
    text_sample_processed = punctuation_and_case(text_sample_processed)
    text_sample_processed = lowering(text_sample_processed)
    if _stop_words_remove:
        text_sample_processed = stop_words(text_sample_processed)
    text_sample_processed = lemm_sample(text_sample_processed)
    text_sample_processed = alphanumeric_adjustment(text_sample_processed)
    return text_sample_processed

def translate_text(text):
    sentences = sent_tokenize(text)
    translated_sentences = []
    for sentence in sentences:
        inputs = tokenizer.encode(sentence, return_tensors="pt")
        outputs = model_translate.generate(inputs, max_length=MAX_LENGTH, num_beams=NUM_BEAMS, early_stopping=EARLY_STOP)
        translated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
        translated_sentences.append(translated_text)
    translated_text = " ".join(translated_sentences)
    return translated_text

def translate_text_glaz(text):
    return translator(text)[0]['translation_text']

def calculate_cosine_similarity(v1, v2):
    emb1 = model.encode(v1, convert_to_tensor=True)
    emb2 = model.encode(v2, convert_to_tensor=True)
    cos_sim = util.cos_sim(emb1, emb2)
    return cos_sim.tolist()[0][0]

In [10]:
DATA_DIR = "data"
os.makedirs(f'../{DATA_DIR}', exist_ok=True)

dataset = load_dataset("medical_questions_pairs")
train_df = dataset["train"].to_pandas()
train_df.to_csv(f'../{DATA_DIR}/train_df.csv')

In [11]:
get_info_on_dataset(train_df)

Unnamed: 0,dr_id,question_1,question_2,label
0,1,After how many hour from drinking an antibiotic can I drink alcohol?,I have a party tonight and I took my last dose of Azithromycin this morning. Can I have a few drinks?,1
1,1,After how many hour from drinking an antibiotic can I drink alcohol?,I vomited this morning and I am not sure if it is the side effect of my antibiotic or the alcohol I took last night...,0
2,1,Am I over weight (192.9) for my age (39)?,I am a 39 y/o male currently weighing about 193 lbs. Do you think I am overweight?,1
3,1,Am I over weight (192.9) for my age (39)?,What diet is good for losing weight? Keto or vegan?,0
4,1,Aspirin allergy - is it worth getting a bracelet?,How much Aspirin can I take for my headache without causing any side effects?,0



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3048 entries, 0 to 3047
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   dr_id       3048 non-null   int32 
 1   question_1  3048 non-null   object
 2   question_2  3048 non-null   object
 3   label       3048 non-null   int64 
dtypes: int32(1), int64(1), object(2)
memory usage: 83.5+ KB



Unnamed: 0,Not_Valid_Values_in %
dr_id,0.0
question_1,0.0
question_2,0.0
label,0.0



Number of duplicated rows in the dataset: 0

Number of unique values in the dataset:
dr_id           11
question_1    1524
question_2    3043
label            2
dtype: int64


In [12]:
train_df['label'].value_counts()#normalize=True)

label
1    1524
0    1524
Name: count, dtype: int64

In [66]:
try:
    train_df = pd.read_csv(f'../{DATA_DIR}/train_df_processed.csv', index_col=[0])

except:
    train_df['question_1_processed'] = train_df['question_1'].map(lambda _sample: text_sample_lem_processing(_sample, _stop_words_remove=True))
    train_df['question_2_processed'] = train_df['question_2'].map(lambda _sample: text_sample_lem_processing(_sample, _stop_words_remove=True))
    train_df['len_question_1'] = train_df['question_1'].map(lambda x: len(x)) 
    train_df['len_question_2'] = train_df['question_2'].map(lambda x: len(x)) 
    train_df['text_len_question_1'] = train_df['question_1_processed'].map(lambda x: len(x)) 
    train_df['text_len_question_2'] = train_df['question_2_processed'].map(lambda x: len(x)) 
    train_df['rus_question_1'] = train_df['question_1'].apply(translate_text)
    train_df['rus_question_2'] = train_df['question_2'].apply(translate_text)
    train_df['rusgl_question_1'] = train_df['question_1'].apply(translate_text_glaz)
    train_df['rusgl_question_2'] = train_df['question_2'].apply(translate_text_glaz)

In [21]:
train_df['rusgl_question_1'] = train_df['question_1'].apply(translate_text_glaz)
train_df['rusgl_question_2'] = train_df['question_2'].apply(translate_text_glaz)

In [22]:
get_info_on_dataset(train_df)

Unnamed: 0,dr_id,question_1,question_2,label,question_1_processed,question_2_processed,len_question_1,len_question_2,text_len_question_1,text_len_question_2,rus_question_1,rus_question_2,rusgl_question_1,rusgl_question_2
0,1,After how many hour from drinking an antibiotic can I drink alcohol?,I have a party tonight and I took my last dose of Azithromycin this morning. Can I have a few drinks?,1,many hour drink antibiotic drink alcohol,party tonight take last dose azithromycin morning drink,68,101,40,55,Через сколько часов я смогу выпить антибиотик?,"У меня сегодня вечеринка, и я принял последнюю дозу азитромицина этим утром. Можно мне выпить?",После скольких часов после выпивки антибиотика я смогу выпить спиртного?,Можно мне выпить?
1,1,After how many hour from drinking an antibiotic can I drink alcohol?,I vomited this morning and I am not sure if it is the side effect of my antibiotic or the alcohol I took last night...,0,many hour drink antibiotic drink alcohol,vomit morning sure side effect antibiotic alcohol take last night,68,118,40,65,Через сколько часов я смогу выпить антибиотик?,"Утром меня вырвало, и я не уверен, что это побочный эффект моего антибиотика или алкоголя, который я выпил вчера вечером...",После скольких часов после выпивки антибиотика я смогу выпить спиртного?,"Утром меня вырвало и я не уверена , что это побочный эффект моего антибиотика или спирта , который я выпил прошлой ночью ."
2,1,Am I over weight (192.9) for my age (39)?,I am a 39 y/o male currently weighing about 193 lbs. Do you think I am overweight?,1,weight 192 9 age 39,39 male currently weigh 193 lbs think overweight,41,82,19,48,"Я уже больше веса (192,9) для моего возраста (39)?","У меня 39 й/о самца в настоящее время весит около 193 фунтов. Думаешь, у меня лишний вес?","Гораздо ли я старше своего возраста (192,9 лет) (39 лет) ?","В настоящее время у меня 39 й/о самца весом около 193 фунтов . Думаешь , у меня лишний вес ?"
3,1,Am I over weight (192.9) for my age (39)?,What diet is good for losing weight? Keto or vegan?,0,weight 192 9 age 39,diet good losing weight keto vegan,41,51,19,34,"Я уже больше веса (192,9) для моего возраста (39)?",Какая диета может похудеть? Кето или веган?,"Гораздо ли я старше своего возраста (192,9 лет) (39 лет) ?","Какая диета хороша для снижения веса , Кито или веган ?"
4,1,Aspirin allergy - is it worth getting a bracelet?,How much Aspirin can I take for my headache without causing any side effects?,0,aspirin allergy - worth get bracelet,much aspirin take headache without cause side effect,49,77,36,52,Аллергия на аспирин - стоит ли заполучить браслет?,"Сколько аспирин я могу выдержать от головной боли, не вызывая никаких побочных эффектов?",Аллергия на аспирин — стоит ли заполучить браслет?,Сколько аспирин я могу выдержать от головной боли без каких-либо побочных эффектов?



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3048 entries, 0 to 3047
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   dr_id                 3048 non-null   int32 
 1   question_1            3048 non-null   object
 2   question_2            3048 non-null   object
 3   label                 3048 non-null   int64 
 4   question_1_processed  3048 non-null   object
 5   question_2_processed  3048 non-null   object
 6   len_question_1        3048 non-null   int64 
 7   len_question_2        3048 non-null   int64 
 8   text_len_question_1   3048 non-null   int64 
 9   text_len_question_2   3048 non-null   int64 
 10  rus_question_1        3048 non-null   object
 11  rus_question_2        3048 non-null   object
 12  rusgl_question_1      3048 non-null   object
 13  rusgl_question_2      3048 non-null   object
dtypes: int32(1), int64(5), object(8)
memory usage: 321.6+ KB



Unnamed: 0,Not_Valid_Values_in %
dr_id,0.0
question_1,0.0
question_2,0.0
label,0.0
question_1_processed,0.0
question_2_processed,0.0
len_question_1,0.0
len_question_2,0.0
text_len_question_1,0.0
text_len_question_2,0.0



Number of duplicated rows in the dataset: 0

Number of unique values in the dataset:
dr_id                     11
question_1              1524
question_2              3043
label                      2
question_1_processed    1524
question_2_processed    3041
len_question_1           165
len_question_2           298
text_len_question_1      136
text_len_question_2      195
rus_question_1          1524
rus_question_2          3042
rusgl_question_1        1522
rusgl_question_2        3036
dtype: int64


In [23]:
train_df[['question_1_processed','question_1']].head()

Unnamed: 0,question_1_processed,question_1
0,many hour drink antibiotic drink alcohol,After how many hour from drinking an antibiotic can I drink alcohol?
1,many hour drink antibiotic drink alcohol,After how many hour from drinking an antibiotic can I drink alcohol?
2,weight 192 9 age 39,Am I over weight (192.9) for my age (39)?
3,weight 192 9 age 39,Am I over weight (192.9) for my age (39)?
4,aspirin allergy - worth get bracelet,Aspirin allergy - is it worth getting a bracelet?


In [24]:
train_df[['question_2_processed','question_2']].head()

Unnamed: 0,question_2_processed,question_2
0,party tonight take last dose azithromycin morning drink,I have a party tonight and I took my last dose of Azithromycin this morning. Can I have a few drinks?
1,vomit morning sure side effect antibiotic alcohol take last night,I vomited this morning and I am not sure if it is the side effect of my antibiotic or the alcohol I took last night...
2,39 male currently weigh 193 lbs think overweight,I am a 39 y/o male currently weighing about 193 lbs. Do you think I am overweight?
3,diet good losing weight keto vegan,What diet is good for losing weight? Keto or vegan?
4,much aspirin take headache without cause side effect,How much Aspirin can I take for my headache without causing any side effects?


In [25]:
print(len(train_df[train_df['len_question_1'] <= 0]))
print(len(train_df[train_df['len_question_2'] <= 0]))
print(len(train_df[train_df['text_len_question_1'] <= 0]))
print(len(train_df[train_df['text_len_question_2'] <= 0]))

0
0
0
0


In [26]:
train_df.describe()

Unnamed: 0,dr_id,label,len_question_1,len_question_2,text_len_question_1,text_len_question_2
count,3048.0,3048.0,3048.0,3048.0,3048.0,3048.0
mean,6.037402,0.5,99.858924,110.334646,60.827428,65.508202
std,3.084721,0.500082,45.64073,63.029785,29.218568,37.832399
min,1.0,0.0,20.0,15.0,8.0,3.0
25%,3.0,0.0,58.0,62.0,36.0,37.0
50%,6.0,0.5,94.0,95.0,57.0,56.0
75%,9.0,1.0,144.0,146.0,83.25,86.0
max,11.0,1.0,255.0,404.0,191.0,257.0


In [27]:
column_to_translate = ['question_1','question_2']
translated_columns = ['rus_question_1','rus_question_2']
train_df[translated_columns+column_to_translate+['label']].head()

Unnamed: 0,rus_question_1,rus_question_2,question_1,question_2,label
0,Через сколько часов я смогу выпить антибиотик?,"У меня сегодня вечеринка, и я принял последнюю дозу азитромицина этим утром. Можно мне выпить?",After how many hour from drinking an antibiotic can I drink alcohol?,I have a party tonight and I took my last dose of Azithromycin this morning. Can I have a few drinks?,1
1,Через сколько часов я смогу выпить антибиотик?,"Утром меня вырвало, и я не уверен, что это побочный эффект моего антибиотика или алкоголя, который я выпил вчера вечером...",After how many hour from drinking an antibiotic can I drink alcohol?,I vomited this morning and I am not sure if it is the side effect of my antibiotic or the alcohol I took last night...,0
2,"Я уже больше веса (192,9) для моего возраста (39)?","У меня 39 й/о самца в настоящее время весит около 193 фунтов. Думаешь, у меня лишний вес?",Am I over weight (192.9) for my age (39)?,I am a 39 y/o male currently weighing about 193 lbs. Do you think I am overweight?,1
3,"Я уже больше веса (192,9) для моего возраста (39)?",Какая диета может похудеть? Кето или веган?,Am I over weight (192.9) for my age (39)?,What diet is good for losing weight? Keto or vegan?,0
4,Аллергия на аспирин - стоит ли заполучить браслет?,"Сколько аспирин я могу выдержать от головной боли, не вызывая никаких побочных эффектов?",Aspirin allergy - is it worth getting a bracelet?,How much Aspirin can I take for my headache without causing any side effects?,0


In [28]:
corpus = pd.concat([train_df['question_1_processed'],train_df['question_2_processed']]).drop_duplicates(keep='first')
corpus.head(), corpus.shape

(0                                                                            many hour drink antibiotic drink alcohol
 2                                                                                                 weight 192 9 age 39
 4                                                                                aspirin allergy - worth get bracelet
 6                          doctor visit hit head box wall contain hazardous material use syrinx use needle get infect
 8    antibiotic 4 top high tooth dentist cld get needle 2 freeze 2 extract gum really hurt say tissue hve 2 go bk plz
 dtype: object,
 (4562,))

In [29]:
corpus_full = pd.concat([train_df['question_1'],train_df['question_2']]).drop_duplicates(keep='first')
corpus_full.head(), corpus_full.shape

(0                                                                                    After how many hour from drinking an antibiotic can I drink alcohol?
 2                                                                                                               Am I over weight (192.9) for my age (39)?
 4                                                                                                       Aspirin allergy - is it worth getting a bracelet?
 6      At a doctor's visit, I hit my head against a box on the wall containing hazardous materials (used syringes, used needles...). Will I get infected?
 8    Been on antibiotics 4 5wks top high tooth dentist cld not get needle 2 freeze 2 extract in gum really hurt she said its the tissues hve 2 go bk? Plz
 dtype: object,
 (4567,))

In [30]:
tf_idf_vectorizer = TfidfVectorizer()

In [31]:
tf_idf_corpus = tf_idf_vectorizer.fit_transform(corpus)
tf_idf_q1 = tf_idf_vectorizer.transform(train_df['question_1_processed'])
tf_idf_q2 = tf_idf_vectorizer.transform(train_df['question_2_processed'])
tf_idf_q1_array = tf_idf_q1.toarray()
tf_idf_q2_array = tf_idf_q2.toarray()
tf_idf_corpus_array = tf_idf_corpus.toarray()

In [32]:
tf_idf_corpus_full = tf_idf_vectorizer.fit_transform(corpus_full)
tf_idf_q1_full = tf_idf_vectorizer.transform(train_df['question_1'])
tf_idf_q2_full = tf_idf_vectorizer.transform(train_df['question_2'])
tf_idf_q1_full_array = tf_idf_q1_full.toarray()
tf_idf_q2_full_array = tf_idf_q2_full.toarray()
tf_idf_corpus_full_array = tf_idf_corpus_full.toarray()

In [33]:
#tf_idf_vectorizer.vocabulary_

In [34]:
tf_idf_corpus_array.shape

(4562, 5004)

In [35]:
tf_idf_q1_ = pd.Series(tf_idf_q1_array.tolist())
tf_idf_q2_ = pd.Series(tf_idf_q2_array.tolist())
train_df['tf_idf_q1'] = tf_idf_q1_
train_df['tf_idf_q2'] = tf_idf_q2_
train_df['cdist_cb'] = [cdist([row['tf_idf_q1']], [row['tf_idf_q2']],'cityblock')[0][0] for _, row in train_df.iterrows()]
train_df['cdist_eu'] = [cdist([row['tf_idf_q1']], [row['tf_idf_q2']],'euclidean')[0][0] for _, row in train_df.iterrows()]

tf_idf_q1_full_ = pd.Series(tf_idf_q1_full_array.tolist())
tf_idf_q2_full_ = pd.Series(tf_idf_q2_full_array.tolist())
train_df['tf_idf_q1_full'] = tf_idf_q1_full_
train_df['tf_idf_q2_full'] = tf_idf_q2_full_
train_df['cdist_cb_full'] = [cdist([row['tf_idf_q1_full']], [row['tf_idf_q2_full']],'cityblock')[0][0] for _, row in train_df.iterrows()]
train_df['cdist_eu_full'] = [cdist([row['tf_idf_q1_full']], [row['tf_idf_q2_full']],'euclidean')[0][0] for _, row in train_df.iterrows()]

In [36]:
train_df[['question_1','question_2','label','cdist_cb','cdist_eu','cdist_cb_full','cdist_eu_full']].head()#.query('question_1 == @s')

Unnamed: 0,question_1,question_2,label,cdist_cb,cdist_eu,cdist_cb_full,cdist_eu_full
0,After how many hour from drinking an antibiotic can I drink alcohol?,I have a party tonight and I took my last dose of Azithromycin this morning. Can I have a few drinks?,1,4.174792,1.234628,6.473491,1.403838
1,After how many hour from drinking an antibiotic can I drink alcohol?,I vomited this morning and I am not sure if it is the side effect of my antibiotic or the alcohol I took last night...,0,3.79229,1.212119,6.186609,1.242795
2,Am I over weight (192.9) for my age (39)?,I am a 39 y/o male currently weighing about 193 lbs. Do you think I am overweight?,1,3.886488,1.231677,4.704204,1.221158
3,Am I over weight (192.9) for my age (39)?,What diet is good for losing weight? Keto or vegan?,0,3.759526,1.345225,4.581715,1.335119
4,Aspirin allergy - is it worth getting a bracelet?,How much Aspirin can I take for my headache without causing any side effects?,0,3.968279,1.242348,4.974074,1.274626


In [37]:
model = SentenceTransformer("paraphrase-MiniLM-L6-v2")

embeddings1 = model.encode(train_df['question_1'].tolist())
embeddings2 = model.encode(train_df['question_2'].tolist())
train_df['question_1_mLM'] = embeddings1.tolist()
train_df['question_2_mLM'] = embeddings2.tolist()
train_df['cos_sim_mLM'] = train_df.apply(lambda x: calculate_cosine_similarity(x['question_1'], x['question_2']), axis=1)

In [38]:
train_df[['question_1','question_2','label','cdist_cb','cdist_eu','cos_sim_mLM']].query('label == 1').sort_values(by='cos_sim_mLM', ascending=False).tail(5)

Unnamed: 0,question_1,question_2,label,cdist_cb,cdist_eu,cos_sim_mLM
626,How can I deal with anxeity without medication?,I have been dealing with bad anxiety for the past few weeks and I do not want to take any medicine. Are there any remedies or simple things that I can try to feel better?,1,4.998515,1.307715,0.314939
151,Bright red blood on TP after first wipe after BM; then other wipes no more blood. I drink way to much pop and very little water...if that matters. ?,What does blood in stool mean?,1,4.674102,1.300256,0.272176
1284,"My bm aren't solid but not quite loose. Looks more like for lack of better word ""shredded"" the why is this?",What causes stringy and shredded stools?,1,4.991187,1.414214,0.269483
47,How long does it take for herpes to break out?,I had unprotected sex on the 7th of this month. Is there a specific time that the first outbreak occurs?,1,4.677719,1.414214,0.231232
1483,If your number two is hard and dark what does that mean?,What can cause a hard and dark stool?,1,2.325646,0.978183,0.170247


In [39]:
train_df[['question_1','question_2','label','cdist_cb','cdist_eu','cos_sim_mLM']].query('label == 1').sort_values(by='cos_sim_mLM', ascending=False).head(5)

Unnamed: 0,question_1,question_2,label,cdist_cb,cdist_eu,cos_sim_mLM
2430,What are some really good ways to stimulate adrenaline?,Are there any good ways to stimulate adrenaline?,1,0.542992,0.396806,0.983748
2762,Are monochorionic-diamniotic twins usually healthy at birth?,Are monochorionic-diamniotic twins healthy at the time of birth?,1,0.667612,0.43532,0.980839
2154,What are the common symptoms of bruises?,Describe some common symptoms of bruises.,1,1.054575,0.689815,0.974392
1957,Can seroquel (quetiapine) cause gerd?,Is seroquel (quetiapine) known to cause GERD?,1,0.347138,0.276526,0.97375
1154,How are batteries chosen to be put in pacemakers?,How are the batteries selected that are put into pacemakers?,1,1.120431,0.760559,0.972377


In [40]:
train_df.describe()

Unnamed: 0,dr_id,label,len_question_1,len_question_2,text_len_question_1,text_len_question_2,cdist_cb,cdist_eu,cdist_cb_full,cdist_eu_full,cos_sim_mLM
count,3048.0,3048.0,3048.0,3048.0,3048.0,3048.0,3048.0,3048.0,3048.0,3048.0,3048.0
mean,6.037402,0.5,99.858924,110.334646,60.827428,65.508202,3.633939,1.056634,5.062471,1.117168,0.668521
std,3.084721,0.500082,45.64073,63.029785,29.218568,37.832399,1.411628,0.23379,1.605644,0.192646,0.164099
min,1.0,0.0,20.0,15.0,8.0,3.0,0.0,0.0,0.17693,0.14879,0.000157
25%,3.0,0.0,58.0,62.0,36.0,37.0,2.625003,0.910502,3.925967,1.002222,0.561743
50%,6.0,0.5,94.0,95.0,57.0,56.0,3.604927,1.089296,5.069853,1.144142,0.686542
75%,9.0,1.0,144.0,146.0,83.25,86.0,4.659162,1.233493,6.250337,1.258669,0.79407
max,11.0,1.0,255.0,404.0,191.0,257.0,7.906999,1.414214,9.875228,1.414214,0.983748


In [41]:
scaler_cols = ['cdist_cb','cdist_eu','cdist_cb_full','cdist_eu_full']
scaled_cols = ['cdist_cb_scl','cdist_eu_scl','cdist_cb_full_scl','cdist_eu_full_scl']

scaler = MinMaxScaler()

scaled_data = pd.DataFrame(scaler.fit_transform(train_df[scaler_cols]), columns=scaled_cols)
scaled_data.describe()

Unnamed: 0,cdist_cb_scl,cdist_eu_scl,cdist_cb_full_scl,cdist_eu_full_scl
count,3048.0,3048.0,3048.0,3048.0
mean,0.459585,0.747153,0.503752,0.76526
std,0.178529,0.165315,0.165559,0.152238
min,0.0,0.0,0.0,0.0
25%,0.331985,0.643822,0.386566,0.674424
50%,0.455916,0.770249,0.504514,0.786576
75%,0.589245,0.872212,0.626234,0.877081
max,1.0,1.0,1.0,1.0


In [42]:
train_df = pd.concat([train_df,scaled_data],axis=1)
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3048 entries, 0 to 3047
Data columns (total 29 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   dr_id                 3048 non-null   int32  
 1   question_1            3048 non-null   object 
 2   question_2            3048 non-null   object 
 3   label                 3048 non-null   int64  
 4   question_1_processed  3048 non-null   object 
 5   question_2_processed  3048 non-null   object 
 6   len_question_1        3048 non-null   int64  
 7   len_question_2        3048 non-null   int64  
 8   text_len_question_1   3048 non-null   int64  
 9   text_len_question_2   3048 non-null   int64  
 10  rus_question_1        3048 non-null   object 
 11  rus_question_2        3048 non-null   object 
 12  rusgl_question_1      3048 non-null   object 
 13  rusgl_question_2      3048 non-null   object 
 14  tf_idf_q1             3048 non-null   object 
 15  tf_idf_q2            

In [43]:
train_df.query('label == 0').describe()

Unnamed: 0,dr_id,label,len_question_1,len_question_2,text_len_question_1,text_len_question_2,cdist_cb,cdist_eu,cdist_cb_full,cdist_eu_full,cos_sim_mLM,cdist_cb_scl,cdist_eu_scl,cdist_cb_full_scl,cdist_eu_full_scl
count,1524.0,1524.0,1524.0,1524.0,1524.0,1524.0,1524.0,1524.0,1524.0,1524.0,1524.0,1524.0,1524.0,1524.0,1524.0
mean,6.037402,0.0,99.858924,102.805774,60.827428,61.537402,4.086813,1.147904,5.569258,1.199737,0.578167,0.51686,0.811691,0.556008,0.83051
std,3.085227,0.0,45.648221,63.587674,29.223364,38.311359,1.345463,0.192019,1.49717,0.151092,0.149041,0.170161,0.135778,0.154375,0.1194
min,1.0,0.0,20.0,15.0,8.0,3.0,0.0,0.0,1.183838,0.46568,0.000157,0.0,0.0,0.103823,0.250422
25%,3.0,0.0,58.0,54.0,36.0,33.0,3.126852,1.032858,4.51147,1.116908,0.481035,0.395454,0.730341,0.446938,0.765055
50%,6.0,0.0,94.0,84.0,57.0,50.0,4.106692,1.183101,5.637315,1.223739,0.593105,0.519374,0.836579,0.563025,0.849477
75%,9.0,0.0,144.0,139.0,83.25,81.25,5.042316,1.291673,6.659042,1.308157,0.681844,0.637703,0.91335,0.668376,0.916189
max,11.0,0.0,255.0,370.0,191.0,254.0,7.906999,1.414214,9.875228,1.414214,0.96688,1.0,1.0,1.0,1.0


In [44]:
train_df.query('label == 1').describe()

Unnamed: 0,dr_id,label,len_question_1,len_question_2,text_len_question_1,text_len_question_2,cdist_cb,cdist_eu,cdist_cb_full,cdist_eu_full,cos_sim_mLM,cdist_cb_scl,cdist_eu_scl,cdist_cb_full_scl,cdist_eu_full_scl
count,1524.0,1524.0,1524.0,1524.0,1524.0,1524.0,1524.0,1524.0,1524.0,1524.0,1524.0,1524.0,1524.0,1524.0,1524.0
mean,6.037402,1.0,99.858924,117.863517,60.827428,69.479003,3.181065,0.965363,4.555684,1.034599,0.758876,0.40231,0.682615,0.451497,0.70001
std,3.085227,0.0,45.648221,61.573382,29.223364,36.935147,1.328896,0.236239,1.549952,0.194361,0.123785,0.168066,0.167046,0.159817,0.153593
min,1.0,1.0,20.0,20.0,8.0,11.0,0.0,0.0,0.17693,0.14879,0.170247,0.0,0.0,0.0,0.0
25%,3.0,1.0,58.0,70.0,36.0,42.0,2.237006,0.820551,3.452167,0.921481,0.691498,0.282915,0.580217,0.337713,0.610619
50%,6.0,1.0,94.0,105.0,57.0,62.0,3.119905,0.994338,4.521077,1.053182,0.780398,0.394575,0.703103,0.447929,0.714695
75%,9.0,1.0,144.0,153.0,83.25,89.0,4.089109,1.135938,5.7054,1.17396,0.846482,0.517151,0.803229,0.570045,0.81014
max,11.0,1.0,255.0,404.0,191.0,257.0,7.079948,1.414214,9.481837,1.414214,0.983748,0.895403,1.0,0.959437,1.0


In [45]:
def accuracy_topN(question_1, question_2, def_predict, metric_=None, N=5):

    corr_pred = 0
    all_pred = 0
    
    question_1_array = np.vstack(question_1)
    question_2_array = np.vstack(question_2)

    if def_predict=='cdist_':
        predict_1 = np.array([cdist(question_2_array, np.expand_dims(q1, axis=0), metric=metric_).reshape(-1) for q1 in question_1_array])
        predict_2 = np.array([cdist(question_1_array, np.expand_dims(q2, axis=0), metric=metric_).reshape(-1) for q2 in question_2_array])
    elif def_predict=='cos_sim_':
        predict_1 = np.array([util.cos_sim(question_2_array, q1).reshape(-1) for q1 in question_1_array])
        predict_2 = np.array([util.cos_sim(question_1_array, q2).reshape(-1) for q2 in question_2_array])
        
    for i in range(len(question_1_array)):
        values_top = np.argsort(predict_1[i])[::-1][:N]  
        if i in values_top:
            corr_pred += 1
        all_pred += 1
    
    for i in range(len(question_2_array)):
        values_top = np.argsort(predict_2[i])[::-1][:N]
        if i in values_top:
            corr_pred += 1
        all_pred += 1
    
    accuracy_top = corr_pred / all_pred
    return accuracy_top


In [46]:
accuracy_top5_cos_sim = accuracy_topN(train_df['question_1_mLM'], train_df['question_2_mLM'], def_predict='cos_sim_', N=5)
accuracy_top10_cos_sim = accuracy_topN(train_df['question_1_mLM'], train_df['question_2_mLM'], def_predict='cos_sim_', N=10)
accuracy_top5_cos_sim, accuracy_top10_cos_sim

(0.7791994750656168, 0.8315288713910761)

In [47]:
accuracy_top5_cos_sim_tf = accuracy_topN(train_df['tf_idf_q1'], train_df['tf_idf_q2'], def_predict='cos_sim_', N=5)
accuracy_top10_cos_sim_tf = accuracy_topN(train_df['tf_idf_q1'], train_df['tf_idf_q2'], def_predict='cos_sim_', N=10)
accuracy_top5_cos_sim_tf, accuracy_top10_cos_sim_tf

(0.7624671916010499, 0.8157808398950132)

In [48]:
accuracy_top5_cdist_cb = accuracy_topN(train_df['question_1_mLM'], train_df['question_2_mLM'], def_predict='cdist_', metric_='cityblock', N=5)
accuracy_top10_cdist_cb = accuracy_topN(train_df['question_1_mLM'], train_df['question_2_mLM'], def_predict='cdist_', metric_='cityblock', N=10)
accuracy_top5_cdist_cb, accuracy_top10_cdist_cb

(0.0, 0.0)

In [49]:
accuracy_top5_cdist_eu = accuracy_topN(train_df['question_1_mLM'], train_df['question_2_mLM'], def_predict='cdist_', metric_='euclidean', N=5)
accuracy_top10_cdist_eu = accuracy_topN(train_df['question_1_mLM'], train_df['question_2_mLM'], def_predict='cdist_', metric_='euclidean', N=10)
accuracy_top5_cdist_eu, accuracy_top10_cdist_eu

(0.0, 0.0)

In [50]:
# accuracy_top5_cdist_cb_tf = accuracy_topN(train_df['tf_idf_q1'], train_df['tf_idf_q2'], def_predict='cdist_', metric_='cityblock', N=5)
# accuracy_top10_cdist_cb_tf = accuracy_topN(train_df['tf_idf_q1'], train_df['tf_idf_q2'], def_predict='cdist_', metric_='cityblock', N=10)
# accuracy_top5_cdist_cb_tf, accuracy_top10_cdist_cb_tf

# accuracy_top5_cdist_eu_tf = accuracy_topN(train_df['tf_idf_q1'], train_df['tf_idf_q2'], def_predict='cdist_', metric_='euclidean', N=5)
# accuracy_top10_cdist_eu_tf = accuracy_topN(train_df['tf_idf_q1'], train_df['tf_idf_q2'], def_predict='cdist_', metric_='euclidean', N=10)
# accuracy_top5_cdist_eu_tf, accuracy_top10_cdist_eu_tf

# accuracy_top5_cos_sim_tf_full = accuracy_topN(train_df['tf_idf_q1_full'], train_df['tf_idf_q2_full'], def_predict='cos_sim_', N=5)
# accuracy_top10_cos_sim_tf_full = accuracy_topN(train_df['tf_idf_q1_full'], train_df['tf_idf_q2_full'], def_predict='cos_sim_', N=10)
# accuracy_top5_cos_sim_tf_full, accuracy_top10_cos_sim_tf_full

# accuracy_top5_cdist_cb_tf_full = accuracy_topN(train_df['tf_idf_q1_full'], train_df['tf_idf_q2_full'], def_predict='cdist_', metric_='cityblock', N=5)
# accuracy_top10_cdist_cb_tf_full = accuracy_topN(train_df['tf_idf_q1_full'], train_df['tf_idf_q2_full'], def_predict='cdist_', metric_='cityblock', N=10)
# accuracy_top5_cdist_cb_tf_full, accuracy_top10_cdist_cb_tf_full

# accuracy_top5_cdist_eu_tf_full = accuracy_topN(train_df['tf_idf_q1_full'], train_df['tf_idf_q2_full'], def_predict='cdist_', metric_='euclidean', N=5)
# accuracy_top10_cdist_eu_tf_full = accuracy_topN(train_df['tf_idf_q1_full'], train_df['tf_idf_q2_full'], def_predict='cdist_', metric_='euclidean', N=10)
# accuracy_top5_cdist_eu_tf_full, accuracy_top10_cdist_eu_tf_full

In [51]:
questions = pd.concat([train_df['question_1'],train_df['question_2']],axis=0).drop_duplicates(keep='first')
questions_embeddings = model.encode(questions.tolist())

In [64]:
q = questions.sample().values[0]

inputs = tokenizer.encode(q, return_tensors="pt")
outputs = model_translate.generate(inputs, max_length=1000, num_beams=5, early_stopping=False)
translated_question = tokenizer.decode(outputs[0], skip_special_tokens=True)

question_embedding = model.encode(q)
predict = np.array([util.cos_sim(questions_embeddings, question_embedding)])

N = 11
values_top = np.argsort(predict.reshape(-1))[::-1][:N+1]

df_ = pd.DataFrame(columns=['questions', 'translated_questions','cos_sim']) 
for val in values_top:
    if questions.tolist()[val] == q:
        pass        
    else: 
        q_ = questions.tolist()[val]
        cos_sim_ = predict.reshape(-1)[val].round(1)
        translated_q_ = translate_text(q_)
        df_.loc[val] = [q_,translated_q_,cos_sim_]

print(q, translated_question)
display(df_[['questions', 'translated_questions','cos_sim']])

Is it normal for ears to pop if one lifts heavy weights? Это нормально для ушей, если кто-то поднимает тяжелые грузы?


Unnamed: 0,questions,translated_questions,cos_sim
823,"Why do my ears ""pop"" when I am weight lifting?","Почему мои уши ""поп"", когда я поднимаю вес?",0.8
2191,Do you think it is okay to use OTC ear drops if you have swimmer's ear?,"Ты думаешь, это нормально использовать уши OTC, если у тебя есть ухо пловца?",0.6
3165,Can ear popping be due to an infection?,Может ли вырваться ухо из-за инфекции?,0.5
3482,Is diving/swimming in a pool allowed while dealing with an ear infection?,Разрешается ли нырять/плавать в бассейне при борьбе с ушной инфекцией?,0.5
2072,"I think my ears are not producing enough wax, is it some medical conditon?","Я думаю, что мои уши не производят достаточно воска, это какой-то медицинский кондитон?",0.5
2766,Is it safe to soak my ear rings in rubbing alcohol before putting them on?,"Безопасно промочить мои уши кольца, втирая алкоголь, прежде чем надеть их?",0.5
1296,"I have tenderness/swelling in front of my right ear, most noticeable when mouth is open, feels like it could be top of jaw. Sharp pain in ear area too?","У меня нежность и нежность перед моим правым ухом, наиболее заметная, когда рот открыт, кажется, что он может быть верх челюсти. Острая боль в ушах тоже?",0.5
795,Pushups - should hr or BP drop first? for me the hr drops quick but neck still pulsating very hard so is that dangerous to have low hr and high bp?,"Нажимать - сначала нужно грохнуть чёрный или BP? Для меня время падает быстро, но шея все еще пульсирует очень сильно, так что это опасно иметь низкое чёрное и высокое давление?",0.5
4111,"I have perceptible tenderness and swelling in front of my ear, mostly on opening of the jaw along with ear pain. What could cause it?","У меня заметная нежность и опухоль перед ухом, главным образом при открытии челюсти вместе с болью в ухе. Что может вызвать это?",0.5
3563,How does an ear tube work?,Как работает ушная трубка?,0.5


In [65]:
train_df.query('(question_1 == @q|question_2 == @q)')[['question_1','question_2','label','cos_sim_mLM']]

Unnamed: 0,question_1,question_2,label,cos_sim_mLM
1647,"Why do my ears ""pop"" when I am weight lifting?",Is it normal for ears to pop if one lifts heavy weights?,1,0.805086


In [63]:
train_df.to_csv(f'../{DATA_DIR}/train_df_processed.csv')
train_df = pd.read_csv(f'../{DATA_DIR}/train_df_processed.csv', index_col=[0])

In [62]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3048 entries, 0 to 3047
Data columns (total 29 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   dr_id                 3048 non-null   int64  
 1   question_1            3048 non-null   object 
 2   question_2            3048 non-null   object 
 3   label                 3048 non-null   int64  
 4   question_1_processed  3048 non-null   object 
 5   question_2_processed  3048 non-null   object 
 6   len_question_1        3048 non-null   int64  
 7   len_question_2        3048 non-null   int64  
 8   text_len_question_1   3048 non-null   int64  
 9   text_len_question_2   3048 non-null   int64  
 10  rus_question_1        3048 non-null   object 
 11  rus_question_2        3048 non-null   object 
 12  rusgl_question_1      3048 non-null   object 
 13  rusgl_question_2      3048 non-null   object 
 14  tf_idf_q1             3048 non-null   object 
 15  tf_idf_q2             3048