In [1]:
import os
import re
import string
import itertools
import unicodedata

import pandas as pd
import numpy as np

from scipy.spatial.distance import cdist
#from scipy.spatial import distance
from sklearn.preprocessing import MinMaxScaler

from datasets import load_dataset

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

nltk.download('stopwords')
nltk.download('omw-1.4')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
from sentence_transformers import SentenceTransformer, util

from sklearn.feature_extraction.text import TfidfVectorizer

import warnings

pd.set_option('display.max_rows', None)
pd.set_option("display.max_colwidth", 500)
pd.options.mode.chained_assignment = None
warnings.filterwarnings(action="ignore", message="Mean of empty slice")
warnings.filterwarnings("ignore")

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/simami/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/simami/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to /Users/simami/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/simami/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /Users/simami/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
def get_info_on_dataset(_df):
    CONST_IN_PERCENTAGES = float(100.0)
    try:
        display(_df.head())
        print("")
        _df.info()
        df_review_info = pd.DataFrame({'Not_Valid_Values_in %': (_df.isna().sum()/len(_df))*CONST_IN_PERCENTAGES} )
        print("")
        display(df_review_info.sort_values(by='Not_Valid_Values_in %', ascending=False))
        print("")
        print("Number of duplicated rows in the dataset: {}".format(_df.duplicated().sum()))
        print("")
        print("Number of unique values in the dataset:\n{}".format(_df.nunique()))
    except:
        print("Can't provide review on given dataset")

def adjust_total_stop_words_list(_stop_words_list):
    total_stop_words_sample = ' '.join(_stop_words_list)
    total_stop_words_sample_processed = unicodedata.normalize('NFKD',
                                                              total_stop_words_sample).encode('ascii','ignore').decode()
    total_stop_words_sample_processed = punctuation_and_case(total_stop_words_sample_processed)
    total_stop_words_list_processed = total_stop_words_sample_processed.split(" ")
    return total_stop_words_list_processed

def punctuation_and_case(text_sample):
    string_punctuation = string.punctuation.replace('-','') + "’"
    punctuation_map = str.maketrans(string_punctuation, ' '*len(string_punctuation))
    return text_sample.translate(punctuation_map)

In [3]:
STOP_WORDS_LANGUAGES_LIST_CONST = ['english'] 
total_stop_words_lists = [stopwords.words(_lng) for _lng in STOP_WORDS_LANGUAGES_LIST_CONST]
total_stop_words_list = list(itertools.chain(*total_stop_words_lists))
TOTAL_STOP_WORDS_LIST_PROCESSED_CONST = adjust_total_stop_words_list(total_stop_words_list)
TF_IDF_MIN_DF_VALUE_CONST = int(3)
TF_IDF_MAX_DF_VALUE_CONST = float(0.5)
MAX_LENGTH = 100
NUM_BEAMS = 3
EARLY_STOP = True

tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-ru")
model_translate = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-en-ru")

model_translate_glaz = "glazzova/ml_translation_model1"
translator = pipeline("translation", model=model_translate_glaz)

In [4]:
def lowering(text_sample):
    processed_words_list = [w.lower() for w in text_sample.split()]
    return ' '.join(processed_words_list)

def stop_words(text_sample, _stop_words_ = TOTAL_STOP_WORDS_LIST_PROCESSED_CONST):
    text_sample_processed = [word for word in text_sample.split() if word not in _stop_words_]
    return ' '.join(text_sample_processed)

def strip_hashtags(text_sample):
    processed_words_list = []
    for word in text_sample.split(' '):
        word = word.strip()
        if word:
            if word.find('@') < 0 and word.find('#') < 0 :
                processed_words_list.append(word)
    return ' '.join(processed_words_list)

def alphanumeric_adjustment(text_sample):
    def if_word_valid(w):
        if w.isalnum() and not w.isalpha() and not w.isdigit():
            return False
        else:
            return True
    processed_words_list = [word for word in text_sample.split() if if_word_valid(word)]
    return ' '.join(processed_words_list)

def nltk_tag_to_wordnet_tag(nltk_tag):
    if nltk_tag.startswith('J'):
        return wordnet.ADJ
    elif nltk_tag.startswith('V'):
        return wordnet.VERB
    elif nltk_tag.startswith('N'):
        return wordnet.NOUN
    elif nltk_tag.startswith('R'):
        return wordnet.ADV
    else:
        return None

def lemm_sample(text_sample):
    wnl = nltk.stem.WordNetLemmatizer()
    nltk_tagged = nltk.pos_tag(nltk.word_tokenize(text_sample))
    wordnet_tagged = map(lambda x: (x[0], nltk_tag_to_wordnet_tag(x[1])), nltk_tagged)
    text_sample_processed = []
    for word, tag in wordnet_tagged:
        if tag is None:
            text_sample_processed.append(word)
        else:
            text_sample_processed.append(wnl.lemmatize(word, tag))
    return ' '.join(text_sample_processed)

def text_sample_lem_processing(text_sample, _stop_words_remove=False):
    text_sample_processed = unicodedata.normalize('NFKD', text_sample).encode('ascii','ignore').decode()
    text_sample_processed = strip_hashtags(text_sample_processed)
    text_sample_processed = punctuation_and_case(text_sample_processed)
    text_sample_processed = lowering(text_sample_processed)
    if _stop_words_remove:
        text_sample_processed = stop_words(text_sample_processed)
    text_sample_processed = lemm_sample(text_sample_processed)
    text_sample_processed = alphanumeric_adjustment(text_sample_processed)
    return text_sample_processed

def translate_text(text):
    sentences = sent_tokenize(text)
    translated_sentences = []
    for sentence in sentences:
        inputs = tokenizer.encode(sentence, return_tensors="pt")
        outputs = model_translate.generate(inputs, max_length=MAX_LENGTH, num_beams=NUM_BEAMS, early_stopping=EARLY_STOP)
        translated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
        translated_sentences.append(translated_text)
    translated_text = " ".join(translated_sentences)
    return translated_text

def translate_text_glaz(text):
    return translator(text)[0]['translation_text']

def calculate_cosine_similarity(v1, v2):
    emb1 = model.encode(v1, convert_to_tensor=True)
    emb2 = model.encode(v2, convert_to_tensor=True)
    cos_sim = util.cos_sim(emb1, emb2)
    return cos_sim.tolist()[0][0]

In [5]:
DATA_DIR = "data"
os.makedirs(f'../{DATA_DIR}', exist_ok=True)

dataset = load_dataset("medical_questions_pairs")
train_df = dataset["train"].to_pandas()
train_df.to_csv(f'../{DATA_DIR}/train_df.csv')

In [6]:
get_info_on_dataset(train_df)

Unnamed: 0,dr_id,question_1,question_2,label
0,1,After how many hour from drinking an antibiotic can I drink alcohol?,I have a party tonight and I took my last dose of Azithromycin this morning. Can I have a few drinks?,1
1,1,After how many hour from drinking an antibiotic can I drink alcohol?,I vomited this morning and I am not sure if it is the side effect of my antibiotic or the alcohol I took last night...,0
2,1,Am I over weight (192.9) for my age (39)?,I am a 39 y/o male currently weighing about 193 lbs. Do you think I am overweight?,1
3,1,Am I over weight (192.9) for my age (39)?,What diet is good for losing weight? Keto or vegan?,0
4,1,Aspirin allergy - is it worth getting a bracelet?,How much Aspirin can I take for my headache without causing any side effects?,0



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3048 entries, 0 to 3047
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   dr_id       3048 non-null   int32 
 1   question_1  3048 non-null   object
 2   question_2  3048 non-null   object
 3   label       3048 non-null   int64 
dtypes: int32(1), int64(1), object(2)
memory usage: 83.5+ KB



Unnamed: 0,Not_Valid_Values_in %
dr_id,0.0
question_1,0.0
question_2,0.0
label,0.0



Number of duplicated rows in the dataset: 0

Number of unique values in the dataset:
dr_id           11
question_1    1524
question_2    3043
label            2
dtype: int64


In [8]:
train_df['label'].value_counts()#normalize=True)

label
1    1524
0    1524
Name: count, dtype: int64

In [9]:
try:
    train_df = pd.read_csv(f'../{DATA_DIR}/train_df_processed.csv', index_col=[0])

except:
    train_df['question_1_processed'] = train_df['question_1'].map(lambda _sample: text_sample_lem_processing(_sample, _stop_words_remove=True))
    train_df['question_2_processed'] = train_df['question_2'].map(lambda _sample: text_sample_lem_processing(_sample, _stop_words_remove=True))
    train_df['len_question_1'] = train_df['question_1'].map(lambda x: len(x)) 
    train_df['len_question_2'] = train_df['question_2'].map(lambda x: len(x)) 
    train_df['text_len_question_1'] = train_df['question_1_processed'].map(lambda x: len(x)) 
    train_df['text_len_question_2'] = train_df['question_2_processed'].map(lambda x: len(x)) 
    train_df['rus_question_1'] = train_df['question_1'].apply(translate_text)
    train_df['rus_question_2'] = train_df['question_2'].apply(translate_text)
    train_df['rusgl_question_1'] = train_df['question_1'].apply(translate_text_glaz)
    train_df['rusgl_question_2'] = train_df['question_2'].apply(translate_text_glaz)

In [10]:
get_info_on_dataset(train_df)

Unnamed: 0,dr_id,question_1,question_2,label,question_1_processed,question_2_processed,len_question_1,len_question_2,text_len_question_1,text_len_question_2,...,question_2_mLM,cos_sim_mLM,cdist_cb_scl,cdist_eu_scl,cdist_cb_full_scl,cdist_eu_full_scl,cdist_cb_scl.1,cdist_eu_scl.1,cdist_cb_full_scl.1,cdist_eu_full_scl.1
0,1,After how many hour from drinking an antibiotic can I drink alcohol?,I have a party tonight and I took my last dose of Azithromycin this morning. Can I have a few drinks?,1,many hour drink antibiotic drink alcohol,party tonight take last dose azithromycin morning drink,68,101,40,55,...,"[-0.03977625072002411, -0.07884813845157623, 0.30240631103515625, 0.28421327471733093, 0.05667344108223915, -0.05211157351732254, 0.4980890452861786, 0.2808838486671448, 0.09993268549442291, -0.3411419093608856, -0.5050927400588989, 0.0398346483707428, 0.09492535889148712, 0.22931823134422302, 0.2572515308856964, 0.24463991820812225, 0.29866480827331543, 0.01386759988963604, -0.19019213318824768, 0.14380453526973724, -0.2026463896036148, -0.21295495331287384, 0.19510138034820557, -0.06438546...",0.588961,0.527987,0.873014,0.649244,0.991801,0.527987,0.873014,0.649244,0.991801
1,1,After how many hour from drinking an antibiotic can I drink alcohol?,I vomited this morning and I am not sure if it is the side effect of my antibiotic or the alcohol I took last night...,0,many hour drink antibiotic drink alcohol,vomit morning sure side effect antibiotic alcohol take last night,68,118,40,65,...,"[0.030550871044397354, -0.2725229561328888, 0.28308358788490295, 0.2389177829027176, 0.547038733959198, -0.06358105689287186, 0.16300421953201294, 0.4613649249076843, 0.2518417239189148, -0.365054726600647, -0.4040278196334839, -0.04706273600459099, -0.23492838442325592, 0.41593730449676514, -0.12821915745735168, -0.30410000681877136, 0.23168830573558807, -0.30251455307006836, -0.22669664025306702, 0.17331808805465698, 0.06983417272567749, 0.3902554512023926, 0.06295904517173767, -0.25325363...",0.493185,0.479612,0.857097,0.619663,0.864537,0.479612,0.857097,0.619663,0.864537
2,1,Am I over weight (192.9) for my age (39)?,I am a 39 y/o male currently weighing about 193 lbs. Do you think I am overweight?,1,weight 192 9 age 39,39 male currently weigh 193 lbs think overweight,41,82,19,48,...,"[0.1520807147026062, 0.08130066841840744, 0.2691918909549713, 0.3550529479980469, 0.07510621100664139, -0.47831249237060547, -0.14506417512893677, 0.13626369833946228, -0.13239119946956635, -0.19853608310222626, -0.26709210872650146, -0.22806473076343536, -0.5486209392547607, -0.5228369235992432, -0.026387745514512062, -0.15675035119056702, 0.17605580389499664, -0.09945523738861084, -0.11129286140203476, 0.28609374165534973, -0.07254518568515778, 0.16800343990325928, -0.05013829469680786, -0...",0.719577,0.491525,0.870927,0.466811,0.847438,0.491525,0.870927,0.466811,0.847438
3,1,Am I over weight (192.9) for my age (39)?,What diet is good for losing weight? Keto or vegan?,0,weight 192 9 age 39,diet good losing weight keto vegan,41,51,19,34,...,"[-0.28753989934921265, 0.2143070101737976, 0.34415721893310547, 0.2424003779888153, 0.30227425694465637, -0.1321236491203308, 0.34400784969329834, 0.24250201880931854, -0.02895461767911911, -0.4454647898674011, -0.13960103690624237, -0.6859024167060852, -0.4456571936607361, -0.466056227684021, 0.011433128267526627, -0.44420307874679565, 0.47957274317741394, 0.7352493405342102, 0.23369291424751282, -0.2526901364326477, -0.18232698738574982, 0.28957968950271606, 0.6657700538635254, 0.674940407...",0.362271,0.475468,0.951217,0.454181,0.937496,0.475468,0.951217,0.454181,0.937496
4,1,Aspirin allergy - is it worth getting a bracelet?,How much Aspirin can I take for my headache without causing any side effects?,0,aspirin allergy - worth get bracelet,much aspirin take headache without cause side effect,49,77,36,52,...,"[-0.5224871635437012, -0.33884501457214355, -0.2440420389175415, 0.7325851917266846, 0.12117338180541992, 0.10323136299848557, -0.14183905720710754, 0.3972076177597046, -0.2653556764125824, -0.661288321018219, -0.004241823218762875, 0.6911184191703796, -0.01254990603774786, -0.1943574696779251, -0.11973315477371216, 0.09733694046735764, 0.35733434557914734, -0.36427488923072815, 0.008246737532317638, 0.0819404348731041, -0.241068035364151, -0.07919424772262573, 0.02077319845557213, 0.1260730...",0.562436,0.501869,0.878473,0.494638,0.889691,0.501869,0.878473,0.494638,0.889691



<class 'pandas.core.frame.DataFrame'>
Index: 3048 entries, 0 to 3047
Data columns (total 33 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   dr_id                 3048 non-null   int64  
 1   question_1            3048 non-null   object 
 2   question_2            3048 non-null   object 
 3   label                 3048 non-null   int64  
 4   question_1_processed  3048 non-null   object 
 5   question_2_processed  3048 non-null   object 
 6   len_question_1        3048 non-null   int64  
 7   len_question_2        3048 non-null   int64  
 8   text_len_question_1   3048 non-null   int64  
 9   text_len_question_2   3048 non-null   int64  
 10  rus_question_1        3048 non-null   object 
 11  rus_question_2        3048 non-null   object 
 12  rusgl_question_1      3048 non-null   object 
 13  rusgl_question_2      3048 non-null   object 
 14  tf_idf_q1             3048 non-null   object 
 15  tf_idf_q2             304

Unnamed: 0,Not_Valid_Values_in %
dr_id,0.0
cdist_eu,0.0
cdist_cb_full_scl.1,0.0
cdist_eu_scl.1,0.0
cdist_cb_scl.1,0.0
cdist_eu_full_scl,0.0
cdist_cb_full_scl,0.0
cdist_eu_scl,0.0
cdist_cb_scl,0.0
cos_sim_mLM,0.0



Number of duplicated rows in the dataset: 0

Number of unique values in the dataset:
dr_id                     11
question_1              1524
question_2              3043
label                      2
question_1_processed    1524
question_2_processed    3041
len_question_1           165
len_question_2           298
text_len_question_1      136
text_len_question_2      195
rus_question_1          1524
rus_question_2          3042
rusgl_question_1        1522
rusgl_question_2        3036
tf_idf_q1               1524
tf_idf_q2               3041
cdist_cb                3038
cdist_eu                2880
tf_idf_q1_full          1524
tf_idf_q2_full          3043
cdist_cb_full           3048
cdist_eu_full           2966
question_1_mLM          1532
question_2_mLM          3043
cos_sim_mLM             3048
cdist_cb_scl            3038
cdist_eu_scl            2880
cdist_cb_full_scl       3048
cdist_eu_full_scl       2965
cdist_cb_scl.1          3038
cdist_eu_scl.1          2880
cdist_cb_full_s

In [11]:
train_df[['question_1_processed','question_1']].head()

Unnamed: 0,question_1_processed,question_1
0,many hour drink antibiotic drink alcohol,After how many hour from drinking an antibiotic can I drink alcohol?
1,many hour drink antibiotic drink alcohol,After how many hour from drinking an antibiotic can I drink alcohol?
2,weight 192 9 age 39,Am I over weight (192.9) for my age (39)?
3,weight 192 9 age 39,Am I over weight (192.9) for my age (39)?
4,aspirin allergy - worth get bracelet,Aspirin allergy - is it worth getting a bracelet?


In [12]:
train_df[['question_2_processed','question_2']].head()

Unnamed: 0,question_2_processed,question_2
0,party tonight take last dose azithromycin morning drink,I have a party tonight and I took my last dose of Azithromycin this morning. Can I have a few drinks?
1,vomit morning sure side effect antibiotic alcohol take last night,I vomited this morning and I am not sure if it is the side effect of my antibiotic or the alcohol I took last night...
2,39 male currently weigh 193 lbs think overweight,I am a 39 y/o male currently weighing about 193 lbs. Do you think I am overweight?
3,diet good losing weight keto vegan,What diet is good for losing weight? Keto or vegan?
4,much aspirin take headache without cause side effect,How much Aspirin can I take for my headache without causing any side effects?


In [13]:
print(len(train_df[train_df['len_question_1'] <= 0]))
print(len(train_df[train_df['len_question_2'] <= 0]))
print(len(train_df[train_df['text_len_question_1'] <= 0]))
print(len(train_df[train_df['text_len_question_2'] <= 0]))

0
0
0
0


In [14]:
train_df.describe()

Unnamed: 0,dr_id,label,len_question_1,len_question_2,text_len_question_1,text_len_question_2,cdist_cb,cdist_eu,cdist_cb_full,cdist_eu_full,cos_sim_mLM,cdist_cb_scl,cdist_eu_scl,cdist_cb_full_scl,cdist_eu_full_scl,cdist_cb_scl.1,cdist_eu_scl.1,cdist_cb_full_scl.1,cdist_eu_full_scl.1
count,3048.0,3048.0,3048.0,3048.0,3048.0,3048.0,3048.0,3048.0,3048.0,3048.0,3048.0,3048.0,3048.0,3048.0,3048.0,3048.0,3048.0,3048.0,3048.0
mean,6.037402,0.5,99.858924,110.334646,60.827428,65.508202,3.633939,1.056634,5.062471,1.117168,0.668521,0.459585,0.747153,0.503752,0.76526,0.459585,0.747153,0.503752,0.76526
std,3.084721,0.500082,45.64073,63.029785,29.218568,37.832399,1.411628,0.23379,1.605644,0.192646,0.164099,0.178529,0.165315,0.165559,0.152238,0.178529,0.165315,0.165559,0.152238
min,1.0,0.0,20.0,15.0,8.0,3.0,0.0,0.0,0.17693,0.14879,0.000157,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,3.0,0.0,58.0,62.0,36.0,37.0,2.625003,0.910502,3.925967,1.002222,0.561743,0.331985,0.643822,0.386566,0.674424,0.331985,0.643822,0.386566,0.674424
50%,6.0,0.5,94.0,95.0,57.0,56.0,3.604927,1.089296,5.069853,1.144142,0.686542,0.455916,0.770249,0.504514,0.786576,0.455916,0.770249,0.504514,0.786576
75%,9.0,1.0,144.0,146.0,83.25,86.0,4.659162,1.233493,6.250337,1.258669,0.79407,0.589245,0.872212,0.626234,0.877081,0.589245,0.872212,0.626234,0.877081
max,11.0,1.0,255.0,404.0,191.0,257.0,7.906999,1.414214,9.875228,1.414214,0.983748,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [15]:
column_to_translate = ['question_1','question_2']
translated_columns = ['rus_question_1','rus_question_2']
train_df[translated_columns+column_to_translate+['label']].head()

Unnamed: 0,rus_question_1,rus_question_2,question_1,question_2,label
0,Через сколько часов я смогу выпить антибиотик?,"У меня сегодня вечеринка, и я принял последнюю дозу азитромицина этим утром. Можно мне выпить?",After how many hour from drinking an antibiotic can I drink alcohol?,I have a party tonight and I took my last dose of Azithromycin this morning. Can I have a few drinks?,1
1,Через сколько часов я смогу выпить антибиотик?,"Утром меня вырвало, и я не уверен, что это побочный эффект моего антибиотика или алкоголя, который я выпил вчера вечером...",After how many hour from drinking an antibiotic can I drink alcohol?,I vomited this morning and I am not sure if it is the side effect of my antibiotic or the alcohol I took last night...,0
2,"Я уже больше веса (192,9) для моего возраста (39)?","У меня 39 й/о самца в настоящее время весит около 193 фунтов. Думаешь, у меня лишний вес?",Am I over weight (192.9) for my age (39)?,I am a 39 y/o male currently weighing about 193 lbs. Do you think I am overweight?,1
3,"Я уже больше веса (192,9) для моего возраста (39)?",Какая диета может похудеть? Кето или веган?,Am I over weight (192.9) for my age (39)?,What diet is good for losing weight? Keto or vegan?,0
4,Аллергия на аспирин - стоит ли заполучить браслет?,"Сколько аспирин я могу выдержать от головной боли, не вызывая никаких побочных эффектов?",Aspirin allergy - is it worth getting a bracelet?,How much Aspirin can I take for my headache without causing any side effects?,0


In [16]:
corpus = pd.concat([train_df['question_1_processed'],train_df['question_2_processed']]).drop_duplicates(keep='first')
corpus.head(), corpus.shape

(0                                                                            many hour drink antibiotic drink alcohol
 2                                                                                                 weight 192 9 age 39
 4                                                                                aspirin allergy - worth get bracelet
 6                          doctor visit hit head box wall contain hazardous material use syrinx use needle get infect
 8    antibiotic 4 top high tooth dentist cld get needle 2 freeze 2 extract gum really hurt say tissue hve 2 go bk plz
 dtype: object,
 (4562,))

In [17]:
corpus_full = pd.concat([train_df['question_1'],train_df['question_2']]).drop_duplicates(keep='first')
corpus_full.head(), corpus_full.shape

(0                                                                                    After how many hour from drinking an antibiotic can I drink alcohol?
 2                                                                                                               Am I over weight (192.9) for my age (39)?
 4                                                                                                       Aspirin allergy - is it worth getting a bracelet?
 6      At a doctor's visit, I hit my head against a box on the wall containing hazardous materials (used syringes, used needles...). Will I get infected?
 8    Been on antibiotics 4 5wks top high tooth dentist cld not get needle 2 freeze 2 extract in gum really hurt she said its the tissues hve 2 go bk? Plz
 dtype: object,
 (4567,))

In [18]:
tf_idf_vectorizer = TfidfVectorizer()

In [19]:
tf_idf_corpus = tf_idf_vectorizer.fit_transform(corpus)
tf_idf_q1 = tf_idf_vectorizer.transform(train_df['question_1_processed'])
tf_idf_q2 = tf_idf_vectorizer.transform(train_df['question_2_processed'])
tf_idf_q1_array = tf_idf_q1.toarray()
tf_idf_q2_array = tf_idf_q2.toarray()
tf_idf_corpus_array = tf_idf_corpus.toarray()

In [20]:
tf_idf_corpus_full = tf_idf_vectorizer.fit_transform(corpus_full)
tf_idf_q1_full = tf_idf_vectorizer.transform(train_df['question_1'])
tf_idf_q2_full = tf_idf_vectorizer.transform(train_df['question_2'])
tf_idf_q1_full_array = tf_idf_q1_full.toarray()
tf_idf_q2_full_array = tf_idf_q2_full.toarray()
tf_idf_corpus_full_array = tf_idf_corpus_full.toarray()

In [21]:
#tf_idf_vectorizer.vocabulary_

In [22]:
tf_idf_corpus_array.shape

(4562, 5004)

In [23]:
tf_idf_q1_ = pd.Series(tf_idf_q1_array.tolist())
tf_idf_q2_ = pd.Series(tf_idf_q2_array.tolist())
train_df['tf_idf_q1'] = tf_idf_q1_
train_df['tf_idf_q2'] = tf_idf_q2_
train_df['cdist_cb'] = [cdist([row['tf_idf_q1']], [row['tf_idf_q2']],'cityblock')[0][0] for _, row in train_df.iterrows()]
train_df['cdist_eu'] = [cdist([row['tf_idf_q1']], [row['tf_idf_q2']],'euclidean')[0][0] for _, row in train_df.iterrows()]

tf_idf_q1_full_ = pd.Series(tf_idf_q1_full_array.tolist())
tf_idf_q2_full_ = pd.Series(tf_idf_q2_full_array.tolist())
train_df['tf_idf_q1_full'] = tf_idf_q1_full_
train_df['tf_idf_q2_full'] = tf_idf_q2_full_
train_df['cdist_cb_full'] = [cdist([row['tf_idf_q1_full']], [row['tf_idf_q2_full']],'cityblock')[0][0] for _, row in train_df.iterrows()]
train_df['cdist_eu_full'] = [cdist([row['tf_idf_q1_full']], [row['tf_idf_q2_full']],'euclidean')[0][0] for _, row in train_df.iterrows()]

In [24]:
train_df[['question_1','question_2','label','cdist_cb','cdist_eu','cdist_cb_full','cdist_eu_full']].head()#.query('question_1 == @s')

Unnamed: 0,question_1,question_2,label,cdist_cb,cdist_eu,cdist_cb_full,cdist_eu_full
0,After how many hour from drinking an antibiotic can I drink alcohol?,I have a party tonight and I took my last dose of Azithromycin this morning. Can I have a few drinks?,1,4.174792,1.234628,6.473491,1.403838
1,After how many hour from drinking an antibiotic can I drink alcohol?,I vomited this morning and I am not sure if it is the side effect of my antibiotic or the alcohol I took last night...,0,3.79229,1.212119,6.186609,1.242795
2,Am I over weight (192.9) for my age (39)?,I am a 39 y/o male currently weighing about 193 lbs. Do you think I am overweight?,1,3.886488,1.231677,4.704204,1.221158
3,Am I over weight (192.9) for my age (39)?,What diet is good for losing weight? Keto or vegan?,0,3.759526,1.345225,4.581715,1.335119
4,Aspirin allergy - is it worth getting a bracelet?,How much Aspirin can I take for my headache without causing any side effects?,0,3.968279,1.242348,4.974074,1.274626


In [25]:
model = SentenceTransformer("paraphrase-MiniLM-L6-v2")

embeddings1 = model.encode(train_df['question_1'].tolist())
embeddings2 = model.encode(train_df['question_2'].tolist())
train_df['question_1_mLM'] = embeddings1.tolist()
train_df['question_2_mLM'] = embeddings2.tolist()
train_df['cos_sim_mLM'] = train_df.apply(lambda x: calculate_cosine_similarity(x['question_1'], x['question_2']), axis=1)

In [29]:
train_df[['question_1','question_2','label','cdist_cb','cdist_eu','cos_sim_mLM']].query('label == 1').sort_values(by='cos_sim_mLM', ascending=False).tail(5)

Unnamed: 0,question_1,question_2,label,cdist_cb,cdist_eu,cos_sim_mLM
626,How can I deal with anxeity without medication?,I have been dealing with bad anxiety for the past few weeks and I do not want to take any medicine. Are there any remedies or simple things that I can try to feel better?,1,4.998515,1.307715,0.314939
151,Bright red blood on TP after first wipe after BM; then other wipes no more blood. I drink way to much pop and very little water...if that matters. ?,What does blood in stool mean?,1,4.674102,1.300256,0.272176
1284,"My bm aren't solid but not quite loose. Looks more like for lack of better word ""shredded"" the why is this?",What causes stringy and shredded stools?,1,4.991187,1.414214,0.269483
47,How long does it take for herpes to break out?,I had unprotected sex on the 7th of this month. Is there a specific time that the first outbreak occurs?,1,4.677719,1.414214,0.231232
1483,If your number two is hard and dark what does that mean?,What can cause a hard and dark stool?,1,2.325646,0.978183,0.170247


In [30]:
train_df[['question_1','question_2','label','cdist_cb','cdist_eu','cos_sim_mLM']].query('label == 1').sort_values(by='cos_sim_mLM', ascending=False).head(5)

Unnamed: 0,question_1,question_2,label,cdist_cb,cdist_eu,cos_sim_mLM
2430,What are some really good ways to stimulate adrenaline?,Are there any good ways to stimulate adrenaline?,1,0.542992,0.396806,0.983748
2762,Are monochorionic-diamniotic twins usually healthy at birth?,Are monochorionic-diamniotic twins healthy at the time of birth?,1,0.667612,0.43532,0.980839
2154,What are the common symptoms of bruises?,Describe some common symptoms of bruises.,1,1.054575,0.689815,0.974392
1957,Can seroquel (quetiapine) cause gerd?,Is seroquel (quetiapine) known to cause GERD?,1,0.347138,0.276526,0.97375
1154,How are batteries chosen to be put in pacemakers?,How are the batteries selected that are put into pacemakers?,1,1.120431,0.760559,0.972377


In [31]:
train_df.describe()

Unnamed: 0,dr_id,label,len_question_1,len_question_2,text_len_question_1,text_len_question_2,cdist_cb,cdist_eu,cdist_cb_full,cdist_eu_full,cos_sim_mLM,cdist_cb_scl,cdist_eu_scl,cdist_cb_full_scl,cdist_eu_full_scl,cdist_cb_scl.1,cdist_eu_scl.1,cdist_cb_full_scl.1,cdist_eu_full_scl.1
count,3048.0,3048.0,3048.0,3048.0,3048.0,3048.0,3048.0,3048.0,3048.0,3048.0,3048.0,3048.0,3048.0,3048.0,3048.0,3048.0,3048.0,3048.0,3048.0
mean,6.037402,0.5,99.858924,110.334646,60.827428,65.508202,3.633939,1.056634,5.062471,1.117168,0.668521,0.459585,0.747153,0.503752,0.76526,0.459585,0.747153,0.503752,0.76526
std,3.084721,0.500082,45.64073,63.029785,29.218568,37.832399,1.411628,0.23379,1.605644,0.192646,0.164099,0.178529,0.165315,0.165559,0.152238,0.178529,0.165315,0.165559,0.152238
min,1.0,0.0,20.0,15.0,8.0,3.0,0.0,0.0,0.17693,0.14879,0.000157,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,3.0,0.0,58.0,62.0,36.0,37.0,2.625003,0.910502,3.925967,1.002222,0.561743,0.331985,0.643822,0.386566,0.674424,0.331985,0.643822,0.386566,0.674424
50%,6.0,0.5,94.0,95.0,57.0,56.0,3.604927,1.089296,5.069853,1.144142,0.686542,0.455916,0.770249,0.504514,0.786576,0.455916,0.770249,0.504514,0.786576
75%,9.0,1.0,144.0,146.0,83.25,86.0,4.659162,1.233493,6.250337,1.258669,0.79407,0.589245,0.872212,0.626234,0.877081,0.589245,0.872212,0.626234,0.877081
max,11.0,1.0,255.0,404.0,191.0,257.0,7.906999,1.414214,9.875228,1.414214,0.983748,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [32]:
scaler_cols = ['cdist_cb','cdist_eu','cdist_cb_full','cdist_eu_full']
scaled_cols = ['cdist_cb_scl','cdist_eu_scl','cdist_cb_full_scl','cdist_eu_full_scl']

scaler = MinMaxScaler()

scaled_data = pd.DataFrame(scaler.fit_transform(train_df[scaler_cols]), columns=scaled_cols)
scaled_data.describe()

Unnamed: 0,cdist_cb_scl,cdist_eu_scl,cdist_cb_full_scl,cdist_eu_full_scl
count,3048.0,3048.0,3048.0,3048.0
mean,0.459585,0.747153,0.503752,0.76526
std,0.178529,0.165315,0.165559,0.152238
min,0.0,0.0,0.0,0.0
25%,0.331985,0.643822,0.386566,0.674424
50%,0.455916,0.770249,0.504514,0.786576
75%,0.589245,0.872212,0.626234,0.877081
max,1.0,1.0,1.0,1.0


In [33]:
train_df = pd.concat([train_df,scaled_data],axis=1)
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3048 entries, 0 to 3047
Data columns (total 37 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   dr_id                 3048 non-null   int64  
 1   question_1            3048 non-null   object 
 2   question_2            3048 non-null   object 
 3   label                 3048 non-null   int64  
 4   question_1_processed  3048 non-null   object 
 5   question_2_processed  3048 non-null   object 
 6   len_question_1        3048 non-null   int64  
 7   len_question_2        3048 non-null   int64  
 8   text_len_question_1   3048 non-null   int64  
 9   text_len_question_2   3048 non-null   int64  
 10  rus_question_1        3048 non-null   object 
 11  rus_question_2        3048 non-null   object 
 12  rusgl_question_1      3048 non-null   object 
 13  rusgl_question_2      3048 non-null   object 
 14  tf_idf_q1             3048 non-null   object 
 15  tf_idf_q2             3048

In [34]:
train_df.query('label == 0').describe()

Unnamed: 0,dr_id,label,len_question_1,len_question_2,text_len_question_1,text_len_question_2,cdist_cb,cdist_eu,cdist_cb_full,cdist_eu_full,...,cdist_cb_full_scl,cdist_eu_full_scl,cdist_cb_scl.1,cdist_eu_scl.1,cdist_cb_full_scl.1,cdist_eu_full_scl.1,cdist_cb_scl,cdist_eu_scl,cdist_cb_full_scl.2,cdist_eu_full_scl.2
count,1524.0,1524.0,1524.0,1524.0,1524.0,1524.0,1524.0,1524.0,1524.0,1524.0,...,1524.0,1524.0,1524.0,1524.0,1524.0,1524.0,1524.0,1524.0,1524.0,1524.0
mean,6.037402,0.0,99.858924,102.805774,60.827428,61.537402,4.086813,1.147904,5.569258,1.199737,...,0.556008,0.83051,0.51686,0.811691,0.556008,0.83051,0.51686,0.811691,0.556008,0.83051
std,3.085227,0.0,45.648221,63.587674,29.223364,38.311359,1.345463,0.192019,1.49717,0.151092,...,0.154375,0.1194,0.170161,0.135778,0.154375,0.1194,0.170161,0.135778,0.154375,0.1194
min,1.0,0.0,20.0,15.0,8.0,3.0,0.0,0.0,1.183838,0.46568,...,0.103823,0.250422,0.0,0.0,0.103823,0.250422,0.0,0.0,0.103823,0.250422
25%,3.0,0.0,58.0,54.0,36.0,33.0,3.126852,1.032858,4.51147,1.116908,...,0.446938,0.765055,0.395454,0.730341,0.446938,0.765055,0.395454,0.730341,0.446938,0.765055
50%,6.0,0.0,94.0,84.0,57.0,50.0,4.106692,1.183101,5.637315,1.223739,...,0.563025,0.849477,0.519374,0.836579,0.563025,0.849477,0.519374,0.836579,0.563025,0.849477
75%,9.0,0.0,144.0,139.0,83.25,81.25,5.042316,1.291673,6.659042,1.308157,...,0.668376,0.916189,0.637703,0.91335,0.668376,0.916189,0.637703,0.91335,0.668376,0.916189
max,11.0,0.0,255.0,370.0,191.0,254.0,7.906999,1.414214,9.875228,1.414214,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [35]:
train_df.query('label == 1').describe()

Unnamed: 0,dr_id,label,len_question_1,len_question_2,text_len_question_1,text_len_question_2,cdist_cb,cdist_eu,cdist_cb_full,cdist_eu_full,...,cdist_cb_full_scl,cdist_eu_full_scl,cdist_cb_scl.1,cdist_eu_scl.1,cdist_cb_full_scl.1,cdist_eu_full_scl.1,cdist_cb_scl,cdist_eu_scl,cdist_cb_full_scl.2,cdist_eu_full_scl.2
count,1524.0,1524.0,1524.0,1524.0,1524.0,1524.0,1524.0,1524.0,1524.0,1524.0,...,1524.0,1524.0,1524.0,1524.0,1524.0,1524.0,1524.0,1524.0,1524.0,1524.0
mean,6.037402,1.0,99.858924,117.863517,60.827428,69.479003,3.181065,0.965363,4.555684,1.034599,...,0.451497,0.70001,0.40231,0.682615,0.451497,0.70001,0.40231,0.682615,0.451497,0.70001
std,3.085227,0.0,45.648221,61.573382,29.223364,36.935147,1.328896,0.236239,1.549952,0.194361,...,0.159817,0.153593,0.168066,0.167046,0.159817,0.153593,0.168066,0.167046,0.159817,0.153593
min,1.0,1.0,20.0,20.0,8.0,11.0,0.0,0.0,0.17693,0.14879,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,3.0,1.0,58.0,70.0,36.0,42.0,2.237006,0.820551,3.452167,0.921481,...,0.337713,0.610619,0.282915,0.580217,0.337713,0.610619,0.282915,0.580217,0.337713,0.610619
50%,6.0,1.0,94.0,105.0,57.0,62.0,3.119905,0.994338,4.521077,1.053182,...,0.447929,0.714695,0.394575,0.703103,0.447929,0.714695,0.394575,0.703103,0.447929,0.714695
75%,9.0,1.0,144.0,153.0,83.25,89.0,4.089109,1.135938,5.7054,1.17396,...,0.570045,0.81014,0.517151,0.803229,0.570045,0.81014,0.517151,0.803229,0.570045,0.81014
max,11.0,1.0,255.0,404.0,191.0,257.0,7.079948,1.414214,9.481837,1.414214,...,0.959437,1.0,0.895403,1.0,0.959437,1.0,0.895403,1.0,0.959437,1.0


In [36]:
def accuracy_topN(question_1, question_2, def_predict, metric_=None, N=5):

    corr_pred = 0
    all_pred = 0
    
    question_1_array = np.vstack(question_1)
    question_2_array = np.vstack(question_2)

    if def_predict=='cdist_':
        predict_1 = np.array([cdist(question_2_array, np.expand_dims(q1, axis=0), metric=metric_).reshape(-1) for q1 in question_1_array])
        predict_2 = np.array([cdist(question_1_array, np.expand_dims(q2, axis=0), metric=metric_).reshape(-1) for q2 in question_2_array])
    elif def_predict=='cos_sim_':
        predict_1 = np.array([util.cos_sim(question_2_array, q1).reshape(-1) for q1 in question_1_array])
        predict_2 = np.array([util.cos_sim(question_1_array, q2).reshape(-1) for q2 in question_2_array])
        
    for i in range(len(question_1_array)):
        values_top = np.argsort(predict_1[i])[::-1][:N]  
        if i in values_top:
            corr_pred += 1
        all_pred += 1
    
    for i in range(len(question_2_array)):
        values_top = np.argsort(predict_2[i])[::-1][:N]
        if i in values_top:
            corr_pred += 1
        all_pred += 1
    
    accuracy_top = corr_pred / all_pred
    return accuracy_top


In [37]:
accuracy_top5_cos_sim = accuracy_topN(train_df['question_1_mLM'], train_df['question_2_mLM'], def_predict='cos_sim_', N=5)
accuracy_top10_cos_sim = accuracy_topN(train_df['question_1_mLM'], train_df['question_2_mLM'], def_predict='cos_sim_', N=10)
accuracy_top5_cos_sim, accuracy_top10_cos_sim

(0.7791994750656168, 0.8315288713910761)

In [38]:
accuracy_top5_cos_sim_tf = accuracy_topN(train_df['tf_idf_q1'], train_df['tf_idf_q2'], def_predict='cos_sim_', N=5)
accuracy_top10_cos_sim_tf = accuracy_topN(train_df['tf_idf_q1'], train_df['tf_idf_q2'], def_predict='cos_sim_', N=10)
accuracy_top5_cos_sim_tf, accuracy_top10_cos_sim_tf

(0.7624671916010499, 0.8157808398950132)

In [35]:
accuracy_top5_cdist_cb = accuracy_topN(train_df['question_1_mLM'], train_df['question_2_mLM'], def_predict='cdist_', metric_='cityblock', N=5)
accuracy_top10_cdist_cb = accuracy_topN(train_df['question_1_mLM'], train_df['question_2_mLM'], def_predict='cdist_', metric_='cityblock', N=10)
accuracy_top5_cdist_cb, accuracy_top10_cdist_cb

(0.0, 0.0)

In [36]:
accuracy_top5_cdist_eu = accuracy_topN(train_df['question_1_mLM'], train_df['question_2_mLM'], def_predict='cdist_', metric_='euclidean', N=5)
accuracy_top10_cdist_eu = accuracy_topN(train_df['question_1_mLM'], train_df['question_2_mLM'], def_predict='cdist_', metric_='euclidean', N=10)
accuracy_top5_cdist_eu, accuracy_top10_cdist_eu

(0.0, 0.0)

In [37]:
# accuracy_top5_cdist_cb_tf = accuracy_topN(train_df['tf_idf_q1'], train_df['tf_idf_q2'], def_predict='cdist_', metric_='cityblock', N=5)
# accuracy_top10_cdist_cb_tf = accuracy_topN(train_df['tf_idf_q1'], train_df['tf_idf_q2'], def_predict='cdist_', metric_='cityblock', N=10)
# accuracy_top5_cdist_cb_tf, accuracy_top10_cdist_cb_tf

# accuracy_top5_cdist_eu_tf = accuracy_topN(train_df['tf_idf_q1'], train_df['tf_idf_q2'], def_predict='cdist_', metric_='euclidean', N=5)
# accuracy_top10_cdist_eu_tf = accuracy_topN(train_df['tf_idf_q1'], train_df['tf_idf_q2'], def_predict='cdist_', metric_='euclidean', N=10)
# accuracy_top5_cdist_eu_tf, accuracy_top10_cdist_eu_tf

# accuracy_top5_cos_sim_tf_full = accuracy_topN(train_df['tf_idf_q1_full'], train_df['tf_idf_q2_full'], def_predict='cos_sim_', N=5)
# accuracy_top10_cos_sim_tf_full = accuracy_topN(train_df['tf_idf_q1_full'], train_df['tf_idf_q2_full'], def_predict='cos_sim_', N=10)
# accuracy_top5_cos_sim_tf_full, accuracy_top10_cos_sim_tf_full

# accuracy_top5_cdist_cb_tf_full = accuracy_topN(train_df['tf_idf_q1_full'], train_df['tf_idf_q2_full'], def_predict='cdist_', metric_='cityblock', N=5)
# accuracy_top10_cdist_cb_tf_full = accuracy_topN(train_df['tf_idf_q1_full'], train_df['tf_idf_q2_full'], def_predict='cdist_', metric_='cityblock', N=10)
# accuracy_top5_cdist_cb_tf_full, accuracy_top10_cdist_cb_tf_full

# accuracy_top5_cdist_eu_tf_full = accuracy_topN(train_df['tf_idf_q1_full'], train_df['tf_idf_q2_full'], def_predict='cdist_', metric_='euclidean', N=5)
# accuracy_top10_cdist_eu_tf_full = accuracy_topN(train_df['tf_idf_q1_full'], train_df['tf_idf_q2_full'], def_predict='cdist_', metric_='euclidean', N=10)
# accuracy_top5_cdist_eu_tf_full, accuracy_top10_cdist_eu_tf_full

In [44]:
questions = pd.concat([train_df['question_1'],train_df['question_2']],axis=0).drop_duplicates(keep='first')
questions_embeddings = model.encode(questions.tolist())

In [49]:
from deep_translator import GoogleTranslator
translator = GoogleTranslator(source='en', target='ru')
tq = translator.translate(q)
tq

'Сколько минут мне нужно тренироваться, чтобы похудеть?'

In [53]:
q = questions.sample().values[0]

inputs = tokenizer.encode(q, return_tensors="pt")
outputs = model_translate.generate(inputs, max_length=1000, num_beams=5, early_stopping=False)
translated_question = tokenizer.decode(outputs[0], skip_special_tokens=True)
translated_question_go = translator.translate(q)
question_embedding = model.encode(q)
predict = np.array([util.cos_sim(questions_embeddings, question_embedding)])

N = 11
values_top = np.argsort(predict.reshape(-1))[::-1][:N+1]

df_ = pd.DataFrame(columns=['questions', 'translated_questions','translated_questions_go','cos_sim']) 
for val in values_top:
    if questions.tolist()[val] == q:
        pass        
    else: 
        q_ = questions.tolist()[val]
        cos_sim_ = predict.reshape(-1)[val].round(1)
        translated_q_ = translate_text(q_)
        translated_q_go = translator.translate(q_)
        df_.loc[val] = [q_,translated_q_,translated_q_go,cos_sim_]

print(q, translated_question, translated_question_go)
display(df_[['questions', 'translated_questions','translated_questions_go', 'cos_sim']])

Would the type of bra I wear be causing my nipples to get more prominent? Может ли тот лифчик, который я ношу, заставит мои соски стать более заметными? Будет ли тип бюстгальтера, который я ношу, причиной того, что мои соски станут более выступающими?


Unnamed: 0,questions,translated_questions,translated_questions_go,cos_sim
3021,Is it normal for my nipples to get more prominent if I am feeling cold?,"Это нормально, что мои соски становятся более заметными, если я чувствую себя холодным?","Нормально ли, что мои соски становятся более заметными, если мне холодно?",0.8
2050,"My nipples are itching, could I be pregnant?","Мои соски чешутся, могу я быть беременна?","Соски чешутся, могу ли я быть беременна?",0.7
128,I am 44 and my breast and nipples are tender what could it be?,"Мне 44 года, а моя грудь и соски нежны, что это может быть?","Мне 44 года, и моя грудь и соски болезненные, что это может быть?",0.7
751,I have big boobs and my nipples are not out. My nipples comes out when i'm cold or when i touch them. Is that normal for my nipples to pop out?,"У меня большие сиськи и мои соски не вырвались. Мои соски выходят, когда мне холодно или когда я их трогаю. Это нормально для моих сосок?","У меня большая грудь и соски не выступающие. Мои соски выходят наружу, когда мне холодно или когда я к ним прикасаюсь. Это нормально, что мои соски выскакивают?",0.7
265,When after you concieve can your nipples start itching?,"Когда после того, как ты поймёшь, твои соски начнут чесаться?",Когда после зачатия могут начать чесаться соски?,0.6
2008,What are the dangers of getting breast implants for bigger breasts?,Какая опасность получить имплантаты груди для больших грудей?,Каковы опасности установки грудных имплантатов для увеличения груди?,0.6
1068,"Sore nipples remains during pregnancy, is this normal?","Во время беременности сосочки остаются, это нормально?","Боль в сосках остается при беременности, это нормально?",0.6
2418,Do inverted nipples pose a health problem?,Не вызывают ли перевернутые соски проблемы со здоровьем?,Опасны ли втянутые соски для здоровья?,0.6
3997,"I'm currenty pregnant and notice that my nipple has grown big, is more dark and has raised bumps around. Is this normal?","Я сейчас беременна и замечаю, что мой сосок вырос, стал темнее и поднял удары вокруг. Это нормально?","Я сейчас беременна и замечаю, что мой сосок увеличился, стал более темным и вокруг него появились шишки. Это нормально?",0.6
3641,How can I get rid of sore nipples as a side effect of BC?,"Как я могу избавиться от боли в сосках, как от побочного эффекта в Британской Колумбии?",Как избавиться от болезненности сосков как побочного эффекта РМЖ?,0.6


In [46]:
train_df.query('(question_1 == @q|question_2 == @q)')[['question_1','question_2','label','cos_sim_mLM']]

Unnamed: 0,question_1,question_2,label,cos_sim_mLM
1479,"If I only have 10 minutes to work out, what can I do?",How many minutes should I work out to lose weight?,0,0.693819


In [41]:
train_df.to_csv(f'../{DATA_DIR}/train_df_processed.csv')
train_df = pd.read_csv(f'../{DATA_DIR}/train_df_processed.csv', index_col=[0])

In [42]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3048 entries, 0 to 3047
Data columns (total 33 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   dr_id                 3048 non-null   int64  
 1   question_1            3048 non-null   object 
 2   question_2            3048 non-null   object 
 3   label                 3048 non-null   int64  
 4   question_1_processed  3048 non-null   object 
 5   question_2_processed  3048 non-null   object 
 6   len_question_1        3048 non-null   int64  
 7   len_question_2        3048 non-null   int64  
 8   text_len_question_1   3048 non-null   int64  
 9   text_len_question_2   3048 non-null   int64  
 10  rus_question_1        3048 non-null   object 
 11  rus_question_2        3048 non-null   object 
 12  rusgl_question_1      3048 non-null   object 
 13  rusgl_question_2      3048 non-null   object 
 14  tf_idf_q1             3048 non-null   object 
 15  tf_idf_q2             3048