In [None]:
import os
import random
import time

import openai
import pickle
import torch

import numpy as np
import pandas as pd
import torch.nn.functional as F
import regex as re
import torch.nn as nn
import torch.optim as optim

from openai.embeddings_utils import get_embedding
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GroupShuffleSplit
from tabulate import tabulate
from torch.utils.data import Dataset, DataLoader
from torch.utils.tensorboard import SummaryWriter
from typing import List


os.environ["CUDA_VISIBLE_DEVICES"] = "0"

%load_ext autoreload
%autoreload

In [None]:
# Question/answer pairs, one JSON record per line.
# An earlier iteration of this notebook read "wikipedia_question_similar_answer.tsv"
# into a frame with the columns "question" and "wikipedia_answer" instead.
QA_df = pd.read_json(
    "~/Downloads/gooaq_pairs.jsonl",
    lines=True,
)

# Preview the first rows.
QA_df.head()

In [None]:
# Build the text -> embedding cache for the question/answer pairs in positional
# rows 200..3999 of QA_df.
# NOTE(review): embedding_from_string is not defined in the visible cells;
# presumably it stores the embedding of `string` in `embedding_cache` keyed by
# the string itself - confirm against its definition.
embeddings = {}
for idx, row in QA_df.iloc[200:4000].iterrows():
    question = row[0]
    answer = row[1]
    embedding_from_string(string=question, embedding_cache=embeddings)
    embedding_from_string(string=answer, embedding_cache=embeddings)

# Convert every cached value to a tensor once, so later cells can use them directly.
embeddings = {key: torch.tensor(emb) for key, emb in embeddings.items()}

# Persist the cache. The context manager closes (and therefore flushes) the file
# even if pickling fails; the previous bare open() left the handle dangling.
embedding_cache_file = 'qa_pairs_embedding.pkl'
with open(embedding_cache_file, 'wb') as cache_handle:
    pickle.dump(embeddings, cache_handle)

In [None]:
# Column 0 is used as the grouping key, so rows sharing a column-0 value are
# never spread across both sides of the split.
groups = QA_df[0]

# One 90% / 10% shuffle split with a fixed seed.
splitter = GroupShuffleSplit(n_splits=1, train_size=.90, random_state=42)
train_indices, test_indices = next(splitter.split(QA_df, groups=groups))

# The splitter yields positional indices, hence .iloc.
train_df, test_df = QA_df.iloc[train_indices], QA_df.iloc[test_indices]

In [None]:
def _as_label_records(pairs_df):
    """Return one ``{"query", "doc", "label"}`` dict per row of ``pairs_df``."""
    records = []
    for _, pair in pairs_df.iterrows():
        records.append({"query": pair[0], "doc": pair[1], "label": pair['label']})
    return records


# NOTE(review): adding_negative_pairs is not defined in the visible cells.
# Both frames are rebound to its result, so re-running this cell feeds the
# already-augmented frames back in.
train_df = adding_negative_pairs(train_df)
test_df = adding_negative_pairs(test_df)

# Despite the *_dict names these are lists of per-row dicts.
train_label_dict = _as_label_records(train_df)
test_label_dict = _as_label_records(test_df)

In [2]:
# Uncommenting these two lines writes train_label_dict / test_label_dict to the
# pickle files that are loaded further down.
#pickle.dump(train_label_dict, open('train.pkl', 'wb'))
#pickle.dump(test_label_dict, open('test.pkl', 'wb'))

# Use the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
    print("using GPU")
else:
    device = torch.device("cpu")

# Load the cached embeddings and the train/test records.
# Each file is opened in a context manager so its handle is closed right after
# reading instead of being left open for the lifetime of the kernel.
# SECURITY: pickle.load can execute arbitrary code - only load files that were
# produced by this notebook or another trusted source.
with open('qa_pairs_embedding.pkl', 'rb') as embeddings_handle:
    embeddings = pickle.load(embeddings_handle)
with open('train.pkl', 'rb') as train_handle:
    train_list = pickle.load(train_handle)
with open('test.pkl', 'rb') as test_handle:
    test_list = pickle.load(test_handle)

In [3]:
from nlp_resources.finetune_openai_embeddings import dataset
from nlp_resources.finetune_openai_embeddings.dataset import RetrievalDataset

# Wrap the cached embeddings plus the train/test records in batched loaders.
# NOTE(review): neither loader passes shuffle=True, so batches follow the
# order of train_list / test_list - confirm that is intended for training.
train_dataloader = DataLoader(
    RetrievalDataset(embeddings, train_list),
    batch_size=32,
)
test_dataloader = DataLoader(
    RetrievalDataset(embeddings, test_list),
    batch_size=32,
)

In [4]:
from nlp_resources.finetune_openai_embeddings.models import SimilarityModel

# Model that is trained in the next cell.
# NOTE(review): embedding_dim presumably has to equal the width of the vectors
# stored in `embeddings` - confirm against the cache.
model = SimilarityModel(
    embedding_dim=1536,
    hidden_dim=768,
    out_dim=256,
)

In [6]:
from nlp_resources.finetune_openai_embeddings.train_utils import train

# Train for a single epoch on the loaders built above.
# NOTE(review): the saved output of this cell shows two completed progress
# bars followed by "ValueError: Input contains NaN, infinity or a value too
# large for dtype('float32')" - the run did not finish cleanly and needs
# investigating before the results are trusted.
train(
    train_dataloader,
    test_dataloader,
    device,
    model,
    num_epochs=1,
)


8422it [00:46, 180.37it/s]
8422it [00:13, 606.62it/s]
  if y.dtype.kind == "f" and np.any(y != y.astype(int)):


ValueError: Input contains NaN, infinity or a value too large for dtype('float32').

In [None]:
neutrino_wiki = """
A neutrino (/njuːˈtriːnoʊ/ new-TREE-noh; denoted by the Greek letter ν) is a fermion (an elementary particle with spin of 1/2) that interacts only via the weak interaction and gravity.[2][3] The neutrino is so named because it is electrically neutral and because its rest mass is so small (-ino) that it was long thought to be zero. The rest mass of the neutrino is much smaller than that of the other known elementary particles excluding massless particles.[1] The weak force has a very short range, the gravitational interaction is extremely weak due to the very small mass of the neutrino, and neutrinos do not participate in the strong interaction.[4] Thus, neutrinos typically pass through normal matter unimpeded and undetected.[

Weak interactions create neutrinos in one of three leptonic flavors: electron neutrinos (νe), muon neutrinos (νμ), or tau neutrinos (ντ), in association with the corresponding charged lepton.[5] Although neutrinos were long believed to be massless, it is now known that there are three discrete neutrino masses with different tiny values (the smallest of which may even be zero[6]), but the three masses do not uniquely correspond to the three flavors. A neutrino created with a specific flavor is a specific mixture of all three mass states (a quantum superposition). 

Similar to some other neutral particles, neutrinos oscillate between different flavors in flight as a consequence. For example, an electron neutrino produced in a beta decay reaction may interact in a distant detector as a muon or tau neutrino.[7][8] The three mass values are not yet known as of 2022, but laboratory experiments and cosmological observations have determined the differences of their squares,[9] an upper limit on their sum (< 2.14×10−37 kg),[1][10] and an upper limit on the mass of the electron neutrino.

For each neutrino, there also exists a corresponding antiparticle, called an antineutrino, which also has spin of 1/2 and no electric charge. Antineutrinos are distinguished from neutrinos by having opposite-signed lepton number and weak isospin, and right-handed instead of left-handed chirality. To conserve total lepton number (in nuclear beta decay), electron neutrinos only appear together with positrons (anti-electrons) or electron-antineutrinos, whereas electron antineutrinos only appear with electrons or electron neutrinos.[12][13]

Neutrinos are created by various radioactive decays; the following list is not exhaustive, but includes some of those processes: beta decay of atomic nuclei or hadrons,.natural nuclear reactions such as those that take place in the core of a star.artificial nuclear reactions in nuclear reactors, nuclear bombs, or particle acceleratorsduring a supernova. during the spin-down of a neutron star. when cosmic rays or accelerated particle beams strike atoms.

The majority of neutrinos which are detected about the Earth are from nuclear reactions inside the Sun. At the surface of the Earth, the flux is about 65 billion (6.5×1010) solar neutrinos, per second per square centimeter.[14][15] Neutrinos can be used for tomography of the interior of the earth.[16][17]

Machine learning (ML) is a field of inquiry devoted to understanding and building methods that "learn" – that is, methods that leverage data to improve performance on some set of tasks.[1] It is seen as a part of artificial intelligence.

Machine learning algorithms build a model based on sample data, known as training data, in order to make predictions or decisions without being explicitly programmed to do so.[2] Machine learning algorithms are used in a wide variety of applications, such as in medicine, email filtering, speech recognition, agriculture, and computer vision, where it is difficult or unfeasible to develop conventional algorithms to perform the needed tasks.[3][4]

A subset of machine learning is closely related to computational statistics, which focuses on making predictions using computers, but not all machine learning is statistical learning. The study of mathematical optimization delivers methods, theory and application domains to the field of machine learning. Data mining is a related field of study, focusing on exploratory data analysis through unsupervised learning.[6][7]

Some implementations of machine learning use data and neural networks in a way that mimics the working of a biological brain.[8][9]In its application across business problems, machine learning is also referred to as predictive analytics.
"""

In [None]:
# Questions used for the manual retrieval check further down.
# NOTE(review): check_answer looks both questions and every paragraph up in
# `embeddings`; no visible cell embeds them, so they must be added to the cache
# before that lookup can succeed.
test_question1 = "What are neutrinos?"
test_question2 = "How are neutrinos created?"

# One candidate passage per newline-separated paragraph of neutrino_wiki.
# Strip first and filter afterwards: filtering before stripping let chunks made
# up only of whitespace through as empty strings.
paragraphs = [
    chunk
    for chunk in (piece.strip() for piece in re.split(r' *[\n]+', neutrino_wiki))
    if chunk
]

In [None]:
# Number of candidate passages produced by the split of neutrino_wiki.
len(paragraphs)

In [None]:
# Number of entries currently held in the embedding cache.
len(embeddings)

In [None]:
# Write the current embedding cache back to disk.
# The context manager closes (and therefore flushes) the file even if pickling
# raises; the previous bare open() never closed the handle.
embedding_cache_file = 'qa_pairs_embedding.pkl'
with open(embedding_cache_file, 'wb') as cache_handle:
    pickle.dump(embeddings, cache_handle)

In [None]:
# NOTE(review): TwoTowerModelRecall is not imported in any visible cell; it has
# to be in scope before this cell can run.
test_model = TwoTowerModelRecall(embedding_dim=1536, hidden_dim=768, out_dim=256).to(device)

# Alternative checkpoint from an earlier run:
# state_dict = torch.load("/mnt/ruian/test/wikiQA/training-0.0001-20230305-060248/335-model.pt")
# map_location remaps the stored tensors onto the active device, so a
# checkpoint saved on a GPU can also be loaded on a CPU-only machine.
state_dict = torch.load(
    "/mnt/ruian/test/wikiQA/training-0.0001-20230305-062353/181-model.pt",
    map_location=device,
)

test_model.load_state_dict(state_dict)

In [None]:
def check_answer(test_question):
    """Score every entry of ``paragraphs`` against one question with ``test_model``.

    Reads the module-level ``embeddings`` (text -> tensor), ``paragraphs``,
    ``test_model`` and ``device``.

    Args:
        test_question: Question text; it must already be a key of ``embeddings``.

    Returns:
        pandas.DataFrame with the columns ``question``, ``paragraph`` and
        ``score`` (sigmoid of the model output), sorted by ``score`` in
        descending order. The frame is empty, with the same columns, when
        ``paragraphs`` is empty.

    Raises:
        KeyError: If the question or a paragraph has no entry in ``embeddings``.
    """
    test_model.eval()

    rows = []
    # Pure inference: do not record an autograd graph for the forward passes.
    with torch.no_grad():
        # The question tensor is the same for every paragraph, so build it and
        # move it to the device once instead of on every iteration.
        q_emb = embeddings[test_question].clone().detach().unsqueeze(0).to(device)

        for paragraph in paragraphs:
            p_emb = embeddings[paragraph].clone().detach().unsqueeze(0).to(device)
            output = torch.sigmoid(test_model(q_emb, p_emb))
            rows.append({"question": test_question, "paragraph": paragraph, "score": output.item()})

    # Naming the columns keeps "score" present for an empty result, so the sort
    # below cannot fail when there are no paragraphs.
    qa_output = pd.DataFrame(rows, columns=["question", "paragraph", "score"])
    return qa_output.sort_values(by="score", ascending=False)

In [None]:
# Rank the reference passages for the first test question.
check_answer(test_question1)
# Observation from the original run: the passage that actually answers the
# question was ranked second, not first.

In [None]:
# Rank the reference passages for the second test question.
check_answer(test_question2)

In [None]:
def inference_row(row, question_col="question", answer_col="wikipedia_answer"):
    """Score one question/answer row with the module-level ``model``.

    Reads the module-level ``embeddings`` (text -> embedding), ``model`` and
    ``device``. Designed to be used with ``DataFrame.apply(..., axis=1)``.

    Args:
        row: Mapping-like row (e.g. a pandas Series) holding the two texts.
        question_col: Key of the question text in ``row``.
        answer_col: Key of the answer text in ``row``.

    Returns:
        float: Sigmoid of the model output for the pair.

    Raises:
        KeyError: If a column is missing from ``row`` or a text has no entry
            in ``embeddings``.
    """
    model.eval()

    # Pure inference: do not record an autograd graph for the forward pass.
    with torch.no_grad():
        # as_tensor accepts both cached tensors and raw sequences, and avoids
        # the copy-construct warning that torch.tensor() raises when it is
        # handed an existing tensor.
        q_emb = torch.as_tensor(embeddings[row[question_col]]).unsqueeze(0).to(device)
        p_emb = torch.as_tensor(embeddings[row[answer_col]]).unsqueeze(0).to(device)
        output = torch.sigmoid(model(q_emb, p_emb))

    return output.item()

In [None]:
# QA_df['score'] = QA_df.apply(inference_row, axis=1)

# inference_row reads the "question" / "wikipedia_answer" columns, while the
# frames derived from QA_df are addressed through the integer columns 0 and 1
# elsewhere in this notebook. rename() ignores labels that are absent, so this
# mapping is a no-op for frames that already use the named columns.
_INFERENCE_COLUMNS = {0: "question", 1: "wikipedia_answer"}

# assign() returns a new frame instead of writing a column into a frame that
# may be a slice of QA_df.
train_df = train_df.assign(
    infer_score=train_df.rename(columns=_INFERENCE_COLUMNS).apply(inference_row, axis=1)
)
test_df = test_df.assign(
    infer_score=test_df.rename(columns=_INFERENCE_COLUMNS).apply(inference_row, axis=1)
)

In [None]:
# Display the training frame, including the infer_score column added above.
train_df

In [None]:
# Display the test frame, including the infer_score column added above.
test_df

In [None]:
def inference_row(data, question_col="question", answer_col="wikipedia_answer"):
    """Score one question/answer row with the module-level ``model``.

    Reads the module-level ``embeddings`` (text -> embedding), ``model`` and
    ``device``. Designed to be used with ``DataFrame.apply(..., axis=1)``.

    Args:
        data: Mapping-like row (e.g. a pandas Series) holding the two texts.
        question_col: Key of the question text in ``data``.
        answer_col: Key of the answer text in ``data``.

    Returns:
        float: Sigmoid of the model output for the pair.

    Raises:
        KeyError: If a column is missing from ``data`` or a text has no entry
            in ``embeddings``.
    """
    model.eval()

    # Pure inference: do not record an autograd graph for the forward pass.
    with torch.no_grad():
        # Read from the `data` argument. The body previously referenced `row`,
        # which is not a parameter here: it either raised NameError or silently
        # scored whatever global `row` an earlier loop had left behind.
        # as_tensor accepts both cached tensors and raw sequences without the
        # copy-construct warning of torch.tensor().
        q_emb = torch.as_tensor(embeddings[data[question_col]]).unsqueeze(0).to(device)
        p_emb = torch.as_tensor(embeddings[data[answer_col]]).unsqueeze(0).to(device)
        output = torch.sigmoid(model(q_emb, p_emb))

    return output.item()