In [None]:
# Silence every Python warning so library deprecation noise does not clutter cell output.
# NOTE(review): this also hides warnings that may matter; consider narrowing the filter.
import warnings
warnings.filterwarnings("ignore")

In [None]:
from UDCUtils import UDCUtils

# Project helper object; later cells call it for the Pinecone API key and the index name.
utils = UDCUtils()

In [None]:
from pinecone import Pinecone, ServerlessSpec

In [None]:
# Fetch the Pinecone API key through the project helper.
pinecone_api_key = utils.get_pinecone_api_key()
# Never echo the key itself: cell output is saved with the notebook and would leak
# the secret to anyone the notebook is shared with. Only report whether it was found.
print("Pinecone API key loaded" if pinecone_api_key else "Pinecone API key is missing")

In [None]:
from sentence_transformers import SentenceTransformer
from torch.utils.data import DataLoader
from torch import nn
from tqdm import tqdm

In [None]:
# Build the index name from the "idx-log1" base via the project helper and show it.
# NOTE(review): presumably the helper adds a user/session-specific part — confirm in UDCUtils.
index_name = utils.create_dlai_index_name("idx-log1")
print(index_name)

In [None]:
# Pinecone client; the following cells use it to create and open the index.
pinecone = Pinecone(api_key=pinecone_api_key)

In [None]:
# Make this cell safe to re-run: create_index raises if an index with this name
# already exists, so drop any leftover index from a previous run first. This also
# guarantees the index starts empty before the upsert further down.
if index_name in pinecone.list_indexes().names():
    pinecone.delete_index(index_name)

pinecone.create_index(
    name=index_name,
    dimension=256,  # must match out_features of the model's final Dense layer
    metric='cosine',
    spec=ServerlessSpec(cloud="aws", region="us-east-1")
)

# Handle used by later cells for upserts and queries.
INDEX = pinecone.Index(index_name)

In [None]:
#!wget -q --show-progress -O training.tar.zip "https://www.dropbox.com/scl/fi/rihfngx4ju5pzjzjj7u9z/lesson6.tar.zip?rlkey=rct9a9bo8euqgshrk8wiq2orh&dl=1"

#!tar -xzvf training.tar.zip

#!tar -xzvf lesson6.tar

In [None]:
# Confirm that both data files are present before later cells read them.
!ls -lrt training.txt sample.log

In [None]:
# Peek at the first five lines of sample.log (the lines that get embedded and indexed later).
!head -5 sample.log

In [None]:
# Peek at the first five lines of training.txt (parsed later as '^'-separated: text, text, label).
!head -5 training.txt

### Check CUDA and set up the model

In [None]:
import torch

# Prefer a CUDA GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

In [None]:
# Sentence-embedding model built from three stages: BERT -> pooling -> dense projection.
from sentence_transformers import InputExample, util, losses, models

# 1. Token embeddings from BERT. bert-base-uncased only has 512 position embeddings,
#    so max_seq_length must not exceed 512 (768 is the model's hidden size, not its
#    length limit); a larger value makes inputs longer than 512 tokens fail instead
#    of being truncated.
word_embedding_model = models.Transformer('bert-base-uncased', max_seq_length=512)
# 2. Pool the token embeddings into one sentence vector (library-default pooling settings).
pooling_model = models.Pooling(word_embedding_dimension=word_embedding_model.get_word_embedding_dimension())
# 3. Project the sentence vector down to 256 dimensions with a tanh activation.
#    256 must stay equal to the `dimension` the Pinecone index was created with.
dense_model = models.Dense(
    in_features=pooling_model.get_sentence_embedding_dimension(),
    out_features=256,
    activation_function=nn.Tanh()
)
# Pick the device at runtime instead of hard-coding "mps": use Apple's Metal backend
# when it is available, otherwise the `device` detected earlier (CUDA or CPU), so the
# notebook also runs on machines without MPS.
model_device = "mps" if torch.backends.mps.is_available() else device
model = SentenceTransformer(modules=[word_embedding_model, pooling_model, dense_model], device=model_device)

In [None]:
# Parse training.txt into sentence-pair examples.
# Each non-blank line has the form  <text a>^<text b>^<numeric label>.
train_examples = []

with open('training.txt', 'r') as training_file:
    for raw_line in training_file:
        record = raw_line.strip()
        if not record:
            continue  # skip blank lines
        text_a, text_b, score = record.split('^')
        train_examples.append(InputExample(texts=[text_a, text_b], label=float(score)))

In [None]:
# Number of training pairs parsed from training.txt.
len(train_examples)

In [None]:
#import pandas as pd
#df = pd.DataFrame(train_examples)
#df.head()

In [None]:
# Shuffled mini-batches of 16 training pairs.
train_dataloader = DataLoader(
    train_examples,
    batch_size=16,
    shuffle=True,
)
# Training objective: cosine-similarity loss bound to the model defined above.
train_loss_fn = losses.CosineSimilarityLoss(model)

In [None]:
#pip install accelerate -U

In [None]:
import pickle
from sentence_transformers import SentenceTransformerTrainingArguments, SentenceTransformerTrainer
from datasets import Dataset

# Toggle: True loads a previously trained model from disk, False fine-tunes `model` here.
load_pretrained_model = True

warmup_steps=100

# The earlier `train_dataset = Dataset.from_pandas(df)` line was dropped: `df` is never
# defined in this notebook (its cell is commented out), so the line raised NameError on
# a fresh kernel, and `train_dataset` was not used anywhere.

if load_pretrained_model:
    # SECURITY: pickle.load can execute arbitrary code while deserializing.
    # Only load a file that you produced yourself or otherwise trust.
    with open('pickle/pretrained_model', 'rb') as trained_model_file:
        db = pickle.load(trained_model_file)
    # Previously the loaded object was left unused in `db`, so every later cell kept
    # encoding with the untrained model. Use the loaded model from here on.
    if not isinstance(db, SentenceTransformer):
        raise TypeError(
            f"pickle/pretrained_model contains {type(db).__name__}, expected a SentenceTransformer"
        )
    model = db
else:
    # Fine-tune the model defined above on the sentence pairs (slow without a GPU).
    model.fit(train_objectives=[(train_dataloader, train_loss_fn)], epochs=16, warmup_steps=warmup_steps)

In [None]:
# Exploratory aid: prints the documentation of model.fit (the training call used above).
# NOTE(review): leftover development cell; consider removing it from the final notebook.
help(model.fit)

In [None]:
# Load sample.log as a list of stripped, non-empty lines.
with open('sample.log', "r") as log_file:
    stripped_lines = (raw_line.strip() for raw_line in log_file)
    samples = [entry for entry in stripped_lines if entry]

In [None]:
# Number of non-empty lines loaded from sample.log.
len(samples)

### Create embeddings and upsert to Pinecone

In [None]:
# Embed every sample line in one call; emb[k] is the vector for samples[k].
emb = model.encode(samples)

# One record per sample: the positional index (as a string) is the id, and the raw
# line is stored as metadata so that query results can display it.
prepped = [
    {'id': f'{idx}', 'values': emb[idx].tolist(), 'metadata': {'log': samples[idx]}}
    for idx in tqdm(range(len(samples)))
]

print(len(prepped))

In [None]:
# Upsert in batches rather than one request: Pinecone limits the size of a single
# upsert request, so sending every vector at once can fail for larger log files.
INDEX.upsert(vectors=prepped, batch_size=100)

In [None]:
# Keep the first sample for reference; its embedding (emb[0]) is the query vector used below.
good_log_line = samples[0]
print(good_log_line)

In [None]:
from time import sleep

In [None]:
# Query the index with the embedding of the first sample. Freshly upserted vectors may
# not be searchable right away, so poll until matches come back — but give up after a
# bounded number of attempts instead of spinning forever when the index stays empty.
MAX_QUERY_ATTEMPTS = 30      # 30 attempts x 2 s = about one minute
QUERY_RETRY_SECONDS = 2

results = []

for _attempt in range(MAX_QUERY_ATTEMPTS):
    sleep(QUERY_RETRY_SECONDS)
    response = INDEX.query(
        vector=emb[0].tolist(),
        top_k=100,
        include_metadata=True
    )
    results = response["matches"]
    print(".:. ",end="")
    if results:
        break
else:
    # The loop ran out of attempts without a single match.
    raise TimeoutError(
        f"No matches returned after {MAX_QUERY_ATTEMPTS} query attempts; "
        "check that the upsert succeeded."
    )

In [None]:
#print(results)

In [None]:
# Show the ten closest matches as "<score>\t<log line>". Slicing (rather than indexing
# 0..9) avoids an IndexError when the query returned fewer than ten matches.
for match in results[:10]:
  print(f"{round(match['score'], 4)}\t{match['metadata']['log']}")

In [None]:
# Show the least similar of the returned matches (the last entry in `results`).
last_line = len(results)-1

# Round to 4 decimals like the top-10 listing; rounding to an integer collapsed the
# similarity score to a whole number and hid the actual value.
print(f"{round(results[last_line]['score'], 4)}\t{results[last_line]['metadata']['log']}")