In [189]:
from state_of_the_art.insight_extractor.insiths_table import InsightsTable
import numpy as np


df = InsightsTable().read()
df = df[df['score'].notnull()]

rows_number=len(df) 
test_size = int(rows_number*0.3)
train_size = rows_number - test_size

train = df.sample(train_size, random_state=42)
df_all = df.merge(train.drop_duplicates(), on=['tdw_uuid','tdw_uuid'],
                   how='left', indicator=True, suffixes=('', '_y'))

#create DataFrame with rows that exist in first DataFrame only
test = df_all[df_all['_merge'] == 'left_only']
test = test[[c for c in test.columns if not c.endswith('_y')]]
# drop merge column
test = test.drop(columns=['_merge'])
train, test

(                                             insight  \
 0  Randomized experiments and instrumental variab...   
 0  AI's environmental impact, including energy co...   
 0  The distribution of bugs in LLM-generated code...   
 0  The proposed model combines machine learning a...   
 0  The deployment of AI systems in safety-critica...   
 0  The paper introduces a new task called causal ...   
 0  The paper demonstrates that expected marginal ...   
 0  This paper delves into improving the reasoning...   
 0  Traditional Marketing Mix Modeling (MMM) metho...   
 0  The proposed CausalMMM model integrates Grange...   
 0  Traditional methods of measuring advertising e...   
 0  The main contribution of this paper is a metho...   
 0  This paper aims to enhance Marketing Mix Model...   
 0  It addresses key challenges such as causal het...   
 
                            paper_id  score              tdw_timestamp  \
 0  https://arxiv.org/abs/2208.12809    1.0 2024-07-31 22:49:46.78641

In [190]:
import numpy as np


train_y = train['score'].astype('float32').to_numpy()
test_y =  test['score'].astype('float32').to_numpy()
train_y

array([1., 0., 3., 1., 0., 1., 3., 0., 1., 1., 3., 3., 3., 1.],
      dtype=float32)

In [191]:
train_y
test_y

array([0., 0., 3., 3., 0.], dtype=float32)

In [192]:

from sentence_transformers import SentenceTransformer

# 1. Load a pretrained Sentence Transformer model
model = SentenceTransformer("all-mpnet-base-v2")

# The sentences to encode
# 2. Calculate embeddings by calling model.encode()
train_embedding = embeddings = model.encode(train['insight'].to_list())
test_embedding = embeddings = model.encode(test['insight'].to_list())
train_embedding[0]

array([ 4.10297811e-02,  1.28135458e-01, -4.02851291e-02, -9.14486274e-02,
        4.14522272e-03, -5.72751567e-04,  8.37314874e-02, -2.41287965e-02,
        2.55535543e-02, -2.60917023e-02, -1.61805924e-03, -2.04461999e-03,
       -5.32817608e-03,  6.34432584e-02,  6.01217290e-03, -4.75540273e-02,
        1.96169131e-02, -1.31206224e-02,  6.41749278e-02,  1.18500004e-02,
       -2.71604843e-02, -4.64620739e-02, -3.32616121e-02,  3.14585888e-03,
        4.19206060e-02, -8.10076669e-03, -1.45068644e-02, -1.08898664e-02,
       -4.08961496e-04, -8.38463232e-02,  1.15226163e-02,  7.45306090e-02,
        5.10944352e-02,  1.20021189e-02,  1.59046658e-06, -4.47418392e-02,
       -1.01978797e-02,  2.47465819e-02, -3.27477604e-02,  8.45783129e-02,
        4.11038697e-02,  6.93886057e-02,  2.91374009e-02,  2.59535313e-02,
       -2.67084944e-03,  3.23748440e-02,  2.87598427e-02, -5.36585506e-03,
       -9.91658494e-02, -3.23265465e-03,  5.45172486e-03, -3.59691889e-03,
       -5.09798229e-02, -

In [193]:

import torch
from torch import nn
device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available()
    else "cpu"
)
print(f"Using {device} device")

# Define model
class NeuralNetwork(nn.Module):
    def __init__(self):
        super().__init__()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(768, 256),
            nn.ReLU(),
            nn.Linear(256, 64),
            nn.ReLU(),
            nn.Linear(64, 5)
        )

    def forward(self, x):
        logits = self.linear_relu_stack(x)
        return logits

model = NeuralNetwork().to(device)
print(model)
print(device)

Using mps device
NeuralNetwork(
  (linear_relu_stack): Sequential(
    (0): Linear(in_features=768, out_features=256, bias=True)
    (1): ReLU()
    (2): Linear(in_features=256, out_features=64, bias=True)
    (3): ReLU()
    (4): Linear(in_features=64, out_features=5, bias=True)
  )
)
mps


In [201]:
def train_fn(dataloader, model, loss_fn, optimizer):
    size = len(dataloader)

    model.train()
    for batch, (X, y) in enumerate(dataloader):
        X = torch.from_numpy(X).to(device)
        y = torch.from_numpy(np.array(y)).to(device)

        # Compute prediction error
        pred = model(X)
        loss = loss_fn(pred, y)

        # Backpropagation
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        if batch % 100 == 0:
            loss, current = loss.item(), (batch + 1) * len(X)
            print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")

In [202]:
def test_fn(dataloader, model, loss_fn):
    size = len(dataloader)
    num_batches = len(dataloader)
    model.eval()
    test_loss, correct = 0, 0
    with torch.no_grad():
        for X, y in dataloader:
            X = torch.from_numpy(X).to(device)
            y = torch.from_numpy(np.array(y)).to(device)
            pred = model(X)
            test_loss += loss_fn(pred, y).item()
            correct += (pred.argmax(1) == y).type(torch.float).sum().item()
    test_loss /= num_batches
    correct /= size
    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")

In [200]:
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
epochs = 2000

for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    train_fn(list(zip(train_embedding, train_y)), model, loss_fn, optimizer)
    test_fn(list(zip(test_embedding, test_y)), model, loss_fn)
print("Done!")

Epoch 1
-------------------------------
loss: 0.235010  [  768/   14]
Epoch 2
-------------------------------
loss: 0.234566  [  768/   14]
Epoch 3
-------------------------------
loss: 0.234216  [  768/   14]
Epoch 4
-------------------------------
loss: 0.233786  [  768/   14]
Epoch 5
-------------------------------
loss: 0.233338  [  768/   14]
Epoch 6
-------------------------------
loss: 0.232977  [  768/   14]
Epoch 7
-------------------------------
loss: 0.232372  [  768/   14]
Epoch 8
-------------------------------
loss: 0.231987  [  768/   14]
Epoch 9
-------------------------------
loss: 0.231464  [  768/   14]
Epoch 10
-------------------------------
loss: 0.231036  [  768/   14]
Epoch 11
-------------------------------
loss: 0.230670  [  768/   14]
Epoch 12
-------------------------------
loss: 0.230127  [  768/   14]
Epoch 13
-------------------------------
loss: 0.229673  [  768/   14]
Epoch 14
-------------------------------
loss: 0.229150  [  768/   14]
Epoch 15
------

In [205]:
model(torch.from_numpy(test_embedding[4]).to(device))

tensor([ 4.4054, -0.6647, -2.4064,  0.8038, -2.3884], device='mps:0',
       grad_fn=<LinearBackward0>)

In [198]:
test

Unnamed: 0,insight,paper_id,score,tdw_timestamp,tdw_uuid
6,The establishment of ethical AI systems must e...,https://arxiv.org/abs/2311.17228,0.0,2024-07-29 02:07:58.970022,251c4b1e-634e-4b0a-869b-a40521409d9e
7,Addressing ethical concerns in AI involves int...,https://arxiv.org/abs/2311.17228,0.0,2024-07-29 02:07:58.967535,1fb606c7-820b-4953-97c6-049391a8d355
10,"Media Mix Modeling (MMM), widely used for eval...",https://arxiv.org/abs/1807.03292,3.0,2024-07-28 10:50:47.039407,e9071544-026e-4ed4-931f-ebc8d24f46b3
12,LLMs are generally effective at generating cod...,https://arxiv.org/pdf/2407.06153,3.0,2024-07-28 10:46:10.454876,8ef424af-5ef0-4a6f-aba1-71b6315479cf
14,The paper posits that smaller language models ...,https://arxiv.org/abs/2407.18248,0.0,2024-07-28 03:18:07.003285,0b3b209f-fe2c-4e05-99c5-8694f8a20719


In [206]:

train_y.dtype

dtype('float32')

In [223]:
MODEL_PATH = '/Users/jean.machado/projects/state-of-the-art-via-ai/.models/model.pth'

#torch.save(model.state_dict(), MODEL_PATH)
torch.save(model.state_dict(), MODEL_PATH)


In [224]:
model2 =  NeuralNetwork()
model2.load_state_dict(torch.load(MODEL_PATH))
model2.to(device)
model2.eval()
model2(torch.from_numpy(test_embedding[4]).to(device))

  model2.load_state_dict(torch.load(MODEL_PATH))


tensor([ 4.4054, -0.6647, -2.4064,  0.8038, -2.3884], device='mps:0',
       grad_fn=<LinearBackward0>)

: 