In [1]:
class SparseVector:
    def __init__(self, nums: list[int]):
        self.nums = nums
        self.indices = [i for i in range(len(nums)) if nums[i] != 0]

    def dot_product(self, vec: 'SparseVector') -> int:
        result = 0
        i = 0
        j = 0
        while i < len(self.indices) and j < len(vec.indices):
            if self.indices[i] == vec.indices[j]:
                result += self.nums[self.indices[i]] * vec.nums[vec.indices[j]]
                i += 1
                j += 1
            elif self.indices[i] < vec.indices[j]:
                i += 1
            else:
                j += 1
        return result

In [2]:
nums1 = [0,0,3,40,0,0,0,5,0]
nums2 = [1,0,0,2,0,14,0,0,0]
vec1 = SparseVector(nums1)
vec2 = SparseVector(nums2)
result = vec1.dot_product(vec2)


In [3]:
result

80

In [4]:
import pandas as pd
import numpy as np

In [7]:
data = {
    "customer_global_rk": [110, 110, 110, 111, 111, 111, 112, 112, 113, 113, 113],
    "trans_dt": ["2023-01-01", "2023-01-01", "2023-01-03", "2023-01-04", "2023-01-05", "2023-01-08", "2023-01-01", "2023-01-02", "2023-01-09", "2023-01-10", "2023-01-12"],
    "trans_amt": [123, 235, 345, 654, 229, 345, 212, 334, 789, 200, 557]
}
df = pd.DataFrame(data)


Unnamed: 0,customer_global_rk,trans_dt,trans_amt
0,110,2023-01-01,123
1,110,2023-01-01,235
2,110,2023-01-03,345
3,111,2023-01-04,654
4,111,2023-01-05,229
5,111,2023-01-08,345
6,112,2023-01-01,212
7,112,2023-01-02,334
8,113,2023-01-09,789
9,113,2023-01-10,200


In [8]:
a1 = df.groupby(["customer_global_rk", "trans_dt"])["trans_amt"].sum()

In [9]:
a1

customer_global_rk  trans_dt  
110                 2023-01-01    358
                    2023-01-03    345
111                 2023-01-04    654
                    2023-01-05    229
                    2023-01-08    345
112                 2023-01-01    212
                    2023-01-02    334
113                 2023-01-09    789
                    2023-01-10    200
                    2023-01-12    557
Name: trans_amt, dtype: int64

In [37]:
data = {
    'date': ['2022-01-01', '2022-01-02', '2022-01-03', '2022-01-01', '2022-01-02', '2022-01-03', '2022-01-02'],
    'client_id': [1,1,2,2,3,3,1],
    'amount': [100, 200, 300, 400, 500, 600, 700]
}
df = pd.DataFrame(data)
df

Unnamed: 0,date,client_id,amount
0,2022-01-01,1,100
1,2022-01-02,1,200
2,2022-01-03,2,300
3,2022-01-01,2,400
4,2022-01-02,3,500
5,2022-01-03,3,600
6,2022-01-02,1,700


In [39]:
df['date'] = pd.to_datetime(df['date'])
min_date = df['date'].min()
max_date = df['date'].max()

trans = df.groupby(['client_id', 'date'])['amount'].sum().reset_index()

trans_pivot = pd.pivot_table(trans, values='amount', index='client_id', columns='date')
trans_pivot = trans_pivot.fillna(0.0)

# Transform the table to matrix
matrix = trans_pivot.values.astype(int).tolist()
#matrix

# Add zeros for days when client doesnt have transactions
for row in matrix:
    while len(row) < (max_date - min_date).days + 1:
        row.append(0)
print(matrix)


[[100, 900, 0], [400, 0, 300], [0, 500, 600]]


In [40]:
dbf = pd.DataFrame(matrix)

In [41]:
dbf

Unnamed: 0,0,1,2
0,100,900,0
1,400,0,300
2,0,500,600


In [1]:
import pandas as pd

In [39]:
import pandas as pd
from pandas_datareader import data as pdr
import yfinance as yf
yf.pdr_override()


# Define the futures contract name and timeframe
contract = 'GC=F' 
#contract = 'UVXY'
start = '2023-01-01'
end = '2024-01-01'

# Download the data
df = pdr.get_data_yahoo(contract, start=start, end=end)

# Keep only the relevant columns  
df = df[['Open', 'High', 'Low', 'Close', 'Volume']]

# Output to CSV
#df.to_csv('gold_futures.csv')

[*********************100%***********************]  1 of 1 completed


In [40]:
import pandas as pd
import yfinance as yf

def get_data(ticker: str, start: str, end: str)-> pd.DataFrame:
    # Download the data
    #df = pdr.get_data_yahoo(tickers=ticker, start=start, end=end)
    df = yf.download(ticker, start=start, end=end)

    return df



In [41]:
df = get_data(ticker="GC=F", start="2023-01-01", end="2024-01-01")

[*********************100%***********************]  1 of 1 completed


In [42]:
df.head()

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2023-01-03,1836.199951,1839.699951,1836.199951,1839.699951,1839.699951,29
2023-01-04,1845.599976,1859.099976,1845.599976,1852.800049,1852.800049,25
2023-01-05,1855.199951,1855.199951,1834.800049,1834.800049,1834.800049,24
2023-01-06,1838.400024,1868.199951,1835.300049,1864.199951,1864.199951,26
2023-01-09,1867.0,1880.0,1867.0,1872.699951,1872.699951,62


In [43]:
df["diff"] = df["Close"].pct_change() * 100
df = df.fillna(0.0)

In [44]:
df.head()

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,diff
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2023-01-03,1836.199951,1839.699951,1836.199951,1839.699951,1839.699951,29,0.0
2023-01-04,1845.599976,1859.099976,1845.599976,1852.800049,1852.800049,25,0.712078
2023-01-05,1855.199951,1855.199951,1834.800049,1834.800049,1834.800049,24,-0.971503
2023-01-06,1838.400024,1868.199951,1835.300049,1864.199951,1864.199951,26,1.602349
2023-01-09,1867.0,1880.0,1867.0,1872.699951,1872.699951,62,0.45596


In [31]:
df['diff'].min()

-2.7866212520694877

In [32]:
df['diff'].max()

3.108113950051017

In [45]:
# Define min and max for input range  
input_min = df['diff'].min()  
input_max = df['diff'].max()

# Define min and max for output range
output_min = 1
output_max = 64  

# Rescale values  
df['scaled'] = round(((df['diff'] - input_min) / 
                   (input_max - input_min)) * 
                  (output_max - output_min) + output_min)
df['scaled'] = df['scaled'].astype(int)


In [36]:
df.head()

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,diff,scaled
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2023-01-03,1836.199951,1839.699951,1836.199951,1839.699951,1839.699951,29,0.0,31
2023-01-04,1845.599976,1859.099976,1845.599976,1852.800049,1852.800049,25,0.712078,38
2023-01-05,1855.199951,1855.199951,1834.800049,1834.800049,1834.800049,24,-0.971503,20
2023-01-06,1838.400024,1868.199951,1835.300049,1864.199951,1864.199951,26,1.602349,48
2023-01-09,1867.0,1880.0,1867.0,1872.699951,1872.699951,62,0.45596,36


In [46]:
# # Prepare timeseries for ML 

# import pandas as pd
# import numpy as np

# # Generate sample timeseries  
# N = 1000
# series = np.random.randint(0, 10, size=N)  

series = df['scaled']

num_steps = 64 
num_labels = 5

data = []
labels = []

# Iterate over series 
for i in range(len(series) - num_steps - num_labels + 1):
    vec = series[i:i+num_steps]
    label = series[i+num_steps:i+num_steps+num_labels]
    
    # Append to lists
    data.append(list(vec))  
    labels.append(list(label))
    
# Create dataframe    
df = pd.DataFrame()
df['vec'] = data
df['label'] = labels

print(df.head())


                                                 vec                 label
0  [31, 38, 20, 48, 36, 30, 32, 43, 44, 25, 29, 4...  [30, 26, 19, 39, 34]
1  [38, 20, 48, 36, 30, 32, 43, 44, 25, 29, 41, 3...  [26, 19, 39, 34, 47]
2  [20, 48, 36, 30, 32, 43, 44, 25, 29, 41, 33, 3...  [19, 39, 34, 47, 10]
3  [48, 36, 30, 32, 43, 44, 25, 29, 41, 33, 31, 3...  [39, 34, 47, 10, 27]
4  [36, 30, 32, 43, 44, 25, 29, 41, 33, 31, 35, 3...  [34, 47, 10, 27, 38]


In [47]:
df.to_parquet("gc.parquet", index=False)

In [49]:
df.shape

(183, 2)

In [165]:
df = pd.read_parquet("gc.parquet")

In [166]:
# 6S
import numpy as np
import torch
import torch.nn as nn
from tqdm import tqdm


class SSMSelection(nn.Module):
    def __init__(self, input_dim, hidden_dim):
        super().__init__()
        
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        
        self.A = nn.Linear(input_dim, hidden_dim) 
        self.B = nn.Linear(input_dim, hidden_dim)
        self.C = nn.Linear(hidden_dim, 4)
        
    def forward(self, x):
        batch_size, seq_len = x.shape
        
        A = self.A(x) # (batch_size, seq_len, hidden_dim)
        B = self.B(x) # (batch_size, seq_len, hidden_dim)
        C = self.C(x) # (batch_size, seq_len, 4)
        
        h = torch.zeros(batch_size, seq_len, self.hidden_dim)
        y = torch.zeros(batch_size, seq_len, 4)
        
        for t in range(seq_len):
            C_t = C[:, t]
            C_flat = C_t.reshape(batch_size, self.hidden_dim)
            h[:,t] = A[:,t] @ h[:,t] + B[:,t] @ x[:,t] 
            y[:,t] = C_flat @ h[:,t]
            
        return y


input_dims = 64
hidden_dims = 64
output_dims = 5

arr_x = np.array(df["vec"].tolist())
arr_y = np.array(df["label"].tolist())

X_train = torch.from_numpy(arr_x.astype(np.float32))
y_train = torch.from_numpy(arr_y)

# Define and train model
model = SSMSelection(input_dims, hidden_dims)
optimizer = torch.optim.Adam(model.parameters())

for epoch in range(100):
    y_pred = model(X_train)
    loss = ((y_pred - y_train.float())**2).mean()  
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()


RuntimeError: shape '[183, 64]' is invalid for input of size 183

In [169]:
y_train.shape

torch.Size([183, 5])

In [171]:
X_train.shape

torch.Size([183, 64])

In [172]:
y_train.shape

torch.Size([183, 5])

In [178]:
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

model = AutoModelForSeq2SeqLM.from_pretrained("google/t5-large") 
tokenizer = AutoTokenizer.from_pretrained("google/t5-large")

model.resize_token_embeddings(len(tokenizer)) 

# Prepare data
X = X_train # torch tensor (1000, 64)  
y = y_train # torch tensor (1000, 5)

dataset = list(zip(X, y))
encoded = tokenizer(X.tolist(), padding=True, truncation=True, return_tensors="pt")  

# Add sequence dimension to target
y = y[:, :, None]  

# Train
optimizer = torch.optim.Adam(model.parameters(), lr=5e-5) 

for epoch in range(3):
  model.train()  
  for i in range(0, len(dataset), 32):
    
    batch_X, batch_y = encoded[i:i+32], y[i:i+32]    
    outputs = model(**batch_X, labels=batch_y)   
    loss = outputs.loss
    
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()


    
# Inference
# test_input = # torch tensor (1, 64)
# outputs = model.generate(test_input)  
# print(tokenizer.decode(outputs[0]))

OSError: google/t5-large is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo with `use_auth_token` or log in with `huggingface-cli login` and pass `use_auth_token=True`.