In [114]:
import mlflow
import pandas as pd
import prophet
import os
import torch
import torch.nn as nn
import math
from tqdm.autonotebook import tqdm
import shutil
import numpy as np
import warnings
warnings.filterwarnings('ignore')

### Prepare new data to update models

In [15]:
# Read the AAPL price history used to refresh the models below and convert the
# 'Date' column from strings to timestamps.
new_data = pd.read_csv('./NASDAQ_100/AAPL.csv').assign(
    Date=lambda frame: pd.to_datetime(frame['Date'])
)
# Disabled: restrict the frame to rows newer than the last processed date.
# latest_date = df_data['Date'].tail(6).values[3]
# latest_date = ''
# with open('./containers/update/latest_date.txt', 'r') as f:
#     latest_date = f.readline()
#     f.close()
# latest_date = pd.to_datetime(latest_date)
# new_data = df_data[df_data['Date'] > latest_date]
new_data

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2022-01-18,171.509995,172.539993,169.410004,169.800003,168.820221,90956700
1,2022-01-19,170.000000,171.080002,165.940002,166.229996,165.270813,94815000
2,2022-01-20,166.979996,169.679993,164.179993,164.509995,163.560730,91420500
3,2022-01-21,164.419998,166.330002,162.300003,162.410004,161.472839,122848900
4,2022-01-24,160.020004,162.300003,154.699997,161.619995,160.687408,162294600
...,...,...,...,...,...,...,...
246,2023-01-10,130.259995,131.259995,128.119995,130.729996,130.729996,63896200
247,2023-01-11,131.250000,133.509995,130.460007,133.490005,133.490005,69458900
248,2023-01-12,133.880005,134.259995,131.440002,133.410004,133.410004,71379600
249,2023-01-13,132.029999,134.919998,131.660004,134.759995,134.759995,57758000


### Updating models

In [16]:
# Connection settings for the MLflow tracking server.
# Each value can be overridden through an environment variable of the same name.
MLFLOW_IP = os.environ.get('MLFLOW_IP', '91.206.15.104')
# The password is a secret and must not live in the notebook: export
# MLFLOW_PASSWORD before starting the kernel. It is empty when unset.
# (The previously committed password should be rotated.)
MLFLOW_PASSWORD = os.environ.get('MLFLOW_PASSWORD', '')
MLFLOW_USER = os.environ.get('MLFLOW_USER', 'godder')
MLFLOW_PORT = os.environ.get('MLFLOW_PORT', '9899')

In [17]:
# Point the MLflow client at the remote tracking server.
mlflow.set_tracking_uri(f'http://{MLFLOW_IP}:{MLFLOW_PORT}')

In [18]:
# Build a low-level client bound to the currently configured tracking and
# registry URIs; it is used below to query and re-stage registered model versions.
tracking_uri = mlflow.get_tracking_uri()
registry_uri = mlflow.get_registry_uri()

client = mlflow.MlflowClient(tracking_uri=tracking_uri, registry_uri=registry_uri)

In [19]:
# Ticker symbol this notebook works on.
# NOTE(review): the CSV path and the MLflow 'ticker' tag elsewhere in the notebook
# hard-code 'AAPL' instead of reading this variable — keep them in sync.
ticker_name = 'AAPL'

#### Prophet

In [20]:
# experiment_id = mlflow.create_experiment(
#     "Prophet update",
#     artifact_location="sftp://{}:{}@{}:/home/godder/mlflow_storage/artifacts".format(MLFLOW_USER, MLFLOW_PASSWORD, MLFLOW_IP),
#     tags={"version": ".1"},
# )

In [21]:
# Gather every registered version number of the 'Prophet' model; the highest
# one identifies the model that is about to be refreshed.
vers = [int(mv.version) for mv in client.search_model_versions("name='Prophet'")]
prophet_latest = max(vers)

In [22]:
# Resolve the id of the experiment that collects the Prophet refresh runs.
# get_experiment_by_name returns None for an unknown name, so fail with a clear
# message instead of the opaque TypeError that dict(None) would raise.
prophet_experiment = mlflow.get_experiment_by_name("Prophet update")
if prophet_experiment is None:
    raise RuntimeError('MLflow experiment "Prophet update" was not found on the tracking server')
exp_id = prophet_experiment.experiment_id

In [23]:
# Make the Prophet refresh experiment the active one for subsequent runs.
mlflow.set_experiment(experiment_id=exp_id)
print("Set experiment")

Set experiment


In [26]:
# Open the MLflow run that records the Prophet refresh.
# NOTE(review): the run is only closed by the separate `mlflow.end_run()` cell
# further down, so it stays active if any cell in between raises.
mlflow.start_run(experiment_id=exp_id, run_name='Prophet updating')

<ActiveRun: >

In [27]:
# Load the latest registered Prophet model; its hyper-parameters are reused
# for the refit below.
old_model = mlflow.prophet.load_model(
    model_uri=f"models:/Prophet/{prophet_latest}"
)

  setattr(model, attribute, pd.Timestamp.utcfromtimestamp(model_dict[attribute]).tz_localize(None))


In [28]:
# Create a fresh Prophet model that copies three hyper-parameters from the
# previously registered one, then fit it on the new data only.
updated_model = prophet.Prophet(
    changepoint_prior_scale=old_model.changepoint_prior_scale,
    seasonality_prior_scale=old_model.seasonality_prior_scale,
    weekly_seasonality=old_model.weekly_seasonality,
)
# Fit on the date and closing price, renamed to 'ds' and 'y'.
new_df = new_data[['Date', 'Close']].rename(columns={'Date': 'ds', 'Close': 'y'})
updated_model.fit(new_df)

00:07:58 - cmdstanpy - INFO - Chain [1] start processing
00:07:58 - cmdstanpy - INFO - Chain [1] done processing


<prophet.forecaster.Prophet at 0x7f7106678f10>

In [29]:
# Build a frame extending 10 periods past the training data and infer the model
# input signature from it; the same frame is later logged as the input example.
future = updated_model.make_future_dataframe(periods=10)
signature = mlflow.models.signature.infer_signature(future)

In [None]:
# (Removed an unfinished `mlflow.get` expression: it had no effect and would
# raise AttributeError on "Restart & Run All".)

In [30]:
# Log the refitted model to the active run and register it as a new version of
# the 'Prophet' registered model.
# NOTE(review): the saved output of this cell is `KeyError: '__file__'`, i.e. the
# last recorded execution failed — confirm the model is actually registered
# before running the stage-transition cell.
mlflow.prophet.log_model(updated_model,
                         'prophet',
                         registered_model_name='Prophet',
                         signature=signature,
                         input_example=future)

KeyError: '__file__'

In [171]:
# Close the active MLflow run opened for the Prophet refresh.
mlflow.end_run()

In [172]:
# Promote the newly registered Prophet version to Staging and archive the rest.
# The version numbers are read back from the registry instead of assuming
# `prophet_latest + 1` exists and that 1..N are all present: registration may
# have failed, and deleted versions leave gaps that would make the transition
# call raise.
registered_versions = sorted(
    int(mv.version) for mv in client.search_model_versions("name='Prophet'")
)
version = registered_versions[-1]
if version <= prophet_latest:
    raise RuntimeError(
        f"No Prophet version newer than {prophet_latest} is registered; "
        "was the model logged successfully?"
    )

client.transition_model_version_stage(
    name="Prophet",
    version=version,
    stage="Staging"
)

# Archive every older version that still exists in the registry.
for ver in registered_versions[:-1]:
    client.transition_model_version_stage(
        name="Prophet",
        version=ver,
        stage="Archived"
    )
print('done')

done


#### Transformer

TODO: consider moving the classes and functions below into a separate module and importing them here, to keep the notebook tidy.

In [91]:
class TransformerDataset(torch.utils.data.Dataset):
    """
    Sliding-window dataset for training a transformer on stock price data.

    Each item is one window of ``data`` (selected by a ``(start, end)`` pair from
    ``indices``) split into encoder input, decoder input and ground truth.
    """
    def __init__(self, data, indices, enc_seq_len,
                 dec_seq_len, target_seq_len, normalize=False):
        """
        Store the data and the window configuration.

        Args:
            data: pd.DataFrame. Data to work with, one row per time step
            indices: list of (start, end) tuples; window ``i`` is ``data[start:end]``
            enc_seq_len: sequence length for the encoder input
            dec_seq_len: sequence length for the decoder (stored, not used for splitting)
            target_seq_len: sequence length of the target
            normalize: if True, standardise each column with the mean and std
                computed over the whole of ``data``
        """
        self.data = data
        self.indices = indices
        self.enc_seq_len = enc_seq_len
        self.dec_seq_len = dec_seq_len
        self.target_seq_len = target_seq_len
        if normalize:
            self.data = (self.data - self.data.mean()) / self.data.std()

    def __len__(self):
        """
        Returns: number of windows (length of ``indices``)
        """
        return len(self.indices)

    def __getitem__(self, index):
        """
        Get the tuple (src, trg, trg_y) for one window.

        Args:
            index: int, position of the window in ``indices``
        """
        starting_index, ending_index = self.indices[index][0], self.indices[index][1]

        sequence = self.data[starting_index:ending_index]

        return self.get_src_trg(sequence)

    def get_src_trg(self, sequence):
        """
        Split one window into source, target and ground truth tensors.

        Args:
            sequence: window of ``enc_seq_len + target_seq_len`` rows

        Returns:
            (src, trg, trg_y) float32 tensors: ``src`` is the first ``enc_seq_len``
            rows, ``trg`` starts at the last row of ``src`` and stops one row before
            the end, ``trg_y`` is the last ``target_seq_len`` rows.

        Raises:
            AssertionError: if the window does not have the expected length
        """
        # The window is split at enc_seq_len, so its length must be checked
        # against the encoder length. The previous check used dec_seq_len and
        # only worked when both lengths happened to be equal.
        expected_len = self.enc_seq_len + self.target_seq_len
        assert len(sequence) == expected_len, \
            f"expected a window of {expected_len} rows, got {len(sequence)}"

        src = sequence[:self.enc_seq_len]
        # Decoder input: the target shifted one step back.
        trg = sequence[self.enc_seq_len - 1:len(sequence) - 1]
        assert len(trg) == self.target_seq_len

        trg_y = sequence[-self.target_seq_len:]
        assert len(trg_y) == self.target_seq_len

        src = torch.tensor(src.values.astype(np.float32))
        trg = torch.tensor(trg.values.astype(np.float32))
        trg_y = torch.tensor(trg_y.values.astype(np.float32))
        return src, trg, trg_y
    
def get_src_trg(self, sequence, enc_seq_len, target_seq_len):
    """
    Split a 2-D sequence into source, target and ground truth for the transformer.

    Args:
        self: not used by the body (kept for signature compatibility)
        sequence: sequence to split; indexed as ``sequence[:, 0]`` so it must be 2-D
        enc_seq_len: length of the encoder sequence
        target_seq_len: length of the decoder / target sequence

    Returns: tuple (src, trg, trg_y) where ``src`` keeps all columns, ``trg`` is the
    first column with a trailing unit dimension and ``trg_y`` is the first column
    after ``squeeze(-1)``
    """
    total_len = len(sequence)
    assert (total_len == enc_seq_len + target_seq_len)

    encoder_input = sequence[:enc_seq_len]

    # Decoder input: last value of the encoder input followed by all target
    # values except the final one (the target shifted right by one step).
    decoder_input = sequence[enc_seq_len - 1:total_len - 1][:, 0]
    if len(decoder_input.shape) == 1:
        decoder_input = decoder_input.unsqueeze(-1)
    assert (len(decoder_input) == target_seq_len)

    # Ground truth the model output is compared against when computing the loss.
    ground_truth = sequence[-target_seq_len:][:, 0]
    assert (len(ground_truth) == target_seq_len)

    return encoder_input, decoder_input, ground_truth.squeeze(-1)
    
def get_indices_entire_sequence(data: pd.DataFrame, window_size, step_size):
    """
    Produce the (start, end) slice bounds of every sliding window over ``data``.

    Args:
        data: data to split (only ``len(data)`` is used)
        window_size: size of sliding window (aka seq length)
        step_size: how many positions the window advances in each iteration

    Returns:
        list of ``(start, end)`` tuples meant for ``data[start:end]`` slicing,
        so ``end`` is exclusive and may equal ``len(data)``

    Raises:
        ValueError: if ``step_size`` is not positive (the window would never advance)
    """
    if step_size <= 0:
        raise ValueError(f"step_size must be positive, got {step_size}")

    # The bounds are end-exclusive slice indices, so the last valid window ends
    # at len(data). The previous `len(data) - 1` limit dropped that window, and
    # with it the most recent observation.
    last_start = len(data) - window_size
    return [
        (start, start + window_size)
        for start in range(0, last_start + 1, step_size)
    ]

In [35]:
# Resolve and activate the experiment that tracks the transformer runs.
# get_experiment_by_name returns None for an unknown name, so fail with a clear
# message instead of the opaque TypeError that dict(None) would raise.
transformer_experiment = mlflow.get_experiment_by_name("CompatibleTransformer")
if transformer_experiment is None:
    raise RuntimeError('MLflow experiment "CompatibleTransformer" was not found on the tracking server')
exp_id = transformer_experiment.experiment_id
mlflow.set_experiment(experiment_id=exp_id)
print("Set experiment")

Set experiment


In [61]:
# Download the artifacts of the most recently finished run into ./temp.
runs = mlflow.search_runs(experiment_ids=[exp_id])
# Filter with a boolean mask. The previous `where(...).dropna()` also discarded
# finished runs that had a NaN in any column, e.g. a metric, parameter or tag
# that only some runs logged.
finished_runs = runs[runs['status'] == 'FINISHED']
if finished_runs.empty:
    raise RuntimeError(f"Experiment {exp_id} has no FINISHED run to download artifacts from")
run_id = finished_runs.sort_values(by='end_time', ascending=False).iloc[0].run_id
# exist_ok avoids the check-then-create race and makes the cell re-runnable.
os.makedirs('temp', exist_ok=True)
mlflow.artifacts.download_artifacts(run_id=run_id, dst_path='temp')
print ('...done!')

...done!


In [None]:
# --- Hyper-parameters for fine-tuning the transformer on the new data ---
batch_size = 10
dec_seq_len = 50
enc_seq_len = 50
output_sequence_length = 10
window_size = enc_seq_len + output_sequence_length
max_seq_len = enc_seq_len
normalize = True
dropout_probability = .2
n_epochs = 40
DEVICE = 'cpu'
lr = 2e-5
# Index of the feature the loss is computed on (previously a bare `3` in the loop).
# NOTE(review): presumably the 'Close' column of `transformer_data` — confirm
# against the column order the model was originally trained with.
TARGET_FEATURE_INDEX = 3

# --- Data: every column except the date, cut into sliding windows ---
transformer_data = new_data.drop(columns=['Date'])
training_indices = get_indices_entire_sequence(transformer_data, window_size, 1)
training_dataset = TransformerDataset(transformer_data,
                                      training_indices,
                                      enc_seq_len,
                                      dec_seq_len,
                                      output_sequence_length,
                                      normalize=normalize)

training_dataloader = torch.utils.data.DataLoader(training_dataset, batch_size)

# --- Model ---
# The model is bound to `model`, the name the training loop and log_model use.
# It was previously loaded into `m`, leaving `model` undefined on a fresh kernel.
# SECURITY: torch.load unpickles an arbitrary object — only load trusted artifacts.
model = torch.load('temp/transformer.pth/data/model.pth', map_location=DEVICE)
model.train()
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

# Raw feature matrix, used for both the logged signature and the input example.
model_input_example = transformer_data.values.astype(np.float64)

with mlflow.start_run(experiment_id=exp_id,
                      run_name="Transformer update"):
    mlflow.set_tag('ticker', ticker_name)

    for epoch in tqdm(range(n_epochs)):
        train_epoch_losses = []

        for src, trg, trg_y in training_dataloader:
            src = src.to(device=DEVICE)
            trg = trg.to(device=DEVICE)
            # Keep only the target feature, then swap the first two axes before
            # comparing with the model output.
            trg_y = trg_y.to(device=DEVICE)[:, :, TARGET_FEATURE_INDEX].unsqueeze(-1)
            trg_y = trg_y.permute(1, 0, 2)

            optimizer.zero_grad()
            output = model.inner_forward(src, trg)
            loss = criterion(output, trg_y)

            train_epoch_losses.append(loss.item())
            loss.backward()
            optimizer.step()

        epoch_loss = np.mean(train_epoch_losses)
        mlflow.log_metric("train_loss", epoch_loss, step=epoch)
        if (epoch+1) % 10 == 0:
            print(f'Epoch {epoch+1}/{n_epochs}, Training Loss: {epoch_loss:.4f}')

    signature = mlflow.models.signature.infer_signature(model_input_example)

    mlflow.pytorch.log_model(model,
             'transformer.pth',
             registered_model_name='CompatibleTransformer',
             signature = signature,
             input_example = model_input_example
             )
    

In [110]:
# Promote the newest 'CompatibleTransformer' version to Staging, archive the
# older ones and remove the temporary artifact download.
vers = sorted(
    int(mv.version)
    for mv in client.search_model_versions("name='CompatibleTransformer'")
)
version = vers[-1]

client.transition_model_version_stage(
    name="CompatibleTransformer",
    version=version,
    stage="Staging"
)

# Iterate over the versions that actually exist. `range(1, version)` assumed
# contiguous numbering and would raise on a deleted version.
for ver in vers[:-1]:
    client.transition_model_version_stage(
        name="CompatibleTransformer",
        version=ver,
        stage="Archived"
    )

# ignore_errors keeps the cell re-runnable when temp/ is already gone.
shutil.rmtree('temp/', ignore_errors=True)

print('done')

done


#### ARIMA

In [173]:
from statsmodels.tsa.arima.model import ARIMA
import numpy as np
import pmdarima

In [174]:
# experiment_id = mlflow.create_experiment(
#     "ARIMA update",
#     artifact_location="sftp://{}:{}@{}:/home/godder/mlflow_storage/artifacts".format(MLFLOW_USER, MLFLOW_PASSWORD, MLFLOW_IP),
#     tags={"version": ".1"},
# )

In [175]:
# Gather every registered version number of the 'ARIMA' model; the highest one
# identifies the model that is about to be refreshed.
vers = [int(mv.version) for mv in client.search_model_versions("name='ARIMA'")]
arima_latest = max(vers)

In [176]:
# Resolve the id of the experiment that collects the ARIMA refresh runs.
# get_experiment_by_name returns None for an unknown name, so fail with a clear
# message instead of the opaque TypeError that dict(None) would raise.
arima_experiment = mlflow.get_experiment_by_name("ARIMA update")
if arima_experiment is None:
    raise RuntimeError('MLflow experiment "ARIMA update" was not found on the tracking server')
exp_id = arima_experiment.experiment_id

In [177]:
# Make the ARIMA refresh experiment the active one for subsequent runs.
mlflow.set_experiment(experiment_id=exp_id)
print("Set experiment")

Set experiment


In [178]:
# Open the MLflow run that records the ARIMA refresh.
# NOTE(review): the run is only closed by the separate `mlflow.end_run()` cell
# further down, so it stays active if any cell in between raises.
mlflow.start_run(experiment_id=exp_id, run_name='ARIMA updating')

<ActiveRun: >

In [179]:
# Take the date and closing price and replace the price with its natural
# logarithm; the ARIMA models below are fitted on this log series.
new_df = new_data[['Date', 'Close']].assign(
    Close=lambda frame: np.log(frame['Close'])
)

In [180]:
# Load the latest registered ARIMA model; its `order` is reused below.
old_model = mlflow.pmdarima.load_model(
    model_uri=f"models:/ARIMA/{arima_latest}"
)

In [181]:
# Refit a statsmodels ARIMA with the order of the previously registered model
# on the log closing prices.
# NOTE(review): `updated_model` is not referenced by any later cell in this
# notebook — the model that gets logged is the separate `auto_arima` result `m`.
# Confirm which of the two is meant to be registered.
new_model = ARIMA(new_df['Close'], order=old_model.order)
updated_model = new_model.fit()

In [182]:
# Fit an ARIMA on the log closing prices with pmdarima's auto_arima; this is
# the model that is logged and registered below.
m = pmdarima.auto_arima(new_df['Close'])

In [183]:
# Infer the model signature from a one-row frame holding the prediction
# settings (n_periods, return_conf_int, alpha); the same frame is passed as the
# input example when the model is logged.
signature = mlflow.models.signature.infer_signature(pd.DataFrame([{"n_periods": 10, "return_conf_int": True, "alpha": 0.1}]))

  inputs = _infer_schema(model_input)


In [184]:
# Log the auto_arima model to the active run and register it as a new version
# of the 'ARIMA' registered model.
mlflow.pmdarima.log_model(m, 'ARIMA', registered_model_name='ARIMA',
                         signature=signature,
                         input_example=pd.DataFrame([{"n_periods": 10, "return_conf_int": True, "alpha": 0.1}]))

Registered model 'ARIMA' already exists. Creating a new version of this model...
2023/01/19 20:33:31 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: ARIMA, version 9
Created version '9' of model 'ARIMA'.


<mlflow.models.model.ModelInfo at 0x7f6a06c20070>

In [185]:
# Close the active MLflow run opened for the ARIMA refresh.
mlflow.end_run()

In [186]:
# Promote the newly registered ARIMA version to Staging and archive the rest.
# The version numbers are read back from the registry instead of assuming
# `arima_latest + 1` exists and that 1..N are all present: registration may
# have failed, and deleted versions leave gaps that would make the transition
# call raise.
registered_versions = sorted(
    int(mv.version) for mv in client.search_model_versions("name='ARIMA'")
)
version = registered_versions[-1]
if version <= arima_latest:
    raise RuntimeError(
        f"No ARIMA version newer than {arima_latest} is registered; "
        "was the model logged successfully?"
    )

client.transition_model_version_stage(
    name="ARIMA",
    version=version,
    stage="Staging"
)

# Archive every older version that still exists in the registry.
for ver in registered_versions[:-1]:
    client.transition_model_version_stage(
        name="ARIMA",
        version=ver,
        stage="Archived"
    )
print('done')

done
