In [1]:
from ray import tune
from ray.tune import CLIReporter
from ray.tune.schedulers import ASHAScheduler
import torch
from torch.utils.data.datapipes import iter
import math
import torch.nn as nn 
from functools import partial
from ray.air import session
import numpy as np
import os
import matplotlib.pyplot as plt 

from os.path import isfile, join

 

path = './data'

In [2]:
def clean_and_split_data(data_set):
    data_set = data_set.split(' ')
    data_set = list(filter(lambda x: x != '', data_set))
    return data_set

def encode_data(data, char2index):
  data = [char2index[c] for c in data]
  return data 

def decode_data(data, index2char):
  data = [index2char[c] for c in data]
  return data 


def load_data(path):

  with open(join(path,'ptb.train.txt')) as f:
      train_data = f.read()
      
  with open(join(path,'ptb.valid.txt')) as f:
      val_data = f.read()
      
  with open(join(path,'ptb.test.txt')) as f:
      test_data = f.read()
       

  ## clean and tokenize 
  train_data =clean_and_split_data(train_data)
  val_data =clean_and_split_data(val_data)  
  test_data =clean_and_split_data(test_data)

  ## vocab
  words = sorted(set(train_data+val_data+test_data))

  ##encoder
  char2index = {c: i for i, c in enumerate(words)}
  ##decoder 
  index2char = {i: c for i, c in enumerate(words)}



  ## encode 
  train_data = encode_data(train_data,char2index)
  val_data = encode_data(val_data,char2index)
  test_data = encode_data(test_data,char2index)
  
  return train_data, val_data, test_data, index2char,words


In [3]:
def make_batchs(data,batch_size):
  try :
    data = iter.Batcher(data,batch_size = batch_size, drop_last=True)
  except:
    print('needs to get an itrable Wraped data - make seq first')

  return data 

def make_seq(data,seq_length):
  label = iter.IterableWrapper(data[1:])
  data = iter.IterableWrapper(data)
   
  label = iter.Batcher(label,batch_size = seq_length, drop_last=True)
  data = iter.Batcher(data,batch_size = seq_length, drop_last=True)

  return data,label
  

In [4]:

class wordModel(nn.Module):
    
    def __init__(self,words,device, num_hidden=200, num_layers=2,drop_prob=0.5,with_drop = False):
        
        
        # SET UP ATTRIBUTES
        super().__init__()
        self.drop_prob = drop_prob
        self.num_layers = num_layers
        self.num_hidden = num_hidden
        self.words = words
        self.with_drop = with_drop
        
      
      
        self.lstm = nn.LSTM(num_hidden, num_hidden, num_layers, dropout=drop_prob, batch_first=True)
        
        self.dropout = nn.Dropout(drop_prob)
        
        self.fc_linear = nn.Linear(num_hidden, len(self.words))
      
        self.embeding = nn.Embedding(len(words),self.num_hidden)

        self.device = device  


    def forward(self, x, hidden):
                  
        x = self.embeding(x)
       
        lstm_output, hidden = self.lstm(x, hidden)
        
        if self.with_drop:
          drop_output = self.dropout(lstm_output)
          
          drop_output = drop_output.contiguous().view(-1, self.num_hidden)
          final_out = self.fc_linear(drop_output)

        else:
          out = lstm_output.contiguous().view(-1, self.num_hidden)
          final_out = self.fc_linear(out)
        
        
        return final_out, hidden
    
    
    def hidden_state(self, batch_size):

        if self.device == 'cuda':
            
            hidden = (torch.zeros(self.num_layers,batch_size,self.num_hidden).cuda(),
                     torch.zeros(self.num_layers,batch_size,self.num_hidden).cuda())
        else:
            hidden = (torch.zeros(self.num_layers,batch_size,self.num_hidden),
                     torch.zeros(self.num_layers,batch_size,self.num_hidden))
        
    
        return hidden


In [5]:
def plot_graph(num_epochs, training_perplexity, validation_perplexity, dropout=False):
    x = [el+1 for el in range(num_epochs)]
    y_1 = training_perplexity
    y_2 = validation_perplexity
    title = None
    if dropout:
        title = 'Dropout'
    else:
        title = 'No Dropout'
    plt.plot(x, y_1, color='blue')
    plt.plot(x, y_2, color='orange')
    plt.xlabel('Epoch number')
    plt.ylabel('Perplexity')
    plt.legend(['training perplexity', 'validation perplexity'])
    plt.title(title)
    plt.show()
    print(f'best training perplexity was {min(training_perplexity)}')
    print(f'best testing perplexity was {min(validation_perplexity)}')


def calculate_perplexity(losses):
    total_loss = sum(losses)
    average_loss = total_loss / len(losses)
    perplexity = math.exp(average_loss)
    return perplexity

def train_eval_lstm(config,checkpoint_dir=None):

  device = 'cuda' if torch.cuda.is_available() else 'cpu'
  path = r'/home/robotics20/Documents/rotem/learn/deep/data/data'

  ## read file and split 
  train_data, val_data, test_data, index2char,words = load_data(path) 
  ## make seq and batches 
  train_data,train_label = make_seq(train_data,config["seq_length"])

  train_data = make_batchs(train_data,config["batch_size"])
  train_label = make_batchs(train_label,config["batch_size"])


  val_data,val_label = make_seq(val_data,config["seq_length"])

  val_data = make_batchs(val_data,config["batch_size"])
  val_label = make_batchs(val_label,config["batch_size"])


  ### to tensors 
  train_data = torch.tensor(np.array(train_data),dtype=torch.int)
  train_label = torch.tensor(np.array(train_label),dtype=torch.int)


  val_data = torch.tensor(np.array(val_data),dtype=torch.int)
  val_label = torch.tensor(np.array(val_label),dtype=torch.int)

  model = wordModel(
    words = words,
    device = device,
    num_hidden=config["num_hidden"],
    num_layers=config["num_hidden"],
    drop_prob=0.5,
    with_drop=False,
  )

  model = model.to(device)


  optimizer = torch.optim.SGD(model.parameters(),lr=config["lr"],momentum=config["momentum"],weight_decay=config["weight_decay"])
  criterion = nn.CrossEntropyLoss()
  schedualer = torch.optim.lr_scheduler.ExponentialLR(
        optimizer=optimizer, gamma=config["gamma"])

  train_losses = []
  val_losses = []
  train_preplexity = []
  val_preplexity = []
  print("starts training ")

  for i in range(int(config["epoch"])):
    model.train()
    hidden = model.hidden_state(batch_size=config["batch_size"])
    print(i)
    
    for j, featurs in enumerate(train_data):
      labels = train_label[j]
      featurs = featurs.to(device)
      labels = labels.to(device)
      
      hidden = tuple([state.data for state in hidden])    
      lstm_output, hidden = model.forward(featurs,hidden)
      
      loss = criterion(lstm_output,labels.view(config["batch_size"]*config["seq_length"]).long())
      loss.backward()
      nn.utils.clip_grad_norm_(model.parameters(),max_norm=5)

      optimizer.step()
      optimizer.zero_grad()
      
    schedualer.step()
    ### validation 
    model.eval()
    val_hidden = model.hidden_state(config["batch_size"])
    for k, val_featurs in enumerate(val_data):
      val_labels = val_label[k]
      val_featurs = val_featurs.to(device)
      val_labels = val_labels.to(device)

      val_hidden = tuple([state.data for state in val_hidden])   
      lstm_output, val_hidden = model.forward(val_featurs,val_hidden)
      val_loss = criterion(lstm_output,val_labels.view(config["batch_size"]*config["seq_length"]).long())

    model.train()


    train_losses.append(loss.item())
    #preplexcity
    train_preplexity.append(calculate_perplexity(train_losses)) 
    val_losses.append(val_loss.item())
    #preplexcity
    val_preplexity.append(calculate_perplexity(val_losses))
    print(f"Epoch: {i} Step: {j} trainloss: {loss.item()} Val Loss: {val_loss.item()}")
   
    with tune.checkpoint_dir(i) as checkpoint_dir:
      path = os.path.join(checkpoint_dir, "checkpoint")
      torch.save((model.state_dict(), optimizer.state_dict()), path)

    tune.report(loss=val_loss.item())
  return train_preplexity, val_preplexity


In [6]:

max_num_epochs = 20
num_samples = 5
config = {
    "gamma": tune.loguniform(0.9, 0.99),
    "weight_decay": tune.loguniform(0.0001,0.001 ),
    "lr": tune.loguniform(1, 6),
    "momentum": tune.loguniform(0.5, 0.9),
    "num_hidden": tune.choice([100, 200, 300, 400]),
    "num_layers": tune.choice([1, 2, 3, 4]),
    "seq_length": tune.choice([5, 10, 20, 30,40,50]),
    "batch_size": tune.choice([10, 20, 30, 40,50]),
    "epoch": tune.uniform(5,5)
}
scheduler = ASHAScheduler(
    metric="loss",
    mode="min",
    max_t=max_num_epochs,
    grace_period=1,
    reduction_factor=2)
reporter = CLIReporter(
    # ``parameter_columns=["l1", "l2", "lr", "batch_size"]``,
    metric_columns=["loss","val_preplexity","epoch"])
result = tune.run(
    partial(train_eval_lstm ),
    resources_per_trial={"cpu": 20, "gpu": 1},
    config=config,
    num_samples=num_samples,
    scheduler=scheduler,
    progress_reporter=reporter)
best_trial = result.get_best_trial("val_loss", "min", "last")
print("Best trial config: {}".format(best_trial.config))
print("Best trial final validation loss: {}".format(
    best_trial.last_result["val_loss"]))
print("Best trial final val_preplexity: {}".format(
    best_trial.last_result["val_preplexity"]))

2023-05-01 14:19:03,924	INFO worker.py:1625 -- Started a local Ray instance.
2023-05-01 14:19:04,479	INFO tune.py:218 -- Initializing Ray automatically. For cluster usage or custom Ray initialization, call `ray.init(...)` before `tune.run(...)`.

from ray.air import session

def train(config):
    # ...
    session.report({"metric": metric}, checkpoint=checkpoint)

For more information please see https://docs.ray.io/en/latest/tune/api/trainable.html

2023-05-01 14:19:04,505	INFO tensorboardx.py:172 -- pip install "ray[tune]" to see TensorBoard files.


== Status ==
Current time: 2023-05-01 14:19:04 (running for 00:00:00.05)
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 16.000: None | Iter 8.000: None | Iter 4.000: None | Iter 2.000: None | Iter 1.000: None
Logical resource usage: 20.0/24 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:G)
Result logdir: /home/robotics20/ray_results/train_eval_lstm_2023-05-01_14-19-04
Number of trials: 5/5 (4 PENDING, 1 RUNNING)
+-----------------------------+----------+-----------------------+--------------+---------+----------+---------+------------+--------------+--------------+--------------+----------------+
| Trial name                  | status   | loc                   |   batch_size |   epoch |    gamma |      lr |   momentum |   num_hidden |   num_layers |   seq_length |   weight_decay |
|-----------------------------+----------+-----------------------+--------------+---------+----------+---------+------------+--------------+--------------+--------------+----------------|
| train_eval_lstm_fb3

Trial name,date,done,hostname,iterations_since_restore,loss,node_ip,pid,should_checkpoint,time_since_restore,time_this_iter_s,time_total_s,timestamp,training_iteration,trial_id
train_eval_lstm_fb35d_00000,2023-05-01_14-23-11,False,robotics20-Z690-UD,5.0,6.7170090675354,132.66.210.174,161877,True,245.9320294857025,47.35796213150024,245.9320294857025,1682940191,5.0,fb35d_00000
train_eval_lstm_fb35d_00001,2023-05-01_14-23-13,,robotics20-Z690-UD,,,132.66.210.174,162361,,,,,1682940193,,fb35d_00001
train_eval_lstm_fb35d_00002,2023-05-01_14-23-21,,robotics20-Z690-UD,,,132.66.210.174,162439,,,,,1682940201,,fb35d_00002
train_eval_lstm_fb35d_00003,2023-05-01_14-41-35,False,robotics20-Z690-UD,2.0,6.670569896697998,132.66.210.174,162525,True,1087.6705679893494,540.2288727760315,1087.6705679893494,1682941295,2.0,fb35d_00003


[2m[36m(func pid=161877)[0m 1
== Status ==
Current time: 2023-05-01 14:20:03 (running for 00:00:59.11)
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 16.000: None | Iter 8.000: None | Iter 4.000: None | Iter 2.000: None | Iter 1.000: -6.721224784851074
Logical resource usage: 20.0/24 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:G)
Result logdir: /home/robotics20/ray_results/train_eval_lstm_2023-05-01_14-19-04
Number of trials: 5/5 (4 PENDING, 1 RUNNING)
+-----------------------------+----------+-----------------------+--------------+---------+----------+---------+------------+--------------+--------------+--------------+----------------+---------+
| Trial name                  | status   | loc                   |   batch_size |   epoch |    gamma |      lr |   momentum |   num_hidden |   num_layers |   seq_length |   weight_decay |    loss |
|-----------------------------+----------+-----------------------+--------------+---------+----------+---------+------------+--------------+--

== Status ==
Current time: 2023-05-01 14:23:25 (running for 00:04:21.04)
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 16.000: None | Iter 8.000: None | Iter 4.000: -6.717816352844238 | Iter 2.000: -6.719817161560059 | Iter 1.000: -6.721224784851074
Logical resource usage: 20.0/24 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:G)
Result logdir: /home/robotics20/ray_results/train_eval_lstm_2023-05-01_14-19-04
Number of trials: 5/5 (2 ERROR, 2 PENDING, 1 RUNNING)
+-----------------------------+----------+-----------------------+--------------+---------+----------+---------+------------+--------------+--------------+--------------+----------------+---------+
| Trial name                  | status   | loc                   |   batch_size |   epoch |    gamma |      lr |   momentum |   num_hidden |   num_layers |   seq_length |   weight_decay |    loss |
|-----------------------------+----------+-----------------------+--------------+---------+----------+---------+------------+-------------

== Status ==
Current time: 2023-05-01 14:23:31 (running for 00:04:27.04)
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 16.000: None | Iter 8.000: None | Iter 4.000: -6.717816352844238 | Iter 2.000: -6.719817161560059 | Iter 1.000: -6.721224784851074
Logical resource usage: 20.0/24 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:G)
Result logdir: /home/robotics20/ray_results/train_eval_lstm_2023-05-01_14-19-04
Number of trials: 5/5 (3 ERROR, 1 PENDING, 1 RUNNING)
+-----------------------------+----------+-----------------------+--------------+---------+----------+---------+------------+--------------+--------------+--------------+----------------+---------+
| Trial name                  | status   | loc                   |   batch_size |   epoch |    gamma |      lr |   momentum |   num_hidden |   num_layers |   seq_length |   weight_decay |    loss |
|-----------------------------+----------+-----------------------+--------------+---------+----------+---------+------------+-------------