In [1]:
%load_ext autoreload
%autoreload 2
import sys

# instead of creating a package using setup.py or building from a docker/singularity file,
# import the sister directory of src code to be called on in notebook.
# This keeps the notebook free from code to only hold visualizations and is easier to test
# It also helps keep the state of variables clean such that cells aren't run out of order with a mysterious state
sys.path.append("..")

In [2]:
'''
Code taken from: 

https://towardsdatascience.com/how-to-train-a-regression-model-using-a-random-forest-c1cf16288f6b

https://medium.com/@pratyush.sinha/training-random-forest-by-back-propagation-for-fun-pytorch-part-1-a54674355aa7
'''

'\nCode taken from: \n\nhttps://towardsdatascience.com/how-to-train-a-regression-model-using-a-random-forest-c1cf16288f6b\n\nhttps://medium.com/@pratyush.sinha/training-random-forest-by-back-propagation-for-fun-pytorch-part-1-a54674355aa7\n'

In [3]:
import torch
from torch.autograd import Variable
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
import torch.nn as nn
import torch.optim as optim


from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor

from data import create_data_for_lstm
import numpy as np
import pandas as pd 



In [4]:
print("Am I using GPUS ???", torch.cuda.is_available())
print("Number of gpus: ", torch.cuda.device_count())

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
torch.cuda.set_device(device)
print(device)
torch.manual_seed(101)

Am I using GPUS ??? True
Number of gpus:  1
cuda:0


<torch._C.Generator at 0x7f4b52c744d0>

In [5]:
# create LSTM Model
class SequenceDataset(Dataset):
    def __init__(
        self,
        dataframe,
        target,
        features,
        stations,
        sequence_length,
        forecast_hr,
        device,
    ):
        self.dataframe = dataframe
        self.features = features
        self.target = target
        self.sequence_length = sequence_length
        self.stations = stations
        self.forecast_hr = forecast_hr
        self.device = device
        self.y = torch.tensor(dataframe[target].values).float().to(device)
        self.X = torch.tensor(dataframe[features].values).float().to(device)

    def __len__(self):
        return self.X.shape[0]

    def __getitem__(self, i):
        if i >= self.sequence_length - 1:
            i_start = i - self.sequence_length + 1
            x = self.X[i_start : (i + 1), :]
            x[: self.forecast_hr, -int(len(self.stations) * 15) :] = x[
                self.forecast_hr + 1, -int(len(self.stations) * 15) :
            ]
        else:
            padding = self.X[0].repeat(self.sequence_length - i - 1, 1)
            x = self.X[0 : (i + 1), :]
            x = torch.cat((padding, x), 0)
        
        # x = x.reshape(X.shape[1]*self.sequence_length, 784)
        return x, self.y[i]

In [6]:
station = 'BKLN'
fh = 4
sequence_length = 120
batch_size = int(20e2)
df_train, df_test, features, forecast_lead, stations, target = create_data_for_lstm.create_data_for_model(station, fh)

Targeting Error for BKLN
-- loading data from NYSM --
-- loading data from HRRR --
now = 2024-01-19 19:23:56.695599
Test Set Fraction 0.20000445394619634
Data Processed


In [7]:
train_dataset = SequenceDataset(
    df_train,
    target=target,
    features=features,
    stations=stations,
    sequence_length=sequence_length,
    forecast_hr=fh,
    device=device,
)
test_dataset = SequenceDataset(
    df_test,
    target=target,
    features=features,
    stations=stations,
    sequence_length=sequence_length,
    forecast_hr=fh,
    device=device,
)

In [8]:
train_kwargs = {"batch_size": batch_size, "pin_memory": False, "shuffle": True}
test_kwargs = {"batch_size": batch_size, "pin_memory": False, "shuffle": False}

train_loader = torch.utils.data.DataLoader(train_dataset, **train_kwargs)
test_loader = torch.utils.data.DataLoader(test_dataset, **test_kwargs)

In [9]:
class feature_selection_node(nn.Module):
    def __init__(self, number_of_trees, batch_size, device):
        super(feature_selection_node, self).__init__()
        self.number_of_trees = number_of_trees
        self.attention_mask = torch.nn.Parameter(data=torch.Tensor(number_of_trees, 1000), requires_grad=True,)
        self.attention_mask.data.uniform_(-1.0,1.0)
        self.batch = batch_size
        self.device = device

    def forward(self, x):
        x.to(self.device)
        x = x.view(-1,(x.shape[1]*x.shape[2]))
        attention_tmp = torch.sigmoid(self.attention_mask).to(self.device)
        #scatter mask by only keeping top 200 vals and reset rest to 0
        topk, idx = torch.topk(attention_tmp, k=200, dim=-1)
        topk.to(self.device)
        idx.to(self.device)
        attention = torch.zeros(self.number_of_trees, 16080).to(self.device)
        attention.scatter_(-1,idx, topk)
        return_value=torch.zeros(self.batch,self.number_of_trees, 16080)
        print(x.shape)
        print(topk.shape)
        print(idx.shape)
        for mask_index in range(0,self.number_of_trees):
            return_value[:,mask_index,:] = x*attention[mask_index]
        return return_value, attention


In [10]:
class decision_node(nn.Module):
    def __init__(self, number_of_trees, max_num_of_leaf_nodes, classes, batch, device):
        super(decision_node, self).__init__()
        self.leaf = max_num_of_leaf_nodes
        self.tree = number_of_trees
        self.classes = classes
        self.batch = batch
        self.symbolic_path_weights = nn.Linear(16080,max_num_of_leaf_nodes, bias=True)
        
        self.hardtanh = nn.Hardtanh()
        self.softmax = nn.Softmax(dim=-1)
        self.contribution = torch.nn.Parameter(data=torch.Tensor(number_of_trees,max_num_of_leaf_nodes, classes), requires_grad=True)
        self.contribution.data.uniform_(-1.0, 1.0)
        self.device = device
        #define trainable params here

    def forward(self, x):
        x.to(self.device)
        # use trainable params to define compuatations here
        class_value = torch.randn(self.batch,self.tree, self.leaf, self.classes)
        symbolic_paths = self.hardtanh(self.symbolic_path_weights(x))

        for tree_index in range(0,self.tree):
            for decision_index in range(0, self.leaf):
                class_value[:,tree_index, decision_index,:]=torch.mm(symbolic_paths[:,tree_index, decision_index].view(-1,1),self.contribution[tree_index, decision_index].view(1,-1))
        class_value=self.softmax(class_value)
        class_value = 1.0-class_value*class_value
        class_value = class_value.sum(dim=-1)
        return symbolic_paths, class_value


In [11]:
def frequency(d):
    dic={}

    for item in d:
        if item in dic.keys():
            dic[item] = dic[item]+1
        else:
            dic[item]=1
    
    dic = {"value":dic.keys(), "count":dic.values()}
    df = pd.DataFrame.from_dict(dic, orient='index').transpose().sort_values(['value'])
    df['cum'] = df['count']/df['count'].sum()
    value = df['cum'].values
    value = torch.from_numpy(value).float()
    value = 1-value*value
    value = value.sum(-1)
    return value

In [12]:
def train_model(epoch, device):
    print("Hello World!")
    print(device)
    mask.train()
    decision.train()
    flag = torch.ones(1000, 100, 200)
    flag = flag.to(device)

    for batch_idx, (data, target) in enumerate(train_loader):
        print(batch_idx)
        optimizer.zero_grad()
        masked_output, attention = mask(data)
        decision_output, weights = decision(masked_output)
        weights_numpy = weights.detach().cpu().numpy()
        weights_numpy=np.roll(weights_numpy,1,axis=-1)
        weights_numpy[:,:,0] = frequency(target.cpu().numpy())
        print("check")
        weights_output=torch.from_numpy(weights_numpy).float()
        weights_output = weights_output.to(device)
        weights = weights.to(device)
        decision_output = decision_output.to(device)
        target = target.to(device)
        print('check0')
        print(weights_output.is_cuda)
        print(weights.is_cuda)
        print(flag.is_cuda)
        print(data.is_cuda)
        loss = torch.nn.MarginRankingLoss(margin=1e-7)(weights_output, weights,flag)
        print("check1")
        loss.backward()
        print("check2")
        optimizer.step()
        print("check3")

        if batch_idx % log_interval == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(epoch, batch_idx*len(data), len(train_loader.dataset), 100.*batch_idx/len(train_loader), loss.item()))
            train_loss.append(loss.item())
            train_counter.append((batch_idx*batch_size_train) + ((epoch-1)*len(train_loader.dataset)))

In [13]:
epochs = 5
log_interval = 10
train_loss = []
train_counter = []
test_loss = []
test_counter = [i*len(train_loader.dataset) for i in range(epochs+1)]

In [14]:
mask = feature_selection_node(100, batch_size, device)
decision = decision_node(100,200,10,batch_size, device)
params = list(mask.parameters()) + list(decision.parameters())
optimizer = optim.SGD(params, lr=1e-3, momentum=0.5)

In [15]:
for epoch in range(1, epochs+1):
    print(epoch)
    train_model(epoch, device)

1
Hello World!
cuda:0
0


: 

: 

In [None]:
# Create an instance of RandomForestClassifier
#Create an object (model)
rf = RandomForestRegressor(n_estimators=100, max_depth=12, min_samples_split=13, random_state=101)

In [None]:
df_train

In [None]:
X = df_train.iloc[:,1:].values
Y = df_train.iloc[:,0].values

In [None]:
X.shape

In [None]:
Y.shape

In [None]:
# Train the classifier on the training data
rf.fit(X, Y)

In [None]:
# from sklearn.tree import export_graphviz
# import graphviz

# dot_data = export_graphviz(rf.estimators_[0], feature_names=features,  
#                            filled=True, rounded=True)  

# graph = graphviz.Source(dot_data, format='png')
# graph.render("tree") 

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.10,
                                                    random_state=2,
                                                    shuffle=True)

from sklearn.model_selection import RandomizedSearchCV

model = RandomForestRegressor()

rs = RandomizedSearchCV(model, n_iter=10,
                        param_distributions = {'max_depth': range(1, 15),
                                               'min_samples_split': range(2, 50)},
                        cv=5, n_jobs=-1, random_state=3,
                        scoring='neg_mean_squared_error')

rs.fit(X_train, y_train)
print(rs.best_params_)
print(-rs.best_score_)

In [None]:
'{'min_samples_split': 13, 'max_depth': 12}
0.17789285797670434'