In [16]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, TensorDataset
from sklearn.model_selection import train_test_split, RandomizedSearchCV
import random
import time
from tqdm import tqdm
from matplotlib import pyplot as plt

In [17]:
df = pd.read_csv("../data/data.csv")
df.head(7)

Unnamed: 0,weekday_name,month,leap_year_condition,decade,output,output_year_digit,output_year,valid_years_list,valid_day_list,decade4,decade100,decade400,valid_group_days_index
0,2,1,0,180,1-1-1800,0,1800,"[0, 1, 2, 3, 5, 6, 7, 9]","[1, 8, 15, 22, 29]",0,0,1,7
1,3,1,0,180,1-1-1801,1,1801,"[0, 1, 2, 3, 5, 6, 7, 9]","[1, 8, 15, 22, 29]",0,0,1,7
2,4,1,0,180,1-1-1802,2,1802,"[0, 1, 2, 3, 5, 6, 7, 9]","[1, 8, 15, 22, 29]",0,0,1,7
3,5,1,0,180,1-1-1803,3,1803,"[0, 1, 2, 3, 5, 6, 7, 9]","[1, 8, 15, 22, 29]",0,0,1,7
4,6,1,1,180,1-1-1804,4,1804,"[4, 8, 4, 4, 4, 4, 4, 4]","[1, 8, 15, 22, 29]",0,0,1,7
5,1,1,0,180,1-1-1805,5,1805,"[0, 1, 2, 3, 5, 6, 7, 9]","[1, 8, 15, 22, 29]",0,0,1,7
6,2,1,0,180,1-1-1806,6,1806,"[0, 1, 2, 3, 5, 6, 7, 9]","[1, 8, 15, 22, 29]",0,0,1,7


In [36]:
df.valid_group_days_index.value_counts()

7    22540
8    22055
3    19248
4    19248
5    19248
6    19248
9    14035
2     8020
1     1604
0     1216
Name: valid_group_days_index, dtype: int64

In [41]:
df.output_year.nunique()

401

In [19]:
x_day = df[["weekday_name", "month", "output_year", "leap_year_condition", "valid_group_days_index"]]
y_day = x_day.pop("valid_group_days_index") # from 0 to 9
# v = pd.DataFrame([i for i in df["valid_day_list"]])

**B- training for getting the day**

In [20]:
# fix random number generation aka regenerate the same random numbers every time (such as weight and bias initialization )
def set_random_seed(seed=7, deterministic=True):
    """Set random seed, for python, numpy, pytorch

    Args:
        seed (int): Seed to be used.
        deterministic (bool): Whether to set the deterministic option for
            CUDNN backend, i.e., set `torch.backends.cudnn.deterministic`
            to True and `torch.backends.cudnn.benchmark` to False.
            Default: True.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    if deterministic:
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False 
seed=7        
set_random_seed(seed=seed)

In [43]:
batch_size = 512
xtrain_day, xtest_day, ytrain_day, ytest_day = train_test_split(x_day, y_day, test_size=0.19, shuffle=True, random_state=seed)

In [44]:
class CollectedData(Dataset):
    def __init__(self, x, y):
        self.data = torch.tensor(x.values.astype(np.float32))
        self.label = torch.tensor(y.values)
        self.n_smpl = x.shape[0]
        
        
    def __getitem__(self, idx):
        return self.data[idx], self.label[idx]
    
    def __len__(self):
        return self.n_smpl    
    
train_set_day = CollectedData(xtrain_day, ytrain_day)
test_set_day = CollectedData(xtest_day, ytest_day)    

In [45]:
# # this isn't suitable here, so that i used train_test_split function
# # train_set, test_set = torch.utils.data.random_split(dataset, [len(dataset)-test_len, int(len(dataset)*0.2)])

# train_labels_day = torch.tensor(ytrain_day.values.astype(np.float32)) 
# test_labels_day = torch.tensor(ytest_day.values.astype(np.float32)) 
# train_input_day = torch.tensor(xtrain_day.values.astype(np.float32)) 
# test_input_day = torch.tensor(xtest_day.values.astype(np.float32)) 

# train_set_day = TensorDataset(train_input_day, train_labels_day)
# test_set_day = TensorDataset(test_input_day, test_labels_day)


In [46]:
train_loader_day = DataLoader(dataset=train_set_day, shuffle=True, batch_size=batch_size)
test_loader_day = DataLoader(dataset=test_set_day, batch_size=batch_size) 

In [62]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class network(nn.Module):
    def __init__(self, in_features=4, out_features=10):
        super().__init__()
        self.fc1 = nn.Linear(in_features, 168)
        self.fc2 = nn.Linear(168, 401)
        self.fc3 = nn.Linear(401, 168)
        self.fc4 = nn.Linear(168, 33)
        # self.fc5 = nn.Linear(11, 11)
        self.fc6 = nn.Linear(33, out_features) 
        self.initialize_weights()
        
        
    def forward(self, inpt):
        out = F.leaky_relu(self.fc1(inpt))
        out = F.leaky_relu(self.fc2(out))
        out = F.leaky_relu(self.fc3(out))
        out = F.tanh(self.fc4(out))
        # out = F.leaky_relu(self.fc5(out))
        out = ((self.fc6(out)))
        return out
    
    def initialize_weights(self):
        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.kaiming_uniform_(m.weight)
                nn.init.constant_(m.bias, 0)

In [63]:
# calculating accuracy
@torch.no_grad()
def calculate_accuracy(model, data_loader=train_loader_day):
    model.eval()
    
    num_correct = 0
    num_samples = 0

    for data, labels in data_loader:
        
        # transfering data to cuda
        data = data.to(device=device)
        labels = labels.to(device=device)
                
        preds = model(data)
        # print(preds.argmax(dim=1), labels)
        num_correct += sum(list(preds.argmax(dim=1)==labels))
        num_samples += len(labels)
    accuracy = num_correct/num_samples
    model.train()
    return accuracy  
calculate_accuracy(model, data_loader=test_loader_day)

tensor(0.1312, device='cuda:0')

In [66]:
# initializing the network
model = network(in_features=4, out_features=10).to(device)
lr = 0.5
# loss and optimizer initializing
criterion = nn.CrossEntropyLoss()
# criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
schedular = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.1, patience=35, verbose=True)
model

network(
  (fc1): Linear(in_features=4, out_features=168, bias=True)
  (fc2): Linear(in_features=168, out_features=401, bias=True)
  (fc3): Linear(in_features=401, out_features=168, bias=True)
  (fc4): Linear(in_features=168, out_features=33, bias=True)
  (fc6): Linear(in_features=33, out_features=10, bias=True)
)

In [None]:
num_epochs = 33
def train_model(num_epochs, data_loader=train_loader_day):
    num_batches = len(data_loader)
    print(f"starting learning rate = {lr} \n number of epochs = {num_epochs} \n number of batches = {num_batches} \n")
    # starting training loop epochs
    result_train_acc, result_test_acc = [], []
    for epoch in range(num_epochs):
        start_time = time.time()
        progress = tqdm(enumerate(data_loader), total=num_batches, leave=True)
        for batch_idx, (data, labels) in progress: 

            # convert data to device
            data = data.to(device=device)
            labels = labels.to(device=device)

            # getting prediction and loss
            preds = model(data)
            loss = criterion(preds, labels)

            # back propagation
            optimizer.zero_grad()
            loss.backward()

            # optimization step
            optimizer.step()

            progress.set_description(f"epoch [{1+epoch}/{num_epochs}], loss={loss.item():0.4f}")
            progress.set_postfix()

        train_acc = calculate_accuracy(model, data_loader=train_loader_day)
        test_acc = calculate_accuracy(model, data_loader=test_loader_day)
        schedular.step(test_acc)
        print(f"after {1+epoch} epoch, train_acc = {(train_acc*100):.2f}%, test_acc = {(test_acc*100):.2f}%, time_elapsed = {((time.time()-start_time)/60):.1f} minuts")
        result_train_acc += [train_acc]
        result_test_acc += [test_acc]
    return result_train_acc, result_test_acc
result_train_acc, result_test_acc = train_model(num_epochs, data_loader=train_loader_day)


starting learning rate = 0.5 
 number of epochs = 33 
 number of batches = 232 



epoch [1/33], loss=10.0608: 100%|████████████████████████████████████████████████████| 232/232 [00:06<00:00, 34.37it/s]


after 1 epoch, train_acc = 13.15%, test_acc = 13.12%, time_elapsed = 0.2 minuts


epoch [2/33], loss=2.2641: 100%|█████████████████████████████████████████████████████| 232/232 [00:04<00:00, 49.18it/s]


after 2 epoch, train_acc = 15.10%, test_acc = 14.86%, time_elapsed = 0.2 minuts


epoch [3/33], loss=3.1786: 100%|█████████████████████████████████████████████████████| 232/232 [00:06<00:00, 37.18it/s]


after 3 epoch, train_acc = 15.10%, test_acc = 14.86%, time_elapsed = 0.2 minuts


epoch [4/33], loss=2.2463: 100%|█████████████████████████████████████████████████████| 232/232 [00:03<00:00, 65.38it/s]


after 4 epoch, train_acc = 13.15%, test_acc = 13.12%, time_elapsed = 0.1 minuts


epoch [5/33], loss=3.0687: 100%|█████████████████████████████████████████████████████| 232/232 [00:03<00:00, 67.82it/s]


after 5 epoch, train_acc = 13.17%, test_acc = 13.03%, time_elapsed = 0.2 minuts


epoch [6/33], loss=3.3534: 100%|███████████████████████████████████████████████████| 232/232 [2:41:15<00:00, 41.70s/it]


after 6 epoch, train_acc = 13.10%, test_acc = 13.33%, time_elapsed = 161.4 minuts


epoch [7/33], loss=2.2707: 100%|█████████████████████████████████████████████████████| 232/232 [00:04<00:00, 54.11it/s]


after 7 epoch, train_acc = 13.15%, test_acc = 13.12%, time_elapsed = 0.2 minuts


epoch [8/33], loss=9.9490: 100%|█████████████████████████████████████████████████████| 232/232 [00:06<00:00, 34.37it/s]


after 8 epoch, train_acc = 15.45%, test_acc = 15.14%, time_elapsed = 0.2 minuts


epoch [9/33], loss=2.4136: 100%|█████████████████████████████████████████████████████| 232/232 [00:05<00:00, 39.62it/s]


after 9 epoch, train_acc = 15.10%, test_acc = 14.86%, time_elapsed = 0.2 minuts


epoch [10/33], loss=2.1734: 100%|████████████████████████████████████████████████████| 232/232 [00:03<00:00, 58.86it/s]


after 10 epoch, train_acc = 15.45%, test_acc = 15.14%, time_elapsed = 0.1 minuts


epoch [11/33], loss=3.3623: 100%|████████████████████████████████████████████████████| 232/232 [00:05<00:00, 41.58it/s]


after 11 epoch, train_acc = 15.10%, test_acc = 14.86%, time_elapsed = 0.2 minuts


epoch [12/33], loss=2.2911: 100%|████████████████████████████████████████████████████| 232/232 [00:03<00:00, 65.47it/s]


after 12 epoch, train_acc = 13.10%, test_acc = 13.33%, time_elapsed = 0.1 minuts


epoch [13/33], loss=10.1259: 100%|███████████████████████████████████████████████████| 232/232 [00:05<00:00, 41.39it/s]


after 13 epoch, train_acc = 13.17%, test_acc = 13.03%, time_elapsed = 0.2 minuts


epoch [14/33], loss=2.1630: 100%|████████████████████████████████████████████████████| 232/232 [00:03<00:00, 59.71it/s]


after 14 epoch, train_acc = 15.45%, test_acc = 15.14%, time_elapsed = 0.1 minuts


epoch [15/33], loss=2.2250:  11%|█████▋                                               | 25/232 [00:00<00:06, 31.52it/s]

In [33]:
plt.plot(list(range(num_epochs)), result_train_acc)
plt.title("train accuracy")
plt.xlabel("epochs")
plt.ylabel("accuracy percentage")
plt.show()

NameError: name 'result_train_acc' is not defined

In [40]:
plt.plot(list(range(num_epochs)), result_test_acc)
plt.title("test accuracy")
plt.xlabel("epochs")
plt.ylabel("accuracy percentage")
plt.show()

In [41]:
d_test = xtest_day
dt = d_test.copy(deep=True)
d_test = torch.tensor(d_test.values.astype(np.float32))
preds = model(d_test.to(device=device)).argmax(dim=1).cpu()
dt["predicted_days_group_index"] = preds
dt

In [42]:
days_tensor = torch.tensor(d_test[["weekday_name", "month", "output_year", "leap_year_condition"]].values.astype(np.float32))
preds = model(days_tensor.to(device=device))


In [None]:
torch.save(model, "../data/day_model_saved")
model = torch.load("../data/day_model_saved")

In [None]:
dt.to_csv("../data/day_predections.csv", header=True, index=True)