In [60]:
from os import PathLike
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

In [38]:
torch.set_default_dtype(torch.float32)

In [13]:
train_csv = pd.read_csv("data/train.csv")
train_csv

Unnamed: 0,id,clonesize,honeybee,bumbles,andrena,osmia,MaxOfUpperTRange,MinOfUpperTRange,AverageOfUpperTRange,MaxOfLowerTRange,MinOfLowerTRange,AverageOfLowerTRange,RainingDays,AverageRainingDays,fruitset,fruitmass,seeds,yield
0,0,25.0,0.50,0.25,0.75,0.50,69.7,42.1,58.2,50.2,24.3,41.2,24.0,0.39,0.425011,0.417545,32.460887,4476.81146
1,1,25.0,0.50,0.25,0.50,0.50,69.7,42.1,58.2,50.2,24.3,41.2,24.0,0.39,0.444908,0.422051,33.858317,5548.12201
2,2,12.5,0.25,0.25,0.63,0.63,86.0,52.0,71.9,62.0,30.0,50.8,24.0,0.39,0.552927,0.470853,38.341781,6869.77760
3,3,12.5,0.25,0.25,0.63,0.50,77.4,46.8,64.7,55.8,27.0,45.8,24.0,0.39,0.565976,0.478137,39.467561,6880.77590
4,4,25.0,0.50,0.25,0.63,0.63,77.4,46.8,64.7,55.8,27.0,45.8,24.0,0.39,0.579677,0.494165,40.484512,7479.93417
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15284,15284,12.5,0.25,0.25,0.38,0.50,77.4,46.8,64.7,55.8,27.0,45.8,16.0,0.26,0.556302,0.476308,40.546480,7667.83619
15285,15285,12.5,0.25,0.25,0.25,0.50,86.0,52.0,71.9,62.0,30.0,50.8,34.0,0.56,0.354413,0.388145,29.467434,3680.56025
15286,15286,25.0,0.50,0.25,0.38,0.75,77.4,46.8,64.7,55.8,27.0,45.8,34.0,0.56,0.422548,0.416786,32.299059,4696.44394
15287,15287,25.0,0.50,0.25,0.63,0.63,69.7,42.1,58.2,50.2,24.3,41.2,24.0,0.39,0.542170,0.434133,36.674243,6772.93347


In [3]:
test_csv = pd.read_csv("data/test.csv")
test_csv

Unnamed: 0,id,clonesize,honeybee,bumbles,andrena,osmia,MaxOfUpperTRange,MinOfUpperTRange,AverageOfUpperTRange,MaxOfLowerTRange,MinOfLowerTRange,AverageOfLowerTRange,RainingDays,AverageRainingDays,fruitset,fruitmass,seeds
0,15289,25.0,0.25,0.25,0.25,0.25,86.0,52.0,71.9,62.0,30.0,50.8,24.0,0.39,0.399367,0.408088,31.394569
1,15290,12.5,0.25,0.25,0.75,0.63,94.6,57.2,79.0,68.2,33.0,55.9,1.0,0.10,0.488048,0.442866,36.846956
2,15291,12.5,0.25,0.25,0.63,0.63,86.0,52.0,71.9,62.0,30.0,50.8,16.0,0.26,0.583379,0.487057,40.037644
3,15292,25.0,0.50,0.38,0.38,0.63,86.0,52.0,71.9,62.0,30.0,50.8,16.0,0.26,0.433014,0.422847,33.116091
4,15293,37.5,0.75,0.25,0.25,0.25,94.6,57.2,79.0,68.2,33.0,55.9,24.0,0.39,0.360996,0.388860,29.558019
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10189,25478,25.0,0.25,0.25,0.25,0.25,86.0,52.0,71.9,62.0,30.0,50.8,24.0,0.39,0.474162,0.437923,34.525258
10190,25479,25.0,0.50,0.25,0.50,0.75,77.4,46.8,64.7,55.8,27.0,45.8,16.0,0.26,0.482854,0.440676,35.648221
10191,25480,25.0,0.50,0.38,0.50,0.50,77.4,46.8,64.7,55.8,27.0,45.8,16.0,0.26,0.568854,0.463065,37.724724
10192,25481,12.5,0.25,0.25,0.38,0.50,94.6,57.2,79.0,68.2,33.0,55.9,34.0,0.56,0.407374,0.409261,31.881847


In [84]:
class BlueberryDataset(Dataset):
    def __init__(self, csv_file: PathLike, test: bool = False):
        csv_data = pd.read_csv(f"data/{csv_file}")
        csv_data = csv_data[:len(csv_data) // 5] if test else csv_data[len(csv_data)//5:]
        csv_data = csv_data.drop(columns="id")
        self.labels = csv_data.pop("yield").to_numpy()
        self.data = csv_data.to_numpy()

    def __len__(self) -> int:
        return len(self.data)
    
    def __getitem__(self, idx: int) -> tuple:
        return (torch.tensor(self.data[idx], dtype=torch.float32), torch.tensor(self.labels[idx], dtype=torch.float32))

In [90]:
train_dataset = BlueberryDataset("train.csv")
test_dataset = BlueberryDataset("train.csv", test=True)
print((len(train_dataset)), len(test_dataset))
train_loader = DataLoader(train_dataset, batch_size=512, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False)

12232 3057


In [91]:
def create_layer_sequence(layer_dimensions: list[tuple[int]]) -> list[nn.Module]:
    layers = []
    for i in range(len(layer_dimensions)-1):
        layers += [
            nn.Linear(layer_dimensions[i], layer_dimensions[i+1]),
            nn.ReLU(),
            nn.Dropout(),
        ] if i < len(layer_dimensions) - 2 else [
            nn.Linear(layer_dimensions[i], layer_dimensions[i+1])
        ]
    return layers

In [93]:
model = torch.nn.Sequential(
    *create_layer_sequence([
        16,
        10,
        20,
        5,
        1
    ])
)

model

Sequential(
  (0): Linear(in_features=16, out_features=10, bias=True)
  (1): ReLU()
  (2): Dropout(p=0.5, inplace=False)
  (3): Linear(in_features=10, out_features=20, bias=True)
  (4): ReLU()
  (5): Dropout(p=0.5, inplace=False)
  (6): Linear(in_features=20, out_features=5, bias=True)
  (7): ReLU()
  (8): Dropout(p=0.5, inplace=False)
  (9): Linear(in_features=5, out_features=1, bias=True)
)

In [94]:
epochs = 1000
criterion = nn.L1Loss()
optimizer = torch.optim.Adam(params=model.parameters())


for epoch in range(epochs):
    train_losses = []
    for (samples, labels) in train_loader:
        labels = labels.to(dtype=torch.float32)
        samples = samples.to(dtype=torch.float32)
        optimizer.zero_grad()
        out = model(samples)
        loss = criterion(out, labels)
        print(loss)
        loss.backward()
        optimizer.step()


  return F.l1_loss(input, target, reduction=self.reduction)


tensor(6102.9385, grad_fn=<MeanBackward0>)
tensor(5958.7031, grad_fn=<MeanBackward0>)
tensor(6066.7124, grad_fn=<MeanBackward0>)
tensor(5985.6328, grad_fn=<MeanBackward0>)
tensor(6210.6963, grad_fn=<MeanBackward0>)
tensor(5935.1250, grad_fn=<MeanBackward0>)
tensor(6077.3042, grad_fn=<MeanBackward0>)
tensor(5991.6636, grad_fn=<MeanBackward0>)
tensor(6033.8604, grad_fn=<MeanBackward0>)
tensor(6016.7310, grad_fn=<MeanBackward0>)
tensor(6002.6689, grad_fn=<MeanBackward0>)
tensor(6028.0747, grad_fn=<MeanBackward0>)
tensor(5945.2148, grad_fn=<MeanBackward0>)
tensor(5968.4307, grad_fn=<MeanBackward0>)
tensor(5880.7871, grad_fn=<MeanBackward0>)
tensor(6107.9775, grad_fn=<MeanBackward0>)
tensor(6119.6406, grad_fn=<MeanBackward0>)
tensor(5956.5088, grad_fn=<MeanBackward0>)
tensor(6017.3184, grad_fn=<MeanBackward0>)
tensor(6061.5117, grad_fn=<MeanBackward0>)
tensor(6068.6934, grad_fn=<MeanBackward0>)
tensor(6061.9121, grad_fn=<MeanBackward0>)
tensor(5947.8940, grad_fn=<MeanBackward0>)
tensor(5989

  return F.l1_loss(input, target, reduction=self.reduction)


tensor(6105.6821, grad_fn=<MeanBackward0>)
tensor(6034.8721, grad_fn=<MeanBackward0>)
tensor(5911.4194, grad_fn=<MeanBackward0>)
tensor(6001.0298, grad_fn=<MeanBackward0>)
tensor(6087.5430, grad_fn=<MeanBackward0>)
tensor(6101.2866, grad_fn=<MeanBackward0>)
tensor(6026.0283, grad_fn=<MeanBackward0>)
tensor(6096.0483, grad_fn=<MeanBackward0>)
tensor(6104.2666, grad_fn=<MeanBackward0>)
tensor(5944.7388, grad_fn=<MeanBackward0>)
tensor(6108.6348, grad_fn=<MeanBackward0>)
tensor(5970.6777, grad_fn=<MeanBackward0>)
tensor(6039.6587, grad_fn=<MeanBackward0>)
tensor(6040.9678, grad_fn=<MeanBackward0>)
tensor(5928.7466, grad_fn=<MeanBackward0>)
tensor(6014.8384, grad_fn=<MeanBackward0>)
tensor(5987.3716, grad_fn=<MeanBackward0>)
tensor(5879.5913, grad_fn=<MeanBackward0>)
tensor(6087.5034, grad_fn=<MeanBackward0>)
tensor(6018.5586, grad_fn=<MeanBackward0>)
tensor(6051.6279, grad_fn=<MeanBackward0>)
tensor(5983.8398, grad_fn=<MeanBackward0>)
tensor(6045.1138, grad_fn=<MeanBackward0>)
tensor(6074