In [181]:

import os
import sys

# Import data cleaning libraries
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error


# Import machine learning libraries
import torch
from torch import nn
from sklearn.model_selection import train_test_split

# Import data visualisation libraries
import matplotlib.pyplot as plt

# Import warning libraries
import warnings
warnings.filterwarnings("ignore")

# Project root: set this to your own checkout of the repository.
# Defined once so the working directory and the import path cannot drift apart.
PROJECT_ROOT = '/home/shaw/Documents/GitHub/crop-yield-estimate/'
PIPELINE_DIR = os.path.join(PROJECT_ROOT, 'pipeline')

# Run from the project root so the relative data paths below resolve.
os.chdir(PROJECT_ROOT)

# Make the project's `preprocessing` package importable. The membership check
# keeps sys.path from growing every time this cell is re-run.
if PIPELINE_DIR not in sys.path:
    sys.path.insert(0, PIPELINE_DIR)

# Import the project's preprocessing stages (found via PIPELINE_DIR)
from preprocessing import dim_reduction
from preprocessing import feature_selection
from preprocessing import scaling
from preprocessing import feature_engineering
from preprocessing import cleaning

# Preprocess data: each stage takes the frame produced by the previous one.
train_path = "data/Train.csv"
test_path = "data/Test.csv"
df = cleaning.clean_data(train_path, test_path)
df = feature_engineering.get_features(df)
df = scaling.scale_features(df)
df = feature_selection.select_features(df)
df = dim_reduction.reduce_dim(df)


# Split data into training and test sets by whether the target is present.
# NOTE(review): presumably the rows without a Yield come from Test.csv -
# confirm against cleaning.clean_data.
has_yield = df['Yield'].notna()

# .copy() gives each subset its own data, so later cells can modify them
# without touching (or warning about) the parent frame.
df_train = df[has_yield].copy()
df_test = df[~has_yield].copy()

In [182]:
# Drop every column that still contains a missing value. Reassigning instead of
# using inplace=True keeps the cell from mutating a frame that was derived from
# `df` by boolean indexing.
# NOTE(review): columns are dropped independently per frame, so df_train and
# df_test can end up with different column sets - confirm this is intended.
df_train = df_train.dropna(axis=1)
df_test = df_test.dropna(axis=1)

In [183]:
# Cast every column of both frames to 32-bit floats in one pass.
df_train, df_test = (
    frame.astype('float32') for frame in (df_train, df_test)
)

In [184]:
# Separate the predictors from the regression target.
# Both yield columns are removed from the feature matrix; only "Yield" is
# used as the target.
excluded_columns = ["Yield", "Yield_per_Acre"]
X = df_train.drop(columns=excluded_columns)
y = df_train["Yield"]

# Convert to float32 tensors for PyTorch
X_data = torch.tensor(X.values, dtype=torch.float32)
y_data = torch.tensor(y.values, dtype=torch.float32)

# Hold out 20% of the labelled rows for evaluation (fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    X_data,
    y_data,
    test_size=0.2,
    random_state=42,
)

In [185]:
# Report the shape of each split as a quick sanity check
for label, tensor in (
    ("X_train size:", X_train),
    ("X_test size:", X_test),
    ("y_train size:", y_train),
    ("y_test size:", y_test),
):
    print(label, tensor.shape)


X_train size: torch.Size([3096, 178])
X_test size: torch.Size([774, 178])
y_train size: torch.Size([3096])
y_test size: torch.Size([774])


In [186]:
class CropYeild(nn.Module):
    """Fully connected regression network with a single output.

    Layer widths are ``in_features -> 64 -> 128 -> 96 -> 32 -> 1``. Each of
    the four hidden layers is followed by ReLU and dropout; the final layer
    is linear.

    Args:
        in_features: Number of input features per sample. Defaults to 178,
            the width this notebook was originally written for; pass
            ``X.shape[1]`` if the preprocessing pipeline changes it.
        dropout: Dropout probability applied after every hidden layer.
    """

    def __init__(self, in_features=178, dropout=0.34):
        super().__init__()
        # Attribute names fc1..fc5 are kept stable so state_dict keys of
        # previously saved weights still match.
        self.fc1 = nn.Linear(in_features, 64)
        self.fc2 = nn.Linear(64, 128)
        self.fc3 = nn.Linear(128, 96)
        self.fc4 = nn.Linear(96, 32)
        self.fc5 = nn.Linear(32, 1)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Map a ``(..., in_features)`` tensor to ``(..., 1)`` predictions."""
        # Hidden stack: linear -> ReLU -> dropout, in layer order.
        for layer in (self.fc1, self.fc2, self.fc3, self.fc4):
            x = self.dropout(self.relu(layer(x)))
        # Linear head: no activation, the output is an unbounded value.
        return self.fc5(x)


In [187]:
# Seed PyTorch's RNG before building the model so that weight initialisation
# (and the dropout masks drawn afterwards) are reproducible on a fresh run.
# 42 matches the random_state used for the train/test split.
torch.manual_seed(42)
model = CropYeild()

In [188]:
# Objective: mean squared error between predictions and targets
criteria = nn.MSELoss()

# Learning rate for Adam - an initial guess that still needs experimenting with
alpha = 0.008
optimizer = torch.optim.Adam(params=model.parameters(), lr=alpha)

In [189]:
# Forward pass over the whole labelled set, purely to check output shapes.
# Gradients are not needed here, so autograd is disabled to avoid building
# (and holding on to) a graph for every row.
with torch.no_grad():
    y_pred = model(X_data)

# The model returns a trailing dimension of size 1 that the targets lack; it
# has to be squeezed away before the two are compared in a loss.
print("y_pred size:", y_pred.shape)
print("y_data size:", y_data.shape)


y_pred size: torch.Size([3870, 1])
y_data size: torch.Size([3870])


In [190]:
# Show the first 50 predictions of the untrained model
first_predictions = y_pred[:50]
print('pred:', first_predictions)

pred: tensor([[ 1.2383e+01],
        [ 3.2352e+00],
        [ 2.1234e+00],
        [-1.4184e+00],
        [ 9.4228e-01],
        [ 1.7749e+00],
        [ 3.4008e+00],
        [ 5.9613e+00],
        [-3.1399e+00],
        [ 1.0380e+01],
        [ 5.2092e+00],
        [-3.1026e+00],
        [ 5.3408e+00],
        [-4.1729e+00],
        [ 5.7953e+00],
        [ 1.0463e+01],
        [ 2.7130e+00],
        [ 5.0424e+00],
        [ 6.8812e-01],
        [ 5.6233e+00],
        [-5.0125e-01],
        [ 1.0304e+00],
        [ 6.6252e+00],
        [-8.7595e-01],
        [ 1.0978e-02],
        [-1.5299e+00],
        [ 1.3786e+00],
        [ 3.4024e+00],
        [ 5.1611e+00],
        [ 6.6624e+00],
        [ 1.1139e+00],
        [ 8.1915e+00],
        [ 2.0676e-01],
        [-2.7609e+00],
        [ 1.4617e+00],
        [ 4.1796e+00],
        [-1.9105e+00],
        [ 4.4911e+00],
        [ 6.3836e-01],
        [ 3.9360e+00],
        [-3.1680e+00],
        [ 5.1237e+00],
        [ 1.0770e+01],
     

In [192]:
# Full-batch training with RMSE (square root of the MSE criterion) as the loss.
epochs = 2000

# Print and record history about 100 times per run. max(1, ...) keeps the
# modulo below from dividing by zero when epochs < 100.
log_every = max(1, epochs // 100)

# Parallel histories: exactly one entry per logged epoch, stored as plain
# floats, so the three lists always have equal length and can be plotted
# against each other.
epoch_count, train_loss, test_loss = [], [], []

for epoch in range(epochs):

    # --- optimisation step on the training split (dropout active) ---
    model.train()

    # squeeze(-1) removes only the trailing output dimension, so the result
    # stays 1-D like the targets even for a batch of size 1.
    y_pred = model(X_train).squeeze(-1)

    loss_value = torch.sqrt(criteria(y_pred, y_train))

    optimizer.zero_grad()

    loss_value.backward()

    optimizer.step()

    # --- evaluation on the held-out split (dropout disabled) ---
    model.eval()

    with torch.inference_mode():

        y_pred_test = model(X_test).squeeze(-1)

        loss_value_test = torch.sqrt(criteria(y_pred_test, y_test))

    if epoch % log_every == 0:
        # The train loss was measured before this epoch's update and with
        # dropout on; the test loss after the update and with dropout off.
        print(f'Epoch: {epoch:4.0f} | Train Loss: {loss_value:.5f}, | Test Loss: {loss_value_test:.5f}')
        epoch_count.append(epoch)
        train_loss.append(loss_value.item())
        test_loss.append(loss_value_test.item())



Epoch:    0 | Train Loss: 764.90906, | Test Loss: 1072.25647
Epoch:   20 | Train Loss: 412.22363, | Test Loss: 804.39618
Epoch:   40 | Train Loss: 375.18805, | Test Loss: 805.98010
Epoch:   60 | Train Loss: 371.41504, | Test Loss: 807.50635
Epoch:   80 | Train Loss: 366.87921, | Test Loss: 815.73602
Epoch:  100 | Train Loss: 367.99585, | Test Loss: 819.67157
Epoch:  120 | Train Loss: 364.01865, | Test Loss: 817.71521
Epoch:  140 | Train Loss: 356.12604, | Test Loss: 808.17993
Epoch:  160 | Train Loss: 367.42111, | Test Loss: 836.49323
Epoch:  180 | Train Loss: 360.25235, | Test Loss: 830.50262
Epoch:  200 | Train Loss: 353.94003, | Test Loss: 829.94159
Epoch:  220 | Train Loss: 352.80417, | Test Loss: 830.16235
Epoch:  240 | Train Loss: 347.90887, | Test Loss: 827.15332
Epoch:  260 | Train Loss: 350.21796, | Test Loss: 845.01276
Epoch:  280 | Train Loss: 352.09573, | Test Loss: 827.91931
Epoch:  300 | Train Loss: 335.41864, | Test Loss: 829.99115
Epoch:  320 | Train Loss: 346.82178, | 