In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
import torchvision
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

In [2]:
train = pd.read_csv('Train.csv') #input data
print(train.shape)
train.head()

(2977, 4)


Unnamed: 0,Field_ID,Year,Quality,Yield
0,MH2O0YH,2019,3,3.686
1,O9TURWL,2019,2,5.657
2,35AFSDD,2019,3,3.082
3,PM05EG9,2019,2,2.707
4,V7PZBCG,2019,2,2.679


In [3]:
train['Quality'].value_counts()

3    1321
2    1231
1     425
Name: Quality, dtype: int64

In [4]:
#train = train[train.Quality > 1] #drop field with quality 1
#train['Quality'].value_counts()

In [5]:
class CNN(nn.Module):
    def __init__(self):
        super(CNN,self).__init__()
        self.conv1 = nn.Sequential(
            nn.Conv2d(                  
                in_channels=168,  #input size: (168,9,9)
                out_channels=16,#size after conv: (16,9,9)        
                kernel_size=3,
                stride=1,       
                padding=1
            ),
            nn.BatchNorm2d(16), #Standardize the 16 results
            nn.ReLU(), #Activation function
            nn.MaxPool2d(kernel_size=3) # size(168,3,3)
        )
        self.conv2 = nn.Sequential(
            nn.Conv2d(
                in_channels = 16,#(16,3,3) 
                out_channels = 32,#(32,3,3)
                kernel_size = 3,
                stride = 1,
                padding = 1
            ),
            nn.BatchNorm2d(32),#Standardize the 32 results
            nn.ReLU(),
            nn.MaxPool2d(kernel_size = 3)#(32,1,1)
        )
       
        self.drop_out = nn.Dropout() #Prevent overfitting
        
        self.fc1 = nn.Linear(1*1*32,100)
        self.fc2 = nn.Linear(100,1)
        
    
    def forward(self,x):
        x = self.conv1(x)                     
        x = self.conv2(x)   
        x = x.view(x.size(0),-1) 
        out = self.drop_out(x)
        out = self.fc1(x) 
        out = self.fc2(out) 
        return out

In [6]:
cnn = CNN()
print(cnn)

CNN(
  (conv1): Sequential(
    (0): Conv2d(168, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): MaxPool2d(kernel_size=3, stride=3, padding=0, dilation=1, ceil_mode=False)
  )
  (conv2): Sequential(
    (0): Conv2d(16, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): MaxPool2d(kernel_size=3, stride=3, padding=0, dilation=1, ceil_mode=False)
  )
  (drop_out): Dropout(p=0.5, inplace=False)
  (fc1): Linear(in_features=32, out_features=100, bias=True)
  (fc2): Linear(in_features=100, out_features=1, bias=True)
)


In [7]:
training_data = np.load("image_arrays_train/BKWGKCN.npy")
x_train = training_data.view('int32')

print(x_train.dtype)

int32


In [8]:
def get_image_center(fid, folder='image_arrays_train'):
    fn = f'{folder}/{fid}.npy'
    arr = np.load(fn) # Loading the data with numpy  
    
    arr1 =[]
    for i in range(len(arr)):
        if(i%30 <14):         #choose first 14 bands
        # if(i%30==1 or i%30 ==2 or i%30==3):      #choose RGB bands
            arr1.append(arr[i])     
    arr2 = np.array(arr1)  #turn list to nparray
    arr_center = arr2[:,17:26,17:26] #9 * 9matrix
    return arr_center.view('int32')

#looading train data and target
X_train = [get_image_center(fid) for fid in train['Field_ID'].values]
y_train = [val for val in train['Yield'].values]

In [9]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size = 0.1, random_state= 42) #split train data

In [10]:
X = torch.Tensor([each for each in X_train]) # convert data to tensor 
y = torch.Tensor(y_train)
X_test = torch.Tensor([each for each in X_test])
y_test = torch.Tensor(y_test)

In [11]:
print(X[0].shape,y[0])

torch.Size([168, 9, 9]) tensor(1.0438)


In [12]:
inputs = Variable(X)
outputs = Variable(y)
input_test= Variable(X_test)
output_test = Variable(y_test)

In [13]:
EPOCH = 100
LR = 0.01

loss_func = nn.MSELoss() # this is for regression mean squared loss
optimizer = torch.optim.Adam(cnn.parameters(), lr = LR)

#simple cnn training
for epoch in range(EPOCH): 
    pred_y = cnn(inputs)
    loss = loss_func(pred_y[:,0],outputs)
    optimizer.zero_grad()#Clear the last gradient
    loss.backward()# Backpropagation
    optimizer.step()# Optimizer parameter update
    
    if (epoch+1)%10 == 0: 
        test_output = cnn(input_test)
        print(epoch+1,loss_func(test_output[:,0],output_test))

10 tensor(3.0038, grad_fn=<MseLossBackward>)
20 tensor(2.9087, grad_fn=<MseLossBackward>)
30 tensor(2.8847, grad_fn=<MseLossBackward>)
40 tensor(2.7944, grad_fn=<MseLossBackward>)
50 tensor(2.7220, grad_fn=<MseLossBackward>)
60 tensor(2.6891, grad_fn=<MseLossBackward>)
70 tensor(2.6728, grad_fn=<MseLossBackward>)
80 tensor(2.6448, grad_fn=<MseLossBackward>)
90 tensor(2.6250, grad_fn=<MseLossBackward>)
100 tensor(2.6064, grad_fn=<MseLossBackward>)


In [14]:
def get_image_center(fid, folder='image_arrays_test'):
    fn = f'{folder}/{fid}.npy'
    arr = np.load(fn) # Loading the data with numpy  
    arr1 =[]
    for i in range(len(arr)):
        if(i%30 <14):        #choose first 14 bands
            arr1.append(arr[i])     
    arr2 = np.array(arr1)
    arr_center = arr2[:,17:26,17:26] # 9 * 9matrix
    return arr_center.view('int32')


In [15]:
ss = pd.read_csv('SampleSubmission.csv')
test = [get_image_center(fid) for fid in ss['Field_ID'].values]
X1 = torch.Tensor([each for each in test])
preds = cnn(X1)
preds1 = preds.detach().numpy()
preds2 = preds1.flatten()
print(preds2)
ss['Yield'] = preds2
ss.to_csv('Sub.csv', index=False)
ss.head()

[1.8853983 2.5636697 2.9191582 ... 2.1713548 2.6837459 3.3167365]


Unnamed: 0,Field_ID,Yield
0,E9UZCEA,1.885398
1,1WGGS1Q,2.56367
2,EG2KXE2,2.919158
3,HC3GQXF,2.674145
4,7AK6GFK,2.613728
