Wine quality through hand made simple models.

In [None]:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
# Walk the Kaggle input directory and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        full_path = os.path.join(dirname, filename)
        print(full_path)


In [None]:
train = pd.read_csv('/kaggle/input/red-wine-quality-cortez-et-al-2009/winequality-red.csv')

In [None]:
train.head()

### Let's start with a linear model

In [None]:
import torch

creating the simplest linear model

In [None]:

def model(t_u, w, b):
    """Apply the linear map ``w * t_u + b``.

    Args:
        t_u: Input value(s); anything that supports ``*`` and ``+``
            (numbers, tensors, arrays).
        w: Weight multiplied with the input.
        b: Bias added to the scaled input.

    Returns:
        The result of ``w * t_u + b``.
    """
    scaled = w * t_u
    prediction = scaled + b
    return prediction

initialising w and b

In [None]:

params = torch.tensor([1.0, 0.0], requires_grad=True)

lets get the tensors out of training data

### getting our data

In [None]:

t_u = train.drop('quality', axis=1)
t_c = train.quality

these are our x'es - input

In [None]:
t_u

`t_c` is our target

In [None]:
t_c 

### preprocessing

The `describe` function helps us learn more about our dataset. Look at the `mean` row: we want the means of all columns to be in a similar range.

In [None]:
t_u.describe()

We see that the column means are not in the same range, so we need to normalise the data first.

Using standardisation (z-score normalisation): subtract the column mean and divide by the column standard deviation.

In [None]:
t_un =(t_u-t_u.mean())/t_u.std()

In [None]:
t_un

In [None]:
t_un.describe()

**Using min-max normalisation**

In [None]:
t_un2 = (t_u-t_u.min())/(t_u.max()-t_u.min())

In [None]:
t_un2.describe()

In [None]:
t_un2

I personally prefer min-max normalisation.
Note that we are not normalising `t_c`. (Should we?)
 

### creating a validation set

we need to do this before we get into the training loop

In [None]:
t_un2 = torch.tensor(t_un2.values)
t_c = torch.tensor(t_c.values)

In [None]:
# Hold out 20% of the rows (rounded down) as a validation set.
n_samples = t_un2.shape[0]
n_val = int(0.2 * n_samples)
# Split at an explicit boundary rather than with negative slices: if n_val
# were 0, `[:-n_val]` would be `[:0]` (empty training set) and `[-n_val:]`
# would select every row for validation.
n_train = n_samples - n_val

# Random permutation of the row indices so the split is not order dependent.
shuffled_indices = torch.randperm(n_samples)

train_indices = shuffled_indices[:n_train]
val_indices = shuffled_indices[n_train:]

train_indices[:5], val_indices[:5]

In [None]:
t_u_train = t_un2[train_indices]
t_c_train = t_c[train_indices]

t_u_val = t_un2[val_indices]
t_c_val = t_c[val_indices]

### creating tensors

Next step is to convert `t_un2` and `t_c` into tensors of right format to be used as input to the linear model.


In [None]:

# The feature tensors are already 2-D (one row per sample, one column per
# feature), so they are left untouched. Inserting an extra axis into them made
# the model output 3-D, and MSELoss then broadcast it against the 2-D targets,
# comparing every prediction with every target instead of only its own.
# Only the 1-D targets need a trailing axis so that they become one column.
t_c_train = t_c_train.unsqueeze(1)
t_c_val = t_c_val.unsqueeze(1)

lets ensure all of the tensors are float now.

In [None]:
# Cast every feature and target tensor to torch.FloatTensor, one at a time.
t_u_train = t_u_train.type(torch.FloatTensor)
t_u_val = t_u_val.type(torch.FloatTensor)
t_c_train = t_c_train.type(torch.FloatTensor)
t_c_val = t_c_val.type(torch.FloatTensor)

In [None]:
t_u_train.shape, t_u_val.shape

In [None]:
t_c_train.shape, t_c_val.shape

One last thing: the input tensors and the target tensors have different shapes. PyTorch usually emits a broadcasting warning in this situation, but let's go ahead and use the tensors as they are.

So our preprocessing of tensors is complete.

### Now comes the training loop.

In [None]:
params = torch.tensor([1.0, 0.0], requires_grad=True)
learning_rate = 1e-2
optimizer = torch.optim.SGD([params], lr=learning_rate)

In [None]:
def training_loop(
    n_epochs=3000,
    optimizer=optimizer,
    params=params,
    train_t_u=t_u_train,
    val_t_u=t_u_val,
    train_t_c=t_c_train,
    val_t_c=t_c_val,
    loss_fn=torch.nn.MSELoss(),
    model=model,
):
    """Fit the linear model's parameters by gradient descent.

    The default arguments are evaluated once, when this ``def`` is executed,
    so they capture whatever the notebook globals hold at that moment.

    Args:
        n_epochs: Number of full-batch optimisation steps.
        optimizer: Optimiser that updates ``params``.
        params: Tensor unpacked as the extra arguments of ``model``.
        train_t_u: Training inputs.
        val_t_u: Validation inputs.
        train_t_c: Training targets.
        val_t_c: Validation targets.
        loss_fn: Callable ``loss_fn(prediction, target)`` returning a scalar.
        model: Callable invoked as ``model(inputs, *params)``.

    Returns:
        ``params`` after the last optimisation step.
    """
    for epoch in range(1, n_epochs + 1):
        train_t_p = model(train_t_u, *params)
        train_loss = loss_fn(train_t_p, train_t_c)

        # The validation loss is only reported, never back-propagated, so
        # there is no need to record an autograd graph for it.
        with torch.no_grad():
            val_t_p = model(val_t_u, *params)
            val_loss = loss_fn(val_t_p, val_t_c)

        optimizer.zero_grad()
        train_loss.backward()
        optimizer.step()

        # Report the first three epochs and then every 500th one.
        if epoch <= 3 or epoch % 500 == 0:
            print("Epoch: ", epoch, " Training loss: ", train_loss.item(), " Val loss ", val_loss.item())
    return params
    

In [None]:
optimal_params = training_loop()
%time

Training does take a lot of time. Let's try a neural network next.

In [None]:
seq_model = torch.nn.Sequential(
    torch.nn.Linear(11,13),
    torch.nn.Tanh(),
    torch.nn.Linear(13,1),
)

seq_model

lets look at the parameters

In [None]:
[param.shape for param in seq_model.parameters()]

Let's check the shapes of the training and validation tensors first.

In [None]:
t_u_train.shape, t_c_train.shape, t_u_val.shape, t_c_val.shape

In [None]:
def training_loop(
    n_epochs=3000,
    optimizer=optimizer,
    params=params,
    train_t_u=t_u_train,
    val_t_u=t_u_val,
    train_t_c=t_c_train,
    val_t_c=t_c_val,
    loss_fn=torch.nn.MSELoss(),
    model=model,
):
    """Train a model that is called with the inputs only, as ``model(inputs)``.

    The default arguments are evaluated once, when this ``def`` is executed,
    so they capture whatever the notebook globals hold at that moment.

    Args:
        n_epochs: Number of full-batch optimisation steps.
        optimizer: Optimiser that updates the model's parameters.
        params: Unused; kept so the signature matches the earlier
            linear-model version of this function.
        train_t_u: Training inputs.
        val_t_u: Validation inputs.
        train_t_c: Training targets.
        val_t_c: Validation targets.
        loss_fn: Callable ``loss_fn(prediction, target)`` returning a scalar.
        model: Callable invoked as ``model(inputs)``. The default is the
            three-argument linear function, which cannot be called this way,
            so pass the network explicitly.

    Returns:
        None. The model is updated in place through ``optimizer``.
    """
    for epoch in range(1, n_epochs + 1):
        train_t_p = model(train_t_u)
        train_loss = loss_fn(train_t_p, train_t_c)

        # The validation loss is only reported, never back-propagated, so
        # there is no need to record an autograd graph for it.
        with torch.no_grad():
            val_t_p = model(val_t_u)
            val_loss = loss_fn(val_t_p, val_t_c)

        optimizer.zero_grad()
        train_loss.backward()
        optimizer.step()

        # Report the first three epochs and then every 500th one.
        if epoch <= 3 or epoch % 500 == 0:
            print("Epoch: ", epoch, " Training loss: ", train_loss.item(), " Val loss ", val_loss.item())
    

In [None]:
optimizer = torch.optim.SGD(seq_model.parameters(), lr=1e-3)

training_loop(
    n_epochs=5000,
    optimizer=optimizer,
    model = seq_model,
    loss_fn=torch.nn.MSELoss(),
)

%time

In [None]:
print('output', seq_model(t_u_val))

In [None]:
print('answer', t_c_val)

Lets compare the two models

In [None]:
from matplotlib import pyplot as plt

fig = plt.figure(dpi=600)

# The network expects 11 input features per sample, so it cannot be evaluated
# on a single-column range of x values. Instead, compare its predictions with
# the true quality scores on the validation set.
with torch.no_grad():
    val_pred = seq_model(t_u_val).reshape(-1).numpy()
val_true = t_c_val.reshape(-1).numpy()

plt.xlabel("Actual quality")
plt.ylabel("Predicted quality")
plt.plot(val_true, val_pred, 'kx')

# Diagonal reference line: a perfect model would put every point on it.
quality_range = [val_true.min(), val_true.max()]
plt.plot(quality_range, quality_range, 'c-')

It would be interesting to find a way to plot all 11 input features. Let's make some predictions.

In [None]:
len(t_u_val)

In [None]:
t_u_val[289]

Lets see what linear model gives

In [None]:
optimal_params

In [None]:
# Feed the sample's input features (not its target) to the linear model.
model(t_u_val[289], *optimal_params)

Now the neural net model

In [None]:
seq_model(t_u_val[289])

In [None]:
t_c_val[289]