In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(folder, name) for name in names):
        print(full_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

**Read the csv file**

In [None]:
# Load the red-wine quality CSV from the Kaggle input directory into a DataFrame,
# then display the first few rows as a quick sanity check.
df = pd.read_csv("/kaggle/input/red-wine-quality-cortez-et-al-2009/winequality-red.csv")
df.head()

**Find number of columns and rows**

In [None]:
# (number of rows, number of columns) of the loaded dataframe.
df.shape

It means there are 1599 rows and 12 columns

**Now plot each column to see its distribution and properties**

In [None]:
import matplotlib.pyplot as plt
def plot_figure(index, column, nrows=6, ncols=2):
    """Draw one column of `df` as a line plot in one cell of a subplot grid.

    Args:
        index: 1-based position of the subplot inside the grid.
        column: name of the `df` column to plot; also used as the subplot title.
        nrows: number of rows in the subplot grid (default 6).
        ncols: number of columns in the subplot grid (default 2).
    """
    plt.subplot(nrows, ncols, index)
    plt.title(column)
    # The values are drawn against the dataframe index, i.e. in row order.
    plt.plot(df[column])
    
# One figure that holds a subplot for every dataframe column.
plt.figure(figsize=(10,10))

# Subplot positions are 1-based, so enumerate from 1. The former
# `index+1 <= len(df.columns)` guard was always true and has been removed.
for index, column in enumerate(df.columns, start=1):
    plot_figure(index, column)

plt.tight_layout()

**Check datatypes of dataframe's columns**

In [None]:
# Data type of every column.
df.dtypes

**Check whether any missing values exist**

In [None]:
# Per column: True if the column contains at least one null value.
df.isnull().any()

**Every column returned `False`, which means there are no null values in the dataset.**

**check correlation between features**

In [None]:
import seaborn as sns
# Pairwise correlation matrix of all columns.
corrmat = df.corr()
# Number of variables shown in the heatmap: all of them, derived from the
# data instead of a hard-coded count.
k = len(corrmat.columns)
# Order the variables by their correlation with 'quality', strongest first.
cols = corrmat.nlargest(k, 'quality')['quality'].index
# Reuse the correlations already computed above, reordered to match `cols`.
cm = corrmat.loc[cols, cols].to_numpy()
sns.set(font_scale=1)
plt.figure(figsize=(8,8))
hm = sns.heatmap(cm, cbar=True, annot=True, square=True, fmt='.2f', annot_kws={'size': 10}, yticklabels=cols.values, xticklabels=cols.values)
plt.show()

I consider features highly correlated if their correlation value is > 0.7.

In [None]:
# Every column except the last one is used as a model input feature.
input_cols = df.columns[:-1].tolist()
input_cols

In [None]:
# Name of the column the model should predict, kept as a list.
output_cols = ['quality']

**Convert dataframe to numpy arrays**

In [None]:
def dataframe_to_arrays(df, input_columns=None, output_columns=None):
    """Extract the input and target columns of a dataframe as numpy arrays.

    Args:
        df: source dataframe; it is not modified.
        input_columns: names of the feature columns. When None, the
            notebook-level `input_cols` list is used.
        output_columns: names of the target columns. When None, the
            notebook-level `output_cols` list is used.

    Returns:
        A tuple `(inputs_array, targets_array)` of 2-D numpy arrays with one
        row per dataframe row. Both arrays are copies, independent of `df`.
    """
    # Fall back to the notebook globals so existing calls keep working.
    if input_columns is None:
        input_columns = input_cols
    if output_columns is None:
        output_columns = output_cols
    # Copy only the selected data instead of deep-copying the whole dataframe.
    inputs_array = df[list(input_columns)].to_numpy(copy=True)
    targets_array = df[list(output_columns)].to_numpy(copy=True)
    return inputs_array, targets_array

In [None]:
# Convert the dataframe into a feature array and a target array, then display both.
inputs_array, targets_array = dataframe_to_arrays(df)
inputs_array, targets_array

In [None]:
# Shapes of the feature array and the target array.
inputs_array.shape,targets_array.shape

**Convert numpy array to torch tensor**

In [None]:
import torch
# Build float32 tensors from the numpy arrays. `torch.tensor` with an explicit
# dtype replaces the legacy `torch.Tensor(...)` constructor and makes the
# float32 conversion visible; the data is copied, as before.
inputs = torch.tensor(inputs_array, dtype=torch.float32)
targets = torch.tensor(targets_array, dtype=torch.float32)

**Next, we need to create PyTorch datasets & data loaders for training & validation. We'll start by creating a TensorDataset.**

In [None]:
from torch.utils.data import DataLoader, TensorDataset, random_split
# Pair every input row with its target so they can be indexed and batched together.
dataset = TensorDataset(inputs, targets)

**Split the dataset into training and validation sets**

In [None]:
# (rows, columns) of the full dataframe, shown again before choosing the split sizes.
df.shape

In [None]:
num_rows = len(df)
# Fraction of rows held out for validation. It was 0.01, which contradicted
# the intended 0.1-0.2 range and left only 15 validation samples.
val_percent = 0.1 # between 0.1 and 0.2
val_size = int(num_rows * val_percent)
train_size = num_rows - val_size

# Seed the split so the same rows land in each subset on every run.
split_generator = torch.Generator().manual_seed(42)
# Note: despite their names these are torch Subset objects, not dataframes.
train_df, val_df = random_split(dataset, [train_size, val_size], generator=split_generator)

**Pick a batch size for data loader**

In [None]:
# Number of samples per mini-batch; used by both data loaders.
batch_size = 50

In [None]:
# The training loader reshuffles the samples every epoch; the validation
# loader keeps the default (unshuffled) order.
train_loader = DataLoader(train_df, batch_size, shuffle=True)
val_loader = DataLoader(val_df, batch_size)

**Create Model skeleton**

In [None]:
# Feature column names that feed the model.
input_cols

In [None]:
# Target column name(s) the model predicts.
output_cols

In [None]:
# Layer dimensions derived from the column lists: one input per feature
# column and one output per target column.
input_size = len(input_cols)
output_size = len(output_cols)

In [None]:
import torch.nn as nn
class WineModel(nn.Module):
    """Linear regression model: a single `nn.Linear` layer trained with L1 loss.

    Args:
        in_features: number of input features. When None, the notebook-level
            `input_size` is used.
        out_features: number of predicted values. When None, the
            notebook-level `output_size` is used.
    """

    def __init__(self, in_features=None, out_features=None):
        super().__init__()
        # Fall back to the notebook globals so `WineModel()` keeps working.
        if in_features is None:
            in_features = input_size
        if out_features is None:
            out_features = output_size
        # The layer is initialized with random weights.
        self.linear = nn.Linear(in_features, out_features)

    def forward(self, xb):
        """Return the linear layer's predictions for a batch of inputs."""
        out = self.linear(xb)
        return out

    def training_step(self, batch):
        """Return the mean absolute error (L1 loss) for one training batch.

        `batch` is an `(inputs, targets)` pair. The returned loss keeps its
        autograd graph so the caller can call `backward()` on it.
        """
        inputs, targets = batch
        # Generate predictions
        out = self(inputs)
        # Calculate loss. `nn.functional` is used directly so the class does
        # not depend on an `F` alias that is imported in a later cell.
        loss = nn.functional.l1_loss(out, targets)
        return loss

    def validation_step(self, batch):
        """Return `{'val_loss': loss}` for one validation batch, detached from autograd."""
        inputs, targets = batch
        # Generate predictions
        out = self(inputs)
        # Calculate loss
        loss = nn.functional.l1_loss(out, targets)
        return {'val_loss': loss.detach()}

    def validation_epoch_end(self, outputs):
        """Average the per-batch validation losses into one float for the epoch.

        `outputs` is the list of dicts produced by `validation_step`.
        """
        batch_losses = [x['val_loss'] for x in outputs]
        # Combine the losses of all batches as an unweighted mean.
        epoch_loss = torch.stack(batch_losses).mean()
        return {'val_loss': epoch_loss.item()}

    def epoch_end(self, epoch, result, num_epochs):
        """Print the validation loss on every 20th epoch and on the last epoch."""
        if (epoch+1) % 20 == 0 or epoch == num_epochs-1:
            print("Epoch [{}], val_loss: {:.4f}".format(epoch+1, result['val_loss']))

In [None]:
# Instantiate the linear model; its parameters start from random values.
model =  WineModel()

In [None]:
# Inspect the model's current (untrained) parameters.
list(model.parameters())

In [None]:
def evaluate(model, val_loader):
    """Compute the model's validation result over every batch of a loader.

    Args:
        model: object providing `validation_step(batch)` and
            `validation_epoch_end(outputs)`.
        val_loader: iterable that yields validation batches.

    Returns:
        Whatever `model.validation_epoch_end` returns for the list of
        per-batch results.
    """
    # Validation never backpropagates, so turn autograd off to avoid
    # building a graph for every batch.
    with torch.no_grad():
        outputs = [model.validation_step(batch) for batch in val_loader]
    return model.validation_epoch_end(outputs)

def fit(epochs, lr, model, train_loader, val_loader, opt_func=torch.optim.SGD):
    """Train `model` for a number of epochs and record the validation result of each.

    Args:
        epochs: number of passes over `train_loader`.
        lr: learning rate handed to the optimizer.
        model: object providing `parameters()`, `training_step(batch)` and
            `epoch_end(epoch, result, num_epochs)`, plus what `evaluate` needs.
        train_loader: iterable that yields training batches.
        val_loader: iterable that yields validation batches.
        opt_func: optimizer factory called as `opt_func(model.parameters(), lr)`.

    Returns:
        A list with one validation result per epoch, in epoch order.
    """
    optimizer = opt_func(model.parameters(), lr)
    history = []
    for epoch in range(epochs):
        # Training phase: one optimizer update per batch, then clear the gradients.
        for batch in train_loader:
            model.training_step(batch).backward()
            optimizer.step()
            optimizer.zero_grad()
        # Validation phase: evaluate once per epoch, report it and record it.
        epoch_result = evaluate(model, val_loader)
        model.epoch_end(epoch, epoch_result, epochs)
        history += [epoch_result]
    return history

**Use the evaluate function to calculate the loss on the validation set before training.**

In [None]:
import torchvision
import torch.nn as nn
import pandas as pd
import matplotlib.pyplot as plt
import torch.nn.functional as F
from torchvision.datasets.utils import download_url


In [None]:
# Baseline: validation loss of the model before any training.
result = evaluate(model,val_loader) # Use the the evaluate function
print(result)

**Train the model 4-5 times with different learning rates & for different number of epochs to see what works**

In [None]:
# Run 1: 1000 epochs at learning rate 1e-2 with fit's default optimizer (SGD).
# history1 holds the validation result of every epoch.
epochs = 1000
lr = 1e-2
history1 = fit(epochs, lr, model, train_loader, val_loader)

In [None]:
# Run 2: continue training the same model for 1000 more epochs at learning rate 1e-3.
epochs = 1000
lr = 1e-3
history2 = fit(epochs, lr, model, train_loader, val_loader)

In [None]:
# Run 3: continue training the same model for 1000 more epochs at learning rate 1e-4.
epochs = 1000
lr = 1e-4
history3 = fit(epochs, lr, model, train_loader, val_loader)

In [None]:
# Run 4: continue training the same model for 1000 more epochs at learning rate 1e-5.
# The result is stored in its own variable; it used to overwrite history3.
epochs = 1000
lr = 1e-5
history4 = fit(epochs, lr, model, train_loader, val_loader)

In [None]:
# Run 5: continue training the same model for 1000 more epochs at learning rate 1e-6.
# The result is stored in its own variable; it used to overwrite history3.
epochs = 1000
lr = 1e-6
history5 = fit(epochs, lr, model, train_loader, val_loader)

**Now calculate the final validation loss**

In [None]:
# Validation loss of the model after all training runs.
val_loss = evaluate(model,val_loader)
val_loss

**Make predictions using the trained model**

In [None]:
def predict_single(input, target, model):
    """Run the model on a single sample and print input, target and prediction.

    Args:
        input: tensor holding one sample; a leading batch dimension is added
            before it is passed to the model.
        target: the expected value for that sample (only printed).
        model: callable that maps a batch of inputs to a batch of predictions.
    """
    # The model works on batches, so wrap the sample in a batch of one and
    # take the first (only) row of the output, detached from autograd.
    batch = input.unsqueeze(0)
    prediction = model(batch)[0].detach()
    for label, value in (("Input:", input), ("Target:", target), ("Prediction:", prediction)):
        print(label, value)

**Note: I am making predictions on the validation set, but ideally you should set aside a separate part of the dataset for testing.**

Here it has been done for learning purpose only.

In [None]:
# Compare target and prediction for the validation sample at index 0.
# NOTE: the name `input` shadows Python's built-in input() from here on.
input, target = val_df[0]
predict_single(input, target, model)

In [None]:
# Compare target and prediction for the validation sample at index 10.
input, target = val_df[10]
predict_single(input, target, model)

In [None]:
# Compare target and prediction for the validation sample at index 5.
input, target = val_df[5]
predict_single(input, target, model)

Hurrah! It's performing well.<br>
And this is how we trained our first pytorch model with linear regression on **Wine quality dataset.**