# Objective
The objective of this notebook is to build a regression model in PyTorch and make sure it works by comparing it with scikit-learn's regression model. Once this works, we can use it as a baseline and improve on it by:
1. Improving the model architecture (for example, by adding more layers)
2. Changing the input features
3. Expressing our creativity by defining a novel model architecture

In [None]:
import os
import spacy
import torch
import numpy as np
import pandas as pd

from torch import nn
from pathlib import Path
from tqdm.notebook import tqdm
from sklearn.metrics import mean_squared_error
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge

In [None]:
# Number of samples per batch for every DataLoader built in this notebook.
BATCH_SIZE = 64
# Seed passed to train_test_split for the train/validation split.
RANDOM_STATE = 41
# Width of the model's input layer; presumably the dimensionality of the
# en_core_web_lg document vectors -- TODO confirm.
FEATURES_SIZE = 300
# spaCy pipeline whose document vectors are used as the input features.
nlp = spacy.load('en_core_web_lg')

In [None]:
# Directory holding the competition CSV files (path is relative to the notebook).
COMPETITION_DATA_PATH = Path('../input/commonlitreadabilityprize')
# CSV read below as the training data (later split into train/validation).
TRAIN_DATA_PATH = COMPETITION_DATA_PATH / 'train.csv'
# CSV read below as the test data used for the submission.
TEST_DATA_PATH = COMPETITION_DATA_PATH / 'test.csv'

In [None]:
# Load the raw competition tables.
train_data = pd.read_csv(TRAIN_DATA_PATH)
test_data = pd.read_csv(TEST_DATA_PATH)
# Hold out 10% of the training rows for validation; `train_data` is rebound
# to the remaining 90%.
train_data, valid_data = train_test_split(train_data, test_size=0.1, random_state=RANDOM_STATE)

# Report the size of each split.
print(f'Length of train data: {len(train_data)}')
print(f'Length of valid data: {len(valid_data)}')
print(f'Length of test data: {len(test_data)}')

# Spacy Feature extraction
All Credits to Sumit Kumar @anaverageengineer https://www.kaggle.com/anaverageengineer/comlrp-baseline-for-complete-beginners

In [None]:
def create_features(text_excerpts):
    """Turn text excerpts into a 2-D array of spaCy document vectors.

    Args:
        text_excerpts: Iterable of strings; each one is passed to ``nlp``.

    Returns:
        Array with one row per excerpt, built by stacking ``nlp(text).vector``.
    """
    # NOTE(review): disable_pipes() is called without pipe names; confirm
    # that this disables the components it was meant to.
    with nlp.disable_pipes():
        doc_vectors = []
        # tqdm only adds a progress bar around the iteration.
        for excerpt in tqdm(text_excerpts):
            doc_vectors.append(nlp(excerpt).vector)
        stacked = np.vstack(doc_vectors)
    return stacked

def create_targets(targets):
    """Return ``targets`` as a float32 column vector.

    Args:
        targets: NumPy array of target values.

    Returns:
        Array of shape ``(n, 1)`` with dtype ``np.float32``.
    """
    column = targets.reshape(-1, 1)
    return column.astype(np.float32)


# Precompute document-vector features and float32 column targets for the
# training and validation splits.
X_train = create_features(train_data['excerpt'].tolist())
y_train = create_targets(train_data['target'].to_numpy())
X_valid = create_features(valid_data['excerpt'].tolist())
y_valid = create_targets(valid_data['target'].to_numpy())

# Report the shapes of the four arrays.
print(f'Shapes: X_train {X_train.shape}, X_valid: {X_valid.shape}, y_train: {y_train.shape}, y_valid: {y_valid.shape}')

# Datasets and Dataloaders definition

In [None]:
class TrainingDataset(Dataset):
    """Map-style dataset pairing precomputed features with their targets."""

    def __init__(self, features, targets):
        """Store the indexable ``features`` and ``targets`` containers."""
        self.targets = targets
        self.features = features

    def __getitem__(self, idx):
        """Return the ``(feature, target)`` pair stored at ``idx``."""
        sample = self.features[idx]
        label = self.targets[idx]
        return sample, label

    def __len__(self):
        """Return the number of samples, measured on ``features``."""
        return len(self.features)
    

class PredictionDataset(Dataset):
    """Map-style dataset that turns raw text excerpts into features on access.

    Unlike ``TrainingDataset`` it holds no targets: each item is the spaCy
    document vector of one excerpt, computed when the item is requested.
    """

    def __init__(self, text_excerpts):
        """Store the indexable collection of excerpt strings."""
        self.text_excerpts = text_excerpts

    def __len__(self):
        """Return the number of excerpts."""
        return len(self.text_excerpts)

    def __getitem__(self, idx):
        """Return the document vector of the excerpt at ``idx``."""
        text = self.text_excerpts[idx]
        # Run the pipeline inside the same context manager that
        # create_features uses, so prediction-time features are produced the
        # same way as training-time features. Previously the `with` block
        # wrapped only the lookup above and `nlp(text)` ran outside it.
        # NOTE(review): disable_pipes() is called without pipe names here and
        # in create_features; confirm it disables what was intended.
        with nlp.disable_pipes():
            X = nlp(text).vector
        return X

# Model definition

In [None]:
class Model(nn.Module):
    """Feed-forward regressor: Linear -> Dropout -> ReLU -> Linear.

    The input and hidden widths are both ``FEATURES_SIZE``; the output is a
    single value per sample.
    """

    def __init__(self):
        super().__init__()
        # Layers are created in the same order they are applied.
        hidden = nn.Linear(FEATURES_SIZE, FEATURES_SIZE)
        dropout = nn.Dropout(p=0.2)
        activation = nn.ReLU()
        head = nn.Linear(FEATURES_SIZE, 1)
        self.linear = nn.Sequential(hidden, dropout, activation, head)

    def forward(self, x):
        """Apply the layer stack to ``x`` and return the result."""
        return self.linear(x)

# Create datasets and dataloaders

In [None]:
# Wrap the precomputed feature/target arrays for training and validation.
train_dataset = TrainingDataset(features=X_train, targets=y_train)
valid_dataset = TrainingDataset(features=X_valid, targets=y_valid)
# Pass a plain list (as the later prediction cells do) so PredictionDataset
# indexes excerpts by position instead of by pandas index label.
test_dataset = PredictionDataset(text_excerpts=test_data['excerpt'].tolist())

# Only the training batches are shuffled; validation and test keep row order
# so predictions line up with their source rows.
train_dataloader = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True)
valid_dataloader = DataLoader(dataset=valid_dataset, batch_size=BATCH_SIZE, shuffle=False)
test_dataloader = DataLoader(dataset=test_dataset, batch_size=BATCH_SIZE, shuffle=False)

# Create Model, Optimizer and Loss function

In [None]:
# Mean squared error averaged over the batch; read as a global by the
# training and validation functions below.
loss_fn = nn.MSELoss(reduction='mean')
model = Model()
# SGD with Nesterov momentum over all model parameters.
optimizer = torch.optim.SGD(params=model.parameters(), lr=0.01, momentum=0.9, nesterov=True)

# Training and Evaluation loop

In [None]:
def train_one_epoch(dataloader, model, optimizer, criterion=None):
    """Train ``model`` for one pass over ``dataloader`` and return the RMSE.

    Args:
        dataloader: Iterable yielding ``(X, y)`` batches of tensors.
        model: Module to optimize; it is switched to training mode.
        optimizer: Optimizer that updates ``model``'s parameters.
        criterion: Callable ``criterion(prediction, target)`` returning the
            squared error averaged over the batch elements (for example
            ``nn.MSELoss(reduction='mean')``). Defaults to the module-level
            ``loss_fn``.

    Returns:
        The root mean squared error over every target element seen during
        the epoch, each batch being scored before its optimizer step.

    Raises:
        ValueError: If ``dataloader`` yields no batches.
    """
    if criterion is None:
        criterion = loss_fn
    model.train()
    squared_error_sum = 0.0
    element_count = 0
    for X, y in dataloader:
        # Forward pass (prediction first, target second).
        y_pred = model(X)
        loss = criterion(y_pred, y)
        # The criterion averages over the batch; multiply back by the element
        # count so a short final batch is weighted by its real size.
        squared_error_sum += loss.item() * y.numel()
        element_count += y.numel()
        # Backprop
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    if element_count == 0:
        raise ValueError('dataloader yielded no samples to train on')
    return np.sqrt(squared_error_sum / element_count)

def validate_one_epoch(dataloader, model, criterion=None):
    """Evaluate ``model`` on ``dataloader`` and return the RMSE.

    Args:
        dataloader: Iterable yielding ``(X, y)`` batches of tensors.
        model: Module to evaluate; it is switched to evaluation mode.
        criterion: Callable ``criterion(prediction, target)`` returning the
            squared error averaged over the batch elements (for example
            ``nn.MSELoss(reduction='mean')``). Defaults to the module-level
            ``loss_fn``.

    Returns:
        The root mean squared error over every target element in
        ``dataloader``.

    Raises:
        ValueError: If ``dataloader`` yields no batches.
    """
    if criterion is None:
        criterion = loss_fn
    model.eval()
    squared_error_sum = 0.0
    element_count = 0
    # No gradients are needed for evaluation.
    with torch.no_grad():
        for X, y in dataloader:
            # Forward pass (prediction first, target second).
            y_pred = model(X)
            loss = criterion(y_pred, y)
            # The criterion averages over the batch; multiply back by the
            # element count so a short final batch is weighted correctly.
            squared_error_sum += loss.item() * y.numel()
            element_count += y.numel()
    if element_count == 0:
        raise ValueError('dataloader yielded no samples to validate on')
    return np.sqrt(squared_error_sum / element_count)

def predict(dataloader, model):
    """Run ``model`` over every batch of ``dataloader`` and stack the outputs.

    Args:
        dataloader: Iterable yielding input batches ``X`` (no targets).
        model: Module to run; it is switched to evaluation mode.

    Returns:
        NumPy array made by vertically stacking the per-batch model outputs,
        in the order the batches were yielded.
    """
    model.eval()
    # Inference only: gradients are not tracked.
    with torch.no_grad():
        batch_outputs = [model(X).cpu().detach().numpy() for X in dataloader]
    return np.vstack(batch_outputs)

In [None]:
class EarlyStopping:
    """Track the best validation loss and signal when training should stop."""

    def __init__(self, patient_epochs=2):
        """Start with no best epoch; ``patient_epochs`` is the allowed wait."""
        self.patient_epochs = patient_epochs
        self.best_epoch = -1
        self.best_valid_loss = np.inf

    def should_stop(self, current_epoch, current_valid_loss):
        """Record the epoch's loss and report whether patience has run out.

        Returns True once ``current_epoch`` is more than ``patient_epochs``
        past the epoch that produced the best validation loss so far.
        """
        improved = current_valid_loss < self.best_valid_loss
        if improved:
            self.best_epoch = current_epoch
            self.best_valid_loss = current_valid_loss
        last_allowed_epoch = self.best_epoch + self.patient_epochs
        if current_epoch > last_allowed_epoch:
            return True
        return False

early_stopping = EarlyStopping(patient_epochs=50)

In [None]:
# Train for at most 500 epochs, stopping early once the validation loss has
# not improved for more than the configured number of patient epochs.
# NOTE: no checkpoint is saved, so `model` ends with the weights of the last
# epoch run, not those of the best validation epoch.
for epoch_num in range(500):
    train_loss = train_one_epoch(dataloader=train_dataloader, model=model, optimizer=optimizer)
    valid_loss = validate_one_epoch(dataloader=valid_dataloader, model=model)
    if early_stopping.should_stop(current_epoch=epoch_num, current_valid_loss=valid_loss):
        print(f'Exiting At epoch: {epoch_num}, train_loss: {train_loss}, valid_loss: {valid_loss}')
        break
    # Log progress every 50 epochs.
    if epoch_num % 50 == 0:
        print(f'At epoch: {epoch_num}, train_loss: {train_loss}, valid_loss: {valid_loss}')

# Comparison with Sklearn to make sure the model is sensible

In [None]:
# Fit a Ridge regressor on the same precomputed features as a reference.
regressor = Ridge().fit(X_train, y_train)
# mean_squared_error takes (y_true, y_pred): ground truth first.
sklearn_error = mean_squared_error(y_valid, regressor.predict(X_valid))
print(f'Sklearn Error: {sklearn_error: .3f}')

In [None]:
# Score the trained network on the validation excerpts through the same
# text -> features -> model path that is used for the submission.
valid_test_dataset = PredictionDataset(text_excerpts=valid_data['excerpt'].tolist())
valid_test_dataloader = DataLoader(dataset=valid_test_dataset, batch_size=BATCH_SIZE, shuffle=False)
# mean_squared_error takes (y_true, y_pred): ground truth first.
pytorch_error = mean_squared_error(valid_data['target'].tolist(), predict(valid_test_dataloader, model))
print(f'Pytorch Error: {pytorch_error: .3f}')

# Make submission

In [None]:
# Rebuild the test dataset/dataloader from a plain list of excerpts;
# shuffle=False keeps predictions in the same order as the rows of test_data.
test_dataset = PredictionDataset(text_excerpts=test_data['excerpt'].tolist())
test_dataloader = DataLoader(dataset=test_dataset, batch_size=BATCH_SIZE, shuffle=False)
# predict returns the vertically stacked per-batch model outputs.
test_data['target'] = predict(test_dataloader, model)
# Write only the id and target columns, without the DataFrame index.
test_data[['id','target']].to_csv('submission.csv', index=False)