# PyTorch SETI Classifier

In [None]:
'''Load libraries'''
import pickle
import time
import random
import glob
import os
from copy import deepcopy
import numpy as np
import pandas as pd
from PIL import Image
from tqdm import tqdm
from pathlib import Path
from collections import defaultdict

import matplotlib.pyplot as plt
import plotly.express as px

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset,DataLoader
import torchvision
from torchvision import models, transforms, utils

import albumentations as A
from albumentations.pytorch import ToTensorV2
from albumentations import ImageOnlyTransform

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, precision_score, recall_score, roc_curve, roc_auc_score

In [None]:
class CFG:
    """Single namespace holding every hyperparameter and run-wide setting."""

    # --- reproducibility / data split ---
    SEED = 420
    VAL_SIZE = 0.1
    CLASSES = None  # Need to update manually
    OUTPUT_FEATURES = None  # Need to update manually

    # --- transforms: each split gets its own pipeline object ---
    TRAIN_TRANSFORMS = A.Compose([A.Resize(224, 224), ToTensorV2()])
    VAL_TRANSFORMS = A.Compose([A.Resize(224, 224), ToTensorV2()])
    TEST_TRANSFORMS = A.Compose([A.Resize(224, 224), ToTensorV2()])

    # --- model: the entries of this dict are unpacked onto a Net instance ---
    MODEL1 = dict(
        name='resnet18',
        transfer=True,
        architecture=models.resnet18(pretrained=True),  # ResNet18
        criterion=nn.CrossEntropyLoss(),
        optimizer=optim.Adam,  # optimizer class, instantiated by Net
        weight_decay=1e-6,
        lr=1e-4,
        history=None,
    )

    # --- training loop ---
    BATCH_SIZE = 192
    EPOCHS = 3

    # --- hardware: first CUDA device when available, CPU otherwise ---
    DEVICE = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
    print('You are using ->', DEVICE)

In [None]:
def seed_everything(seed):
    '''Make the results reproducible.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and every CUDA device)
    and switches cuDNN to its deterministic configuration.

    Args:
        seed(int): value used to seed every random number generator.
    '''
    random.seed(seed)
    # NOTE: hash randomization is fixed at interpreter start-up, so this only
    # influences processes launched after this point.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # seed all visible GPUs, not only the current one
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # benchmark mode auto-tunes convolution algorithms per run, which can pick
    # different kernels between runs; it must be off for reproducible results
    torch.backends.cudnn.benchmark = False

# Seed the random number generators with the configured seed.
seed_everything(CFG.SEED)

## Metadata

In [None]:
# Competition metadata: the training label table and the submission template.
train = pd.read_csv('../input/seti-breakthrough-listen/train_labels.csv')
sub = pd.read_csv('../input/seti-breakthrough-listen/sample_submission.csv')

In [None]:
'''Store data paths and their labels in pandas dataframe. Will be used to create pytorch datasets. '''

train_paths = glob.glob('../input/seti-breakthrough-listen/train/*/*' )
meta = pd.DataFrame(sorted(train_paths),columns=['path'])

# Assign id and target from the train df.
# The id is recovered from each file name and the labels are joined on it, so
# the result no longer depends on the sorted paths and the csv rows happening
# to be in the same order.
meta['id'] = meta.path.map(lambda p: Path(p).stem)
meta = meta.merge(train[['id', 'target']], on='id', how='left')

# a file without a label would otherwise flow into training as NaN
if meta.target.isna().any():
    raise ValueError(f'{int(meta.target.isna().sum())} training files have no label in train_labels.csv')

# get class mappings
classes = dict(enumerate(meta.target.astype('category').cat.categories))
CFG.CLASSES = classes
CFG.OUTPUT_FEATURES = len(CFG.CLASSES)

meta.head()

In [None]:
# Collect the test files; the sorted paths become the `path` column and a
# constant placeholder target is attached.
test_paths = glob.glob('../input/seti-breakthrough-listen/test/*/*' )
test = pd.DataFrame({'path': sorted(test_paths)}).assign(target=0)  # dummy targets

test.head()

In [None]:
'''Split data into train, validation and test sets'''

X, y = list(meta.path), list(meta.target)

# Hold out CFG.VAL_SIZE of the labelled data for validation, keeping the
# class proportions equal in both parts (stratified split).
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=CFG.VAL_SIZE,
    random_state=CFG.SEED,
    stratify=y,
)

# The test split is used exactly as collected.
X_test = test.path
y_test = test.target


print(f'Train length -> {len(X_train)}')
print(f'Val length -> {len(X_val)}')
print(f'Test length -> {len(X_test)}')

In [None]:
'''Custom pytorch dataset implementation.'''
class SETIDataset(Dataset):
    '''Dataset that reads one ``.npy`` file per sample.

    Args:
        X: indexable collection of file paths.
        y: indexable collection of labels, same length as ``X``.
        transform: optional callable invoked as ``transform(image=img)`` whose
            result is indexed with ``['image']``.

    Raises:
        ValueError: if ``X`` and ``y`` have different lengths.
    '''
    def __init__(self,X,y, transform=None):
        # explicit raise instead of `assert`, which disappears under `python -O`
        if len(X) != len(y):
            raise ValueError(f'X and y have different lengths -> {len(X)} != {len(y)} ')
        self.X = X
        self.y = y
        self.transform = transform

    def __len__(self):
        return len(self.X)

    def __getitem__(self,idx):
        '''Return ``(image, label)``; the label is a ``torch.long`` tensor.'''
        img_path = self.X[idx]
        img = np.load(img_path)
        img = img.astype(np.float32)
        # stack the entries of the loaded array vertically into a single 2-D
        # image, then swap its two axes
        img = np.vstack(img).transpose((1, 0))
        if self.transform is not None:
            img = self.transform(image=img)['image']
        label = torch.tensor(self.y[idx],dtype=torch.long)
        return (img, label)

    def show_img(self,idx):
        '''Plot image'''
        img,label = self[idx]
        if torch.is_tensor(img):
            # tensor samples are moved from (C, H, W) to (H, W, C) for plotting
            img = img.numpy().transpose((1, 2, 0))
        # non-tensor samples (e.g. transform=None) are plotted as they are
        plt.figure(figsize=(16, 8))
        plt.axis('off')
        plt.imshow(img)
        plt.title(CFG.CLASSES[int(label)]) #using CFG.CLASSES dict
        plt.pause(0.001)

In [None]:
'''Instantiate pytorch train, validation and test sets'''
TRAIN = SETIDataset(X_train,y_train, CFG.TRAIN_TRANSFORMS)
VAL = SETIDataset(X_val,y_val, CFG.VAL_TRANSFORMS)
TEST = SETIDataset(X_test,y_test, CFG.TEST_TRANSFORMS)

'''Instantiate Dataloaders'''
# Training batches are reshuffled every epoch so the optimizer does not see
# the samples in the same fixed order each time.
TRAIN_LOADER = DataLoader(TRAIN, batch_size=CFG.BATCH_SIZE, shuffle=True)
# Validation and test keep their order; the test predictions are matched back
# to the test frame by position.
VAL_LOADER = DataLoader(VAL, batch_size=CFG.BATCH_SIZE)
TEST_LOADER = DataLoader(TEST, batch_size=CFG.BATCH_SIZE)

In [None]:
class Net(nn.Module):
    '''
    ========================
          NEURAL NET
    ========================

    Bundles a backbone network with its loss, optimizer, training history and
    checkpoint helpers.

    Args:
        model_dict(dict): configuration dict containing the model architecture.
            Every entry is copied onto the instance as an attribute; this class
            reads 'name', 'transfer', 'architecture', 'criterion', 'optimizer'
            (an optimizer class), 'weight_decay', 'lr' and 'history'.
        output_features(int): length of output tensor; for classification equals to number of classes
    '''
    def __init__(self, model_dict, output_features):
        super().__init__()
        self.__dict__.update(model_dict) #unpack model dict from CFG into this class

        if self.transfer:
            # NOTE: the architecture object from model_dict is modified in place
            model = self.architecture
            #modify the last layer
            num_ftrs = model.fc.in_features
            model.fc = nn.Linear(num_ftrs, output_features)
            #modify the input size to fit the grayscale images!
            model.conv1 = nn.Conv2d(1, 64, kernel_size=7, stride=2, padding=3,
                               bias=False)
            self.model = model
        else:
            # use the architecture untouched; without this assignment
            # `self.model` did not exist and the optimizer set-up below failed
            self.model = self.architecture
        self.output_features = output_features
        #optimizer (the configured class must accept the `amsgrad` keyword)
        self.optimizer = self.optimizer(self.model.parameters(),lr = self.lr, weight_decay = self.weight_decay,amsgrad=False)
        #path where to save model
        self.save_path = 'models'

    def forward(self, x):
        '''Run the wrapped model on ``x``.'''
        return self.model(x)

    def fit(self,
            train_loader,
            val_loader,
            epochs = 5,
            batch_size = 32,
            device = 'cpu'):
        '''
        =============================
            OPTIMIZATION LOOP
        =============================

        Trains for ``epochs`` epochs, validates after each one, stores the
        weights whenever validation accuracy matches its best value so far and
        finally writes the training history to disk.

        Args:
            train_loader(torch dataloader)
            val_loader(torch dataloader)
            epochs(int)
            batch_size(int): not used by the loop (batching is done by the
                loaders); kept so existing calls keep working
            device(str)


        Output style inspired by skorch fit() method
        https://skorch.readthedocs.io/en/stable/net.html?highlight=fit#skorch.net.NeuralNet.fit

        '''
        #constant here (no scheduler is used); only recorded in the log
        lr = self.lr

        #get model training history
        history = self.history
        if history is None:
            history = defaultdict(list)
        #stuff for printing epoch metrics as a beautiful table
        headers = ['epoch','train_loss','val_loss','val_acc','cp','lr','dur']
        template = '{:<10} {:<10} {:<10} {:<10} {:<10} {:<10} {:<10}'
        print(template.format(*headers))
        print(template.replace(':', ':-').format('','','','','','',''))
        cyan = "\033[96m{:<10}\033[00m" #cyan
        purple = "\033[95m{:<10}\033[00m" #purple
        green = "\033[92m{:<10}\033[00m" #green
        white = "\033[0m{:<10}\033[0m" #white
        #send model to device
        self.model.to(device)
        #training loop
        for epoch in range(epochs):
            start_time = time.time()
            train_loss, train_seen = 0.0, 0
            val_loss, val_acc, val_seen = 0.0, 0.0, 0
            #train mode is set every epoch because validation switches to eval
            self.model.train()
            #optimization  loop
            time.sleep(.2)
            for X,y in tqdm(train_loader, desc ="Train batches"):
                #Send training data to device
                X,y = X.to(device), y.to(device)
                #Forward propagation
                pred = self.model(X)
                loss = self.criterion(pred,y)
                #Backpropagation
                self.optimizer.zero_grad()
                loss.backward()
                self.optimizer.step()
                #accumulate the loss weighted by batch size so the epoch value
                #is a per-sample mean (assumes the criterion returns a batch mean)
                train_loss += loss.item() * len(y)
                train_seen += len(y)
            train_loss /= max(train_seen, 1)

            #validation loop: eval mode, so e.g. batch-norm statistics are not
            #updated with validation data
            self.model.eval()
            with torch.no_grad():
                for X, y in tqdm(val_loader, desc='Validation Batches'):
                    X,y = X.to(device),y.to(device)
                    pred = self.model(X)
                    val_loss += self.criterion(pred,y).item() * len(y)
                    val_acc += (pred.argmax(1) == y).type(torch.float).sum().item()
                    val_seen += len(y)
            #calculate validation loss and accuracy after the epoch
            val_loss /= max(val_seen, 1)
            val_acc /= max(val_seen, 1)
            #append epoch results
            history['epoch'].append(epoch+1)
            history['train_loss'].append(train_loss)
            history['val_loss'].append(val_loss)
            history['val_acc'].append(val_acc)

            #colorize epoch's output if it improves
            colortemp = template.split(' ')
            # colorize train loss if it decreases
            if history['train_loss'][-1] == min(history['train_loss']):
                colortemp[1] = cyan
            else:
                colortemp[1] = white
            #colorize validation loss if it decreases
            if history['val_loss'][-1] == min(history['val_loss']):
                colortemp[2] = purple
            else:
                colortemp[2] = white
            # colorize validation accuracy & save best weights if it increases
            if history['val_acc'][-1] == max(history['val_acc']):
                #colorize
                colortemp[3] = green
                #checkpoint
                cp = '+'
                os.makedirs(self.save_path, exist_ok=True)
                torch.save(self.model.state_dict(), Path(self.save_path,f'best_{self.name}.pth'))
            else:
                colortemp[3] = white
                cp = '-'
            colortemp = ' '.join(colortemp)

            #calculate epoch duration (in seconds)
            end_time = time.time()
            dur = end_time - start_time
            #append the rest of epoch results
            history['cp'].append(cp)
            history['lr'].append(lr)
            history['dur'].append(dur)
            #display the epoch results
            print(colortemp.format(*f'{epoch+1}/{epochs} {train_loss:.4f} {val_loss:.4f} {val_acc:.2f} {cp} {lr} {dur:.2f}'.split(' ')))
        #renumber the epochs over the entire (possibly resumed) training history
        history['epoch'] = list(range(1, len(history['epoch']) + 1))
        #update model's training history
        self.history = history
        #save training history as csv
        self.save_history()

    def predict(self,dataloader,device ='cpu'):
        '''
        ===============
           Predict
        ===============

        Returns a list with one tensor per batch holding the arg-max class
        index of every sample; the tensors stay on ``device``.
        '''
        #set model to evaluation mode
        self.model.eval()
        #model to device, default cpu
        self.model.to(device)

        preds = []
        with torch.no_grad():
            #the targets delivered by the loader are not needed for prediction
            for X, _ in tqdm(dataloader):
                pred = self.model(X.to(device))
                preds.append(pred.argmax(1))
        return preds

    def eval_model(self,dataloader,avg=None,device ='cpu'):
        '''
        ==================================
           ACCURACY PRECISION RECALL F1
        ==================================

        Prints accuracy, precision, recall, F1 and ROC AUC computed once over
        the whole dataloader (not averaged per batch).

        Args:
            dataloader(torch dataloader)
            avg: forwarded as ``average`` to the scikit-learn precision,
                recall and F1 functions
            device(str)
        '''
        labels = list(range(self.output_features))

        #set model to evaluation mode
        self.model.eval()
        #model to device, default cpu
        self.model.to(device)

        y_true, y_pred, y_prob = [], [], []
        with torch.no_grad():
            for X, y in tqdm(dataloader, desc = 'Evaluating the model'):
                logits = self.model(X.to(device))
                y_prob.append(torch.softmax(logits, dim=1).cpu())
                y_pred.append(logits.argmax(1).cpu())
                y_true.append(y.cpu())

        if not y_true:
            print('No data to evaluate!')
            return

        y_true = torch.cat(y_true).numpy()
        y_pred = torch.cat(y_pred).numpy()
        y_prob = torch.cat(y_prob).numpy()

        acc = float((y_pred == y_true).mean())
        precision = precision_score(y_true, y_pred, labels = labels, zero_division = 1, average = avg)
        recall = recall_score(y_true, y_pred, labels = labels, zero_division = 1,  average = avg)
        f1 = f1_score(y_true, y_pred, labels = labels, zero_division = 1,  average = avg)
        #ROC AUC is a ranking metric, so it is fed class probabilities rather
        #than hard labels; it is undefined when only one class is present
        try:
            if self.output_features == 2:
                roc = roc_auc_score(y_true, y_prob[:, 1])
            else:
                roc = roc_auc_score(y_true, y_prob, multi_class = 'ovr', labels = labels)
        except ValueError:
            roc = float('nan')

        print(f" Accuracy: {(100*acc):>0.1f}%")
        print(f"Precision: {(100*np.mean(precision)):>0.1f}%")
        print(f"   Recall: {(100*np.mean(recall)):>0.1f}%")
        print(f" F1 Score: {(100*np.mean(f1)):>0.1f}%")
        print(f"      ROC: {(100*roc):>0.1f}%")


    def plot_loss_history(self):
        '''
        Plot loss history
        '''
        assert self.history is not None, 'No history to plot -> the model has not been trained yet!'

        df = pd.DataFrame(self.history)
        fig = px.line(x = df.epoch,
                    y = [df.train_loss, df.val_loss],
                    title = 'Loss History',
                    labels={'x':'epoch','value': 'loss', 'variable': 'loss'})
        fig.data[0].name = 'train'
        fig.data[1].name = 'val'
        fig.show()

    def save_history(self):
        '''Save model's training history under ``self.save_path`` (csv and pickle)'''
        assert self.history is not None, 'No history to save -> the model has not been trained yet!'
        #the directory may not exist yet (e.g. no checkpoint was written)
        os.makedirs(self.save_path, exist_ok=True)
        #save as csv
        pd.DataFrame(self.history).to_csv(Path(self.save_path, f'{self.name}_history.csv'))
        #save as pickle file
        with open(Path(self.save_path, f'{self.name}_history.pkl'), 'wb') as f:
            pickle.dump(self.history, f, protocol=pickle.HIGHEST_PROTOCOL)

    def save_model(self):
        '''Save the current weights as ``latest_<name>.pth`` under ``self.save_path``'''
        os.makedirs(self.save_path, exist_ok=True)
        torch.save(self.model.state_dict(), Path(self.save_path,f'latest_{self.name}.pth'))

    def load_model(self,path = 'models', device = 'cpu'):
        '''Load model

        Restores ``best_<name>.pth`` and ``<name>_history.pkl`` from ``path``.
        Missing or unreadable files are reported with a message instead of an
        exception, so calling this before the first training run is harmless.
        '''
        try:
            #load model weights
            p = Path(path,f'best_{self.name}.pth')
            self.model.load_state_dict(torch.load(p, map_location=torch.device(device)))
            #load model training history
            #NOTE(security): pickle can execute code on load; only load
            #history files this notebook wrote itself
            with open(Path(path,f'{self.name}_history.pkl'), 'rb') as h:
                self.history = pickle.load(h)
        except (OSError, RuntimeError, pickle.UnpicklingError, EOFError) as err:
            print(f'No model to load! ({type(err).__name__}: {err})')

## Train

In [None]:
# Build the model wrapper from the config; it is moved to a device later,
# inside fit/predict/eval_model.
Resnet18 = Net(CFG.MODEL1,CFG.OUTPUT_FEATURES)
# Restore weights and history from a previous run if present; otherwise this
# only prints a message.
Resnet18.load_model()

In [None]:
# Train with the configured epoch count, batch size and device.
Resnet18.fit(
    train_loader=TRAIN_LOADER,
    val_loader=VAL_LOADER,
    epochs=CFG.EPOCHS,
    batch_size=CFG.BATCH_SIZE,
    device=CFG.DEVICE,
)

In [None]:
# Plot train and validation loss per epoch from the stored history.
Resnet18.plot_loss_history()

## Inference

In [None]:
# Evaluate the best checkpoint on the VAL set. A deep copy is used so that
# loading the checkpointed weights does not overwrite the in-memory model.
print('Best Model:')
print('-'*20)
time.sleep(0.1)
best_model = deepcopy(Resnet18)
best_model.load_model(device = CFG.DEVICE)
best_model.eval_model(VAL_LOADER, avg = 'binary',device = CFG.DEVICE)
# Evaluate the model as it stands after training (its latest weights) on the
# same VAL set for comparison.
print()
print('Current Model:')
time.sleep(0.1)
print('-'*20)
Resnet18.eval_model(VAL_LOADER,avg = 'binary', device = CFG.DEVICE)

In [None]:
'''
    Get predictions on the TEST set
'''

TEST_PREDS = Resnet18.predict(TEST_LOADER, CFG.DEVICE)
# Flatten the per-batch tensors in one step instead of converting every
# element with int(), which is very slow for tensors living on the GPU.
preds = torch.cat(TEST_PREDS).cpu().tolist() if TEST_PREDS else []
if len(preds) != len(test):
    raise ValueError(f'Got {len(preds)} predictions for {len(test)} test files')

# Attach predictions by sample id (taken from each test file name) so the
# result does not depend on the submission template and the sorted test paths
# being in the same row order.
pred_by_id = dict(zip((Path(p).stem for p in test.path), preds))
sub['target'] = sub['id'].map(pred_by_id)
if sub['target'].isna().any():
    raise ValueError('Some ids in the submission template have no matching test file')
sub['target'] = sub['target'].astype(int)
sub.to_csv('submission.csv', index=False)

In [None]:
# Preview the first rows of the submission frame.
sub.head()

In [None]:
# Show the raw per-batch prediction tensors returned by predict().
TEST_PREDS

# Good Luck!