# Notebook Setup

We'll import the needed libraries here.

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# Dataset

According to the Kaggle Dataset Page:

> This radar data was collected by a system in Goose Bay, Labrador. This system consists of a phased array of 16 high-frequency antennas with a total transmitted power on the order of 6.4 kilowatts. See the paper for more details. The targets were free electrons in the ionosphere. "Good" radar returns are those showing evidence of some type of structure in the ionosphere. "Bad" returns are those that do not; their signals pass through the ionosphere.
> 
> Received signals were processed using an autocorrelation function whose arguments are the time of a pulse and the pulse number. There were 17 pulse numbers for the Goose Bay system. Instances in this database are described by 2 attributes per pulse number, corresponding to the complex values returned by the function resulting from the complex electromagnetic signal.

In [None]:
# Read the ionosphere CSV from the Kaggle input directory into a DataFrame.
df = pd.read_csv(filepath_or_buffer='/kaggle/input/ionosphere/ionosphere_data.csv')
# Trailing expression so the notebook renders a preview of the table.
df

# Preprocessing

## Class Distribution

The dataset has a class imbalance problem, but it's not that severe.

In [None]:
# Bar chart of how many rows fall into each class of the target column.
sns.countplot(data=df, x='column_ai')

## Removing the Useless Features

The second feature, `column_b`, has no variance and therefore isn't useful to the model.

In [None]:
# Remove the constant column in place; it carries no information for the model.
df.drop(labels=['column_b'], axis='columns', inplace=True)

## Encoding the Output Classes

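The class labels are stored as the strings `g` and `b`. We encode them as `1` and `0` so they can be turned into the numeric targets that PyTorch's loss functions expect.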

In [None]:
# Give the target column a descriptive name.
df.rename(columns={'column_ai': 'label'}, inplace=True)

# Integer code for each class string ('g' presumably marks "good" returns and
# 'b' "bad" ones -- confirm against the dataset description).
encoding = {'g': 1, 'b': 0}

# Assign the mapped column back to the frame. Calling
# `df.label.replace(..., inplace=True)` mutates a temporary Series, which is
# deprecated in pandas 2.x and has no effect on `df` under Copy-on-Write.
# `map` turns any label missing from `encoding` into NaN, so the integer cast
# fails loudly instead of letting an unexpected class string slip through.
df['label'] = df['label'].map(encoding).astype('int64')

# Trailing expression so the notebook renders the updated table.
df

## Converting the Remaining Boolean Feature Into Numeric

In [None]:
# Cast the remaining boolean feature to float64 so every feature is numeric.
df['column_a'] = df['column_a'].astype('float64')

## Preparing the Dataset for Training and Validation

In [None]:
# Every column except the last one is a feature; the last column is the label.
X, y = df.to_numpy()[:, :-1], df.to_numpy()[:, -1]

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing, with a fixed seed for reproducibility.
# `stratify=y` keeps the class ratio the same in both parts; on a small,
# imbalanced dataset an unstratified split can skew the test-set distribution.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=7,
    stratify=y,
)

In [None]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
# Column 0 is the boolean feature and is left untouched; only the remaining
# columns are standardized.
# The statistics are learned from the training rows only, then the same
# transformation is applied to both splits.
scaler.fit(x_train[:, 1:])
x_train[:, 1:] = scaler.transform(x_train[:, 1:])
x_test[:, 1:] = scaler.transform(x_test[:, 1:])

# Toying with the Dataset

Let's visualize the data and see if we can find anything interesting! We'll first use **t-SNE** to reduce the dimensions of the data while preserving the relative distance of the vectors.

In [None]:
from sklearn.manifold import TSNE

# Project the training features onto two dimensions with t-SNE.
x_embedded = TSNE(n_components=2).fit_transform(x_train)
# Scatter the two embedded coordinates; truthy labels are drawn green, the rest red.
plt.scatter(
    x_embedded[:, 0],
    x_embedded[:, 1],
    color=['green' if label else 'red' for label in y_train],
)
plt.show()

The dataset doesn't seem to be that complicated. Let's now see what it would look like when processed by the PCA algorithm:

In [None]:
from sklearn.decomposition import PCA

# Project the training features onto their first two principal components.
x_embedded = PCA(n_components=2).fit_transform(x_train)
# Scatter the two components; truthy labels are drawn green, the rest red.
plt.scatter(
    x_embedded[:, 0],
    x_embedded[:, 1],
    color=['green' if label else 'red' for label in y_train],
)
plt.show()

With the right transformation, the dataset can even be linearly separated! We can expect high accuracy from even a simple model.

# Modeling

We need to first define a few other modules before we can get to the network.

## Configurations

In [None]:
# Number of samples per mini-batch fed to the training loader.
batch_size = 32
# Number of full passes over the training loader.
iterations = 100

## Data Pipeline

In [None]:
from torch.utils.data import Dataset


class TrainData(Dataset):
    """Map-style dataset that pairs each training sample with its label.

    Args:
        x_train: Indexable, sized collection of feature rows.
        y_train: Indexable, sized collection of labels aligned with ``x_train``.

    Raises:
        ValueError: If ``x_train`` and ``y_train`` differ in length.
    """

    def __init__(self, x_train, y_train):
        # Misaligned inputs would otherwise surface later as an IndexError in
        # the middle of training, or silently drop the extra labels.
        if len(x_train) != len(y_train):
            raise ValueError(
                'x_train and y_train must have the same length, got '
                f'{len(x_train)} and {len(y_train)}'
            )
        self.x_train = x_train
        self.y_train = y_train

    def __getitem__(self, index):
        """Return the ``(features, label)`` pair stored at ``index``."""
        return self.x_train[index], self.y_train[index]

    def __len__(self):
        """Return the number of samples."""
        return len(self.x_train)

    
class TestData(Dataset):
    """Map-style dataset over unlabeled samples, used for inference."""

    def __init__(self, x_test):
        # Keep a reference to the feature rows; no copy is made.
        self.x_test = x_test

    def __len__(self):
        """Return how many samples are available."""
        return len(self.x_test)

    def __getitem__(self, index):
        """Return the feature row stored at ``index``."""
        return self.x_test[index]

In [None]:
# Wrap the NumPy arrays as float32 tensors before handing them to the datasets.
train_data = TrainData(
    torch.from_numpy(x_train).float(),
    torch.from_numpy(y_train).float(),
)
test_data = TestData(torch.from_numpy(x_test).float())

In [None]:
from torch.utils.data import DataLoader

# Training batches are reshuffled on every pass over the loader.
train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
# Test samples are served one at a time, in their original order.
test_loader = DataLoader(test_data, batch_size=1)

## Model Definition

We'll be using a **Dropout** layer in the model to reduce the risk of the network overfitting the training set.

In [None]:
class Network(nn.Module):
    """Fully connected binary classifier that emits one raw logit per sample.

    The architecture is two hidden linear layers with ReLU activations,
    dropout after the second hidden layer, and a single-unit output layer.

    Args:
        in_features: Number of input features per sample (33 by default).
        hidden_features: Width of both hidden layers.
        dropout: Probability passed to the dropout layer.
    """

    def __init__(self, in_features=33, hidden_features=64, dropout=0.1):
        super().__init__()
        self.linear_1 = nn.Linear(in_features, hidden_features)
        self.linear_2 = nn.Linear(hidden_features, hidden_features)
        self.linear_3 = nn.Linear(hidden_features, 1)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, inputs):
        """Return the raw logits for ``inputs``; the last dimension has size 1."""
        out = F.relu(self.linear_1(inputs))
        out = F.relu(self.linear_2(out))
        out = self.dropout(out)
        # No sigmoid here: the output is meant for a loss that works on logits,
        # so callers apply `torch.sigmoid` when they need probabilities.
        return self.linear_3(out)

## Training

We'll use the **Area Under the ROC Curve** (AUC) metric for evaluating the model during training. Accuracy alone is not a good choice, considering the label imbalance present in the dataset.

In [None]:
# Binary cross-entropy computed directly on the raw logits of the network.
criterion = nn.BCEWithLogitsLoss()
network = Network()
# Adam over all trainable parameters of the network.
optimizer = optim.Adam(params=network.parameters(), lr=1e-4)

In [None]:
from sklearn.metrics import roc_auc_score

n_batches = len(train_loader)

# Training mode keeps the dropout layer active.
network.train()

# Per-iteration history, consumed by the plots further down.
loss_li = []
score_li = []

for it in range(iterations):
    it_loss = 0
    # Targets and predicted probabilities of every batch seen in this iteration.
    it_targets = []
    it_probs = []
    for x_batch, y_batch in train_loader:
        optimizer.zero_grad()
        # Add a trailing dimension so the targets match the (batch, 1) logits.
        y_real = y_batch.unsqueeze(1)
        y_pred = network(x_batch)
        loss = criterion(y_pred, y_real)
        loss.backward()
        optimizer.step()
        it_loss += loss.item()
        it_targets.append(y_real)
        # Detach so that keeping the probabilities does not keep the graph alive.
        it_probs.append(torch.sigmoid(y_pred.detach()))
    # Score the whole iteration at once. `roc_auc_score` raises a ValueError
    # when it is given a single class, which can happen for an individual
    # mini-batch, and the mean of per-batch AUCs is not the AUC of the data.
    # The predictions were made in training mode while the weights were still
    # being updated, so this is a running training-set estimate.
    it_score = roc_auc_score(
        torch.cat(it_targets).squeeze(1).numpy(),
        torch.cat(it_probs).squeeze(1).numpy(),
    )
    mean_loss = it_loss / n_batches
    loss_li.append(mean_loss)
    score_li.append(it_score)
    print(f'[Iteration {it}] Loss: {mean_loss:.4f}, Area-Under-Curve: {it_score:.4f}')
        

In [None]:
# Training loss averaged per iteration.
ax = plt.gca()
ax.plot(loss_li)
ax.set_xlabel('Iteration')
ax.set_ylabel('Binary Cross-Entropy Loss')
plt.show()

In [None]:
# Training AUC recorded per iteration.
ax = plt.gca()
ax.plot(score_li)
ax.set_xlabel('Iteration')
ax.set_ylabel('Area Under Curve')
plt.show()

## Evaluation

We'll compute and analyze the **Confusion Matrix** of the model using `test_data`. Then, we'll compute the **Precision**, **Recall**, and **F1 Score** of each class.

In [None]:
# Evaluation mode disables dropout.
network.eval()

# Flat list holding one predicted probability per test sample.
predictions = []

# Gradients are not needed for inference.
with torch.no_grad():
    for x_batch in test_loader:
        probs = torch.sigmoid(network(x_batch))
        # Flatten the (batch, 1) output and extend the list, so the result is
        # one float per sample whatever batch size the loader uses.
        # `squeeze().tolist()` followed by `append` only produces a flat list
        # when every batch holds exactly one sample.
        predictions.extend(probs.reshape(-1).tolist())

# Round the probabilities to hard 0/1 class predictions.
y_pred = np.round(predictions)

In [None]:
from sklearn.metrics import confusion_matrix

# Rows are the true classes, columns the predicted classes.
confusion_matrix(y_true=y_test, y_pred=y_pred)

In [None]:
from sklearn.metrics import classification_report

# Per-class precision, recall and F1 score, returned as a nested dict.
classification_report(y_true=y_test, y_pred=y_pred, output_dict=True)

Quite the performance! Looks like we didn't overfit after all :)