In [1]:
import os
import pandas as pd

In [2]:
!pip install -q kaggle

In [None]:
# Install the Kaggle API token: copy /content/kaggle.json into ~/.kaggle
!mkdir -p ~/.kaggle
!cp /content/kaggle.json ~/.kaggle/
# Restrict the token file to owner read/write only
!chmod 600 ~/.kaggle/kaggle.json

In [14]:
# Download the deepu1109/star-dataset archive with the Kaggle CLI
!kaggle datasets download -d deepu1109/star-dataset

Dataset URL: https://www.kaggle.com/datasets/deepu1109/star-dataset
License(s): copyright-authors
Downloading star-dataset.zip to /content
  0% 0.00/3.16k [00:00<?, ?B/s]
100% 3.16k/3.16k [00:00<00:00, 6.64MB/s]


In [15]:
# Unzip and prep dataset
!unzip star-dataset.zip

Archive:  star-dataset.zip
  inflating: 6 class csv.csv         


In [16]:
# Load the extracted CSV into a DataFrame
df = pd.read_csv("/content/6 class csv.csv")

In [17]:
# Preview the first three rows
df.head(3)

Unnamed: 0,Temperature (K),Luminosity(L/Lo),Radius(R/Ro),Absolute magnitude(Mv),Star type,Star color,Spectral Class
0,3068,0.0024,0.17,16.12,0,Red,M
1,3042,0.0005,0.1542,16.6,0,Red,M
2,2600,0.0003,0.102,18.7,0,Red,M


In [200]:
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Preprocess and prepare data
class StarDataset(Dataset):
    """Star-type classification dataset built from the "6 class csv.csv" file.

    The CSV is shuffled, the categorical columns are one-hot encoded, the rows
    are split 70/15/15 into train/val/test, and the features are standardised
    with statistics fitted on the training split only.

    Args:
        split: Partition served by this instance: "train", "val" or "test".
        csv_path: Location of the CSV file.
        random_state: Seed for the shuffle and both splits. Instances built
            with the same seed see the same partitioning.

    Raises:
        ValueError: If ``split`` is not one of the three supported names.
    """

    def __init__(self, split="train", csv_path="/content/6 class csv.csv", random_state=1999):
        # Fail fast, before any file I/O, on an unknown split name
        if split not in ("train", "val", "test"):
            raise ValueError("Split type invalid.")

        # Shuffle required - class labels in order
        df = pd.read_csv(csv_path).sample(frac=1, random_state=random_state)
        features, labels = self._encode(df)

        # Hold out 30% of the rows, then halve that remainder into val and test
        X_train, X_temp, y_train, y_temp = train_test_split(
            features, labels, test_size=0.3, random_state=random_state)
        X_val, X_test, y_val, y_test = train_test_split(
            X_temp, y_temp, test_size=0.5, random_state=random_state)

        # Standardize features using statistics from the training split only
        scaler = StandardScaler()
        splits = {
            "train": (scaler.fit_transform(X_train), y_train),
            "val": (scaler.transform(X_val), y_val),
            "test": (scaler.transform(X_test), y_test),
        }

        # Keep every split available as an attribute, as before
        self.train = splits["train"]
        self.val = splits["val"]
        self.test = splits["test"]

        # Features and labels served by __len__ / __getitem__
        self.features, self.labels = splits[split]

    @staticmethod
    def _encode(df):
        """Convert the raw frame into ``(features, labels)`` numpy arrays.

        The target is selected by column name rather than by position, so the
        label matrix always has exactly one column per star type present and
        the target can never end up among the features.
        """
        # One-hot encode the target column on its own
        labels = pd.get_dummies(df["Star type"]).astype(int).values
        # Every other column is a feature; the categorical ones are one-hot encoded
        features = pd.get_dummies(
            df.drop(columns=["Star type"]),
            columns=["Star color", "Spectral Class"],
        ).astype(float).values
        return features, labels

    def __len__(self):
        # Number of samples in the selected split
        return len(self.features)

    def __getitem__(self, idx):
        # Return the sample at idx as float32 (features, one-hot label) tensors
        features = torch.tensor(self.features[idx], dtype=torch.float32)
        labels = torch.tensor(self.labels[idx], dtype=torch.float32)
        return features, labels


In [201]:
# Build one dataset object per split (train first, then test, then val)
train_dataset, test_dataset, val_dataset = (
    StarDataset(split=split_name) for split_name in ("train", "test", "val")
)

In [202]:
# Number of samples in each split
len(train_dataset), len(test_dataset), len(val_dataset)

(168, 36, 36)

In [203]:
# Inspect one training sample: a (features, one-hot label) pair of tensors
train_dataset[0]

(tensor([ 0.0470,  0.3419,  3.1139, -1.4434,  1.5520, -0.5498, -0.0774, -0.2085,
         -0.1098, -0.0774, -0.0774,  2.9803, -0.1098, -0.0774, -0.0774, -0.9535,
         -0.1348, -0.0774,  0.0000, -0.0774, -0.0774, -0.1348, -0.1751, -0.1098,
         -0.2516]),
 tensor([1., 0., 0., 0., 0., 0.]))

In [204]:
# Sanity check: print every training sample from index 160 to the end
for sample_idx in range(160, len(train_dataset)):
  print(train_dataset[sample_idx])

(tensor([ 1.1760, -0.5317, -0.4150, -0.7352,  0.3614, -0.5498, -0.0774, -0.2085,
        -0.1098, -0.0774, -0.0774,  2.9803, -0.1098, -0.0774, -0.0774, -0.9535,
        -0.1348, -0.0774,  0.0000, -0.0774, -0.0774, -0.1348, -0.1751, -0.1098,
        -0.2516]), tensor([1., 0., 0., 0., 0., 0.]))
(tensor([-0.7583, -0.5678, -0.4280,  1.1410, -1.4245, -0.5498, -0.0774, -0.2085,
        -0.1098, -0.0774, -0.0774, -0.3355, -0.1098, -0.0774, -0.0774,  1.0488,
        -0.1348, -0.0774,  0.0000, -0.0774, -0.0774, -0.1348, -0.1751, -0.1098,
        -0.2516]), tensor([0., 0., 0., 0., 1., 0.]))
(tensor([-0.7289, -0.5678, -0.4278,  1.1429, -1.4245, -0.5498, -0.0774, -0.2085,
        -0.1098, -0.0774, -0.0774, -0.3355, -0.1098, -0.0774, -0.0774,  1.0488,
        -0.1348, -0.0774,  0.0000, -0.0774, -0.0774, -0.1348, -0.1751, -0.1098,
        -0.2516]), tensor([0., 0., 0., 0., 1., 0.]))
(tensor([-0.7196, -0.5678, -0.4276,  0.7083, -0.8292, -0.5498, -0.0774, -0.2085,
        -0.1098, -0.0774, -0.0774, -0

In [205]:
# Create dataloaders.
# Only the training loader shuffles and drops a ragged final batch. The test and
# val loaders keep every sample so the reported metrics cover the whole split,
# even when its size is not a multiple of the batch size.
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True, drop_last=True, num_workers=2)
test_loader = DataLoader(test_dataset, batch_size=4, shuffle=False, drop_last=False, num_workers=2)
val_loader = DataLoader(val_dataset, batch_size=4, shuffle=False, drop_last=False, num_workers=2)

In [206]:
# Sanity check: print the first batch from the training loader, then stop
for batch_idx, batch in enumerate(train_loader):
  print(batch_idx, batch)
  break

0 [tensor([[-0.7635, -0.5678, -0.4272,  0.5580, -0.8292, -0.5498, -0.0774, -0.2085,
         -0.1098, -0.0774, -0.0774, -0.3355, -0.1098, -0.0774, -0.0774,  1.0488,
         -0.1348, -0.0774,  0.0000, -0.0774, -0.0774, -0.1348, -0.1751, -0.1098,
         -0.2516],
        [ 0.7564, -0.5678, -0.4282,  0.6341, -0.2339,  1.8187, -0.0774, -0.2085,
         -0.1098, -0.0774, -0.0774, -0.3355, -0.1098, -0.0774, -0.0774, -0.9535,
         -0.1348, -0.0774,  0.0000, -0.0774, -0.0774, -0.1348, -0.1751, -0.1098,
         -0.2516],
        [-0.7509, -0.5678, -0.4280,  1.1814, -1.4245, -0.5498, -0.0774, -0.2085,
         -0.1098, -0.0774, -0.0774, -0.3355, -0.1098, -0.0774, -0.0774,  1.0488,
         -0.1348, -0.0774,  0.0000, -0.0774, -0.0774, -0.1348, -0.1751, -0.1098,
         -0.2516],
        [-0.7600, -0.5678, -0.4275,  0.6466, -0.8292, -0.5498, -0.0774, -0.2085,
         -0.1098, -0.0774, -0.0774, -0.3355, -0.1098, -0.0774, -0.0774,  1.0488,
         -0.1348, -0.0774,  0.0000, -0.0774, -0.0

In [207]:
# cnt = 0
# for epc in [1, 2, 3]:
#   for features, label in train_loader:
#     if cnt ==0:
#       print(features, label)
#       cnt += 1
#     print(features.size(), label.size())


In [214]:
# Custom model adapted to our Star classification use case
from torch import nn
from torchvision.models import resnet18, ResNet18_Weights

class StarClassifier(nn.Module):
    """Fully connected classifier: two ReLU hidden layers followed by a linear output layer.

    Args:
        input_dim: Number of input features per sample.
        hidden_dim: Width of both hidden layers.
        num_classes: Number of output scores produced per sample.
    """

    def __init__(self, input_dim, hidden_dim, num_classes):
        super().__init__()
        # Layers, in the order they are applied in forward()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.out_layer = nn.Linear(hidden_dim, num_classes)
        # Shared activation for both hidden layers
        self.relu = nn.ReLU()

    def forward(self, x):
        """Return the output-layer scores for a batch ``x`` (no softmax is applied)."""
        hidden = x
        for layer in (self.fc1, self.fc2):
            hidden = self.relu(layer(hidden))
        return self.out_layer(hidden)

In [215]:
# Init custom StarClassifier model
torch.manual_seed(1999)  # seed torch's global RNG so the weight initialisation is reproducible
num_classes = train_dataset.labels.shape[1]  # one output per one-hot label column
input_dim = train_dataset.features.shape[1]  # one input per feature column
hidden_dim = 64
model = StarClassifier(input_dim, hidden_dim, num_classes)

In [216]:
import torch.optim as optim

# Define training func
def train(model, train_loader, val_loader, epochs=15, lr=0.001):
    """Train ``model`` with Adam and cross-entropy loss, validating after every epoch.

    Args:
        model: Module mapping a feature batch to per-class scores.
        train_loader: Iterable of ``(features, labels)`` batches used for optimisation.
            Labels are one-hot / class-probability tensors.
        val_loader: Iterable of ``(features, labels)`` batches used for validation.
            May be empty, in which case validation is skipped.
        epochs: Number of passes over ``train_loader``.
        lr: Learning rate handed to the Adam optimizer.

    Raises:
        ValueError: If ``train_loader`` yields no batches.
    """
    criterion = nn.CrossEntropyLoss()  # Define loss
    optimizer = optim.Adam(model.parameters(), lr=lr)  # Init optimizer

    for epoch in range(epochs):
        # Training
        model.train()  # Model set to train mode
        total_loss = 0.0  # Total loss for this epoch
        num_train_batches = 0  # Counted while iterating, so the average matches the batches actually seen

        for features, labels in train_loader:
            optimizer.zero_grad()  # Zero out grads
            outputs = model(features)  # Fwd pass
            loss = criterion(outputs, labels)  # Comp loss
            loss.backward()  # Backprop to comp grads
            optimizer.step()  # Update model params
            total_loss += loss.item()  # Accumulate batch loss
            num_train_batches += 1

        if num_train_batches == 0:
            raise ValueError("train_loader produced no batches.")

        print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss/num_train_batches}")

        # Validation
        model.eval()  # Model set to eval mode
        total_val_loss = 0.0  # Total val loss
        num_val_batches = 0
        correct = 0  # Number of correct preds
        total = 0  # Total number of preds made

        # Gradients are not computed during the validation phase
        with torch.no_grad():
            for features, labels in val_loader:
                outputs = model(features)
                total_val_loss += criterion(outputs, labels).item()
                num_val_batches += 1
                predicted = outputs.argmax(dim=1)  # Predicted class per sample
                total += labels.size(0)
                # Labels are one-hot, so argmax recovers the true class index
                correct += (predicted == labels.argmax(dim=1)).sum().item()

        # An empty val loader (e.g. split smaller than one dropped batch) must not crash training
        if total == 0:
            print("Validation skipped: val_loader produced no samples.")
            continue

        print(f"Validation Loss: {total_val_loss/num_val_batches}")
        print(f"Validation Accuracy: {100 * correct / total}%")

# Train the model for 15 epochs; the loss and validation metrics are printed after each epoch
train(model, train_loader, val_loader, epochs=15)

Epoch 1/15, Loss: 1.5143501815341769
Validation Loss: 1.2088967561721802
Validation Accuracy: 88.88888888888889%
Epoch 2/15, Loss: 0.9226263981489908
Validation Loss: 0.5116879277759128
Validation Accuracy: 97.22222222222223%
Epoch 3/15, Loss: 0.43414605213772683
Validation Loss: 0.2021969167722596
Validation Accuracy: 97.22222222222223%
Epoch 4/15, Loss: 0.2369448540820962
Validation Loss: 0.13748991986115774
Validation Accuracy: 97.22222222222223%
Epoch 5/15, Loss: 0.16180765768513083
Validation Loss: 0.08527907191051377
Validation Accuracy: 97.22222222222223%
Epoch 6/15, Loss: 0.12790452613539638
Validation Loss: 0.0640392622186078
Validation Accuracy: 97.22222222222223%
Epoch 7/15, Loss: 0.11642233439765516
Validation Loss: 0.04256307354403867
Validation Accuracy: 97.22222222222223%
Epoch 8/15, Loss: 0.1161257500172637
Validation Loss: 0.07079499007927047
Validation Accuracy: 94.44444444444444%
Epoch 9/15, Loss: 0.10875373038773735
Validation Loss: 0.05386512250536018
Validation Ac

In [217]:
# Eval Phase
def evaluate(model, test_loader):
    """Print the classification accuracy of ``model`` over ``test_loader``.

    Args:
        model: Module mapping a feature batch to per-class scores.
        test_loader: Iterable of ``(features, labels)`` batches with one-hot labels.
            May be empty, in which case a notice is printed instead of a percentage.
    """
    model.eval()
    correct = 0
    total = 0
    # No gradients are needed for evaluation
    with torch.no_grad():
        for features, labels in test_loader:
            predicted = model(features).argmax(dim=1)  # Predicted class per sample
            total += labels.size(0)
            # Labels are one-hot, so argmax recovers the true class index
            correct += (predicted == labels.argmax(dim=1)).sum().item()

    # Guard against an empty loader instead of dividing by zero
    if total == 0:
        print("Test Accuracy: n/a (test_loader produced no samples)")
        return

    print(f"Test Accuracy: {100 * correct / total}%")

# Report the trained model's accuracy on the test loader
evaluate(model, test_loader)

Test Accuracy: 91.66666666666667%
