<a href="https://colab.research.google.com/github/indhu68/Intro_to_ML_project/blob/main/FNN_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
import pandas as pd
import torch
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score
import torch.optim as optim
torch.set_printoptions(edgeitems=2, linewidth=75)
torch.manual_seed(123)

<torch._C.Generator at 0x7bcce0f235d0>

In [2]:
from sklearn.metrics import accuracy_score, classification_report

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

Using device: cuda


In [4]:
# Load the data from the URL
data_url = "https://raw.githubusercontent.com/indhu68/Intro_to_ML_project/main/train_dataframe.csv"
data = pd.read_csv(data_url)

# Remove the 'label' column and store it separately
removed_column = data.pop('label')  # Assuming 'label' is the name of the column to be removed
Y = pd.DataFrame({'age group': removed_column})

# Remaining data
X = data.values



In [5]:
sc = StandardScaler()
X = sc.fit_transform(X)
X.shape

(30694, 23)

In [6]:
label_mapping = {label: idx for idx, label in enumerate(np.unique(Y['age group']))}
Y = np.array([label_mapping[label] for label in Y['age group']])
X = torch.tensor(X, dtype=torch.float32)
Y.shape

(30694,)

In [7]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Convert NumPy arrays to PyTorch tensors
X_train_tensor = torch.FloatTensor(X_train)
y_train_tensor = torch.LongTensor(y_train)
X_test_tensor = torch.FloatTensor(X_test)
y_test_tensor = torch.LongTensor(y_test)







In [12]:

import torch.nn as nn
import torch.optim as optim
import numpy as np
class SimpleNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(SimpleNN, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)  # Using hidden_size here too
        self.fc3 = nn.Linear(hidden_size, hidden_size)  # And here
        self.fc4 = nn.Linear(hidden_size, output_size)
        self.relu = nn.ReLU()
        # Dropout layer to prevent overfitting
        self.dropout = nn.Dropout(p=0.2)

    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.relu(self.fc2(x))
        x = self.dropout(x)
        x = self.relu(self.fc3(x))
        x = self.dropout(x)
        x = self.fc4(x)
        return x




In [13]:
from sklearn.utils.class_weight import compute_class_weight

def compute_class_weights(y):
    class_weights = compute_class_weight(  class_weight = 'balanced', classes = np.unique(y),y =  y)
    return class_weights

# Calculate class weights based on class distribution
class_weights = compute_class_weights(y_train)

# Convert class weights to a PyTorch tensor
class_weights_tensor =  torch.FloatTensor(class_weights).to(device)

# Use weighted cross-entropy loss
criterion = nn.CrossEntropyLoss(weight=class_weights_tensor)


In [14]:
input_size = X_train.shape[1]
hidden_size = 128  # You can adjust this based on your problem
output_size = len(np.unique(y_train))
# Now, when instantiating the model, it should match:
model = SimpleNN(input_size, hidden_size, output_size).to(device)
X_train_tensor = X_train_tensor.to(device)  # Move training data to GPU
y_train_tensor = y_train_tensor.to(device)
X_test_tensor = X_test_tensor.to(device)  # Move test data to GPU
y_test_tensor = y_test_tensor.to(device)
optimizer = optim.SGD(model.parameters(), lr=0.01)

In [15]:
num_epochs = 500
batch_size = 32
for epoch in range(num_epochs):
    model.train()
    for i in range(0, len(X_train_tensor), batch_size):
        inputs = X_train_tensor[i:i+batch_size]
        labels = y_train_tensor[i:i+batch_size]

        # Forward pass
        outputs = model(inputs)
        loss = criterion(outputs, labels)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    if (epoch + 1) % 100 == 0:
        model.eval()
        with torch.no_grad():
            train_outputs = model(X_train_tensor)
            val_outputs = model(X_test_tensor)
            _, train_predicted = torch.max(train_outputs, 1)
            train_accuracy = accuracy_score(y_train, train_predicted.cpu().numpy())
            _, test_predicted = torch.max(val_outputs, 1)
            test_accuracy = accuracy_score(y_test, test_predicted.cpu().numpy())  # Move back to CPU for metrics
            print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}, Training Accuracy: {train_accuracy * 100:.2f}%, Validation Accuracy: {test_accuracy * 100:.2f}%')
        model.train()

Epoch [100/500], Loss: 1.2152, Training Accuracy: 65.61%, Validation Accuracy: 59.94%
Epoch [200/500], Loss: 0.7394, Training Accuracy: 74.33%, Validation Accuracy: 66.64%
Epoch [300/500], Loss: 0.5854, Training Accuracy: 77.65%, Validation Accuracy: 68.61%
Epoch [400/500], Loss: 0.5027, Training Accuracy: 79.44%, Validation Accuracy: 69.64%
Epoch [500/500], Loss: 0.6261, Training Accuracy: 80.89%, Validation Accuracy: 69.70%


In [16]:
# Load the data from the URL
data_url = "https://raw.githubusercontent.com/indhu68/Intro_to_ML_project/main/test_dataframe.csv"
data = pd.read_csv(data_url)

# Remove the 'label' column and store it separately
removed_column = data.pop('label')  # Assuming 'label' is the name of the column to be removed
Y = pd.DataFrame({'age group': removed_column})

# Remaining data
X = data.values
sc = StandardScaler()
X = sc.fit_transform(X)
label_mapping = {label: idx for idx, label in enumerate(np.unique(Y['age group']))}
y_test = np.array([label_mapping[label] for label in Y['age group']])
X_test= torch.tensor(X, dtype=torch.float32)
X_test_tensor = torch.FloatTensor(X_test)
X_test_tensor = X_test_tensor.to(device)

In [18]:
model.eval()
with torch.no_grad():
    outputs = model(X_test_tensor)
    _, predicted = torch.max(outputs, 1)
    accuracy = accuracy_score(y_test, predicted.cpu().numpy())  # Move back to CPU for metrics
    print(f'Accuracy on the test set: {accuracy * 100:.2f}%')

# Convert predicted labels back to original age group strings
predicted_labels = [label for label, idx in label_mapping.items() if idx in y_test]
print(predicted.cpu().numpy())
print(classification_report(y_test, predicted.cpu().numpy(), target_names=predicted_labels))

Accuracy on the test set: 70.10%
[7 6 6 ... 1 6 7]
              precision    recall  f1-score   support

    eighties       0.57      0.80      0.67         5
     fifties       0.74      0.73      0.74       205
    fourties       0.67      0.72      0.70       236
   seventies       0.63      0.92      0.75        36
     sixties       0.78      0.88      0.82        88
       teens       0.61      0.71      0.65       117
    thirties       0.71      0.65      0.68       389
    twenties       0.72      0.66      0.69       466

    accuracy                           0.70      1542
   macro avg       0.68      0.76      0.71      1542
weighted avg       0.70      0.70      0.70      1542

