In [1]:
import numpy as np
import pandas as pd

In [60]:
# Read the training table from 'train.csv' in the working directory.
df = pd.read_csv('train.csv')
# Read the test table; it is transformed and fed to the model in a later cell.
test_df = pd.read_csv('test.csv')

In [61]:
# Split the frame: column 0 holds the labels, the remaining columns are features.
y_train = df.iloc[:, 0]
x_train = df.drop(columns=df.columns[0])
# One-hot encode the labels into an (n_samples, 10) float matrix by selecting,
# for each label k, row k of the 10x10 identity matrix.
y_one_hot = np.eye(10)[y_train.to_numpy()]
def sigmoid(x):
    """Return the logistic function ``1 / (1 + exp(-x))``, applied element-wise."""
    denominator = 1 + np.exp(-x)
    return 1 / denominator
def sigmoid_derivative(x):
    """Return ``x * (1 - x)``, element-wise.

    This expression equals the derivative of the sigmoid only when ``x`` is
    already a sigmoid *output* (s' = s * (1 - s)); it does not apply the
    sigmoid itself.
    """
    complement = 1 - x
    return x * complement
# Squash every feature value through the logistic function (element-wise).
# NOTE(review): sigmoid(0) == 0.5 and large inputs saturate towards 1.0, so if
# these columns are 0-255 pixel intensities (assumption - confirm) most of the
# contrast is lost; dividing by 255 may be what was intended.
x_train = sigmoid(x_train)

In [4]:
# Quick look at the labels, the transformed features, and the feature-matrix shape.
for preview in (y_train.head(), x_train.head(), x_train.shape):
    print(preview)

0    1
1    0
2    1
3    4
4    0
Name: label, dtype: int64
   pixel0  pixel1  pixel2  pixel3  pixel4  pixel5  pixel6  pixel7  pixel8  \
0     0.5     0.5     0.5     0.5     0.5     0.5     0.5     0.5     0.5   
1     0.5     0.5     0.5     0.5     0.5     0.5     0.5     0.5     0.5   
2     0.5     0.5     0.5     0.5     0.5     0.5     0.5     0.5     0.5   
3     0.5     0.5     0.5     0.5     0.5     0.5     0.5     0.5     0.5   
4     0.5     0.5     0.5     0.5     0.5     0.5     0.5     0.5     0.5   

   pixel9  ...  pixel774  pixel775  pixel776  pixel777  pixel778  pixel779  \
0     0.5  ...       0.5       0.5       0.5       0.5       0.5       0.5   
1     0.5  ...       0.5       0.5       0.5       0.5       0.5       0.5   
2     0.5  ...       0.5       0.5       0.5       0.5       0.5       0.5   
3     0.5  ...       0.5       0.5       0.5       0.5       0.5       0.5   
4     0.5  ...       0.5       0.5       0.5       0.5       0.5       0.5   

   pixe

In [5]:
# Number of feature columns in x_train.
input_size = len(x_train.columns)
input_size

784

In [6]:
# Length-10 vector of zeros; the trailing bare expression displays it in the notebook.
output_layer = np.zeros((10,))
output_layer

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [7]:
class Linear:
    """Fully connected layer computing ``input . weights + biases``.

    Attributes:
        weights: Array of shape (input_size, output_size), drawn from a
            standard normal distribution and scaled by 0.01.
        biases: Zero-initialised array of shape (1, output_size).
        input: The most recent input given to ``forward`` (cached for
            ``backward``); ``None`` before the first forward pass.
        dL_dW, dL_db: Parameter gradients from the most recent ``backward``
            call; ``None`` until then. The layer never applies them itself;
            an external optimizer is expected to read them.
    """

    def __init__(self, input_size, output_size):
        self.input_size, self.output_size = input_size, output_size
        # Small random weights, zero biases.
        self.weights = 0.01 * np.random.randn(input_size, output_size)
        self.biases = np.zeros((1, output_size))
        # Nothing is cached until forward()/backward() have run.
        self.input = None
        self.dL_dW = None
        self.dL_db = None

    def forward(self, input):
        """Cache ``input`` and return the affine transform of it."""
        self.input = input
        projected = np.dot(input, self.weights)
        return projected + self.biases

    def backward(self, dL_dy):
        """Store parameter gradients and return the gradient w.r.t. the input.

        Args:
            dL_dy: Gradient of the loss with respect to this layer's output.

        Returns:
            Gradient of the loss with respect to this layer's input.
        """
        # Bias gradient: sum the upstream gradient over the batch axis.
        self.dL_db = np.sum(dL_dy, axis=0, keepdims=True)
        # Weight gradient: cached input (transposed) times upstream gradient.
        self.dL_dW = np.dot(self.input.T, dL_dy)
        return np.dot(dL_dy, self.weights.T)

# Assuming ReLU, Softmax, and other components are correctly implemented

In [8]:
class ReLU:
    """Rectified linear activation: keeps positive values, zeroes the rest."""

    def forward(self, X):
        """Return ``max(0, X)`` element-wise and remember where ``X <= 0``."""
        self.mask = X <= 0
        return np.maximum(0, X)

    def backward(self, dL_dy):
        """Pass the upstream gradient through only where the input was > 0."""
        passes = ~self.mask
        grad_input = np.zeros_like(dL_dy)
        # Positions with non-positive input keep a zero gradient.
        grad_input[passes] = dL_dy[passes]
        return grad_input

class Sigmoid:
    """Logistic activation layer."""

    def forward(self, X):
        """Return ``1 / (1 + exp(-X))`` and cache it for the backward pass."""
        decay = np.exp(-X)
        self.out = 1 / (1 + decay)
        return self.out

    def backward(self, dL_dy):
        """Scale the upstream gradient by the local slope ``out * (1 - out)``."""
        complement = 1 - self.out
        return dL_dy * self.out * complement

class Tanh:
    """Hyperbolic-tangent activation layer."""

    def forward(self, X):
        """Return ``tanh(X)`` and cache it for the backward pass."""
        self.out = np.tanh(X)
        return self.out

    def backward(self, dL_dy):
        """Scale the upstream gradient by the local slope ``1 - out**2``."""
        squared = self.out ** 2
        return dL_dy * (1 - squared)

class Softmax:
    """Row-wise softmax activation with a vectorized backward pass."""

    def __init__(self):
        self.out = None  # Output of the most recent forward pass

    def forward(self, X):
        """
        Applies the softmax function to each row of the input.

        Args:
            X (numpy.ndarray): Input data of shape (batch_size, num_classes).

        Returns:
            numpy.ndarray: Softmax probabilities of shape (batch_size, num_classes).
        """
        # Subtracting the row maximum leaves the result unchanged but keeps
        # np.exp from overflowing on large inputs.
        shifted = X - np.max(X, axis=1, keepdims=True)
        exp_X = np.exp(shifted)
        self.out = exp_X / np.sum(exp_X, axis=1, keepdims=True)
        return self.out

    def backward(self, dL_dy):
        """
        Computes the gradient of the loss with respect to the softmax input.

        For one sample with softmax output ``s`` the Jacobian is
        ``diag(s) - s s^T``, so the Jacobian-vector product with an upstream
        gradient ``g`` is ``s * (g - sum(g * s))``. Evaluating that expression
        for the whole batch at once avoids building a (num_classes x
        num_classes) matrix per sample in a Python loop.

        Args:
            dL_dy (numpy.ndarray): Gradient of the loss with respect to the
                softmax output, shape (batch_size, num_classes).

        Returns:
            numpy.ndarray: Gradient of the loss with respect to the softmax
            input, shape (batch_size, num_classes).
        """
        upstream = np.asarray(dL_dy, dtype=float)
        s = self.out
        # Per-row dot product <g, s>, kept as a column so it broadcasts.
        weighted_sum = np.sum(upstream * s, axis=1, keepdims=True)
        return s * (upstream - weighted_sum)



In [9]:
class MSELoss:
    """Mean squared error averaged over every element of the prediction."""

    def forward(self, y_true, y_pred):
        """Return ``mean((y_true - y_pred) ** 2)`` and store it on ``self.loss``.

        Args:
            y_true: Target values.
            y_pred: Predicted values, broadcast-compatible with ``y_true``.

        Returns:
            The scalar mean squared error.
        """
        self.loss = np.mean((y_true - y_pred) ** 2)
        return self.loss

    def backward(self, y_true, y_pred):
        """Return the gradient of ``forward`` with respect to ``y_pred``.

        ``forward`` averages over *all* elements, so the derivative is
        ``2 * (y_pred - y_true) / number_of_elements``. Dividing only by the
        number of rows would scale the gradient up by the number of columns
        for 2-D targets.

        Args:
            y_true: Target values.
            y_pred: Predicted values.

        Returns:
            Array with the same shape as ``y_pred - y_true``.
        """
        diff = y_pred - y_true
        # Same element count that np.mean divides by in forward().
        n_elements = np.size(diff)
        return (2 * diff) / n_elements


In [10]:
class CrossEntropyLoss:
    """Categorical cross-entropy on predicted probabilities and one-hot labels."""

    # Predictions are clipped to [EPSILON, 1 - EPSILON] before any log or
    # division so neither log(0) nor a division by zero can occur.
    EPSILON = 1e-15

    def _clip(self, y_pred):
        """Return ``y_pred`` clipped into the numerically safe range."""
        return np.clip(y_pred, self.EPSILON, 1 - self.EPSILON)

    def forward(self, y_one_hot, y_pred):
        """
        Computes the Cross-Entropy loss.

        Args:
            y_one_hot (numpy.ndarray): True labels (one-hot encoded).
            y_pred (numpy.ndarray): Predicted probabilities from the model.

        Returns:
            float: The cross-entropy, summed over classes and averaged over
            the rows of the batch.
        """
        safe_pred = self._clip(y_pred)
        per_sample = np.sum(y_one_hot * np.log(safe_pred), axis=1)
        return -np.mean(per_sample)

    def backward(self, y_one_hot, y_pred):
        """
        Computes the gradient of the Cross-Entropy loss with respect to the
        predicted probabilities.

        The predictions are clipped exactly as in ``forward``. Without the
        clip, a probability that is exactly 0 yields ``0 / 0 = NaN`` (or
        ``inf`` on the true class), which then spreads through the rest of
        the backward pass.

        Args:
            y_one_hot (numpy.ndarray): True labels (one-hot encoded).
            y_pred (numpy.ndarray): Predicted probabilities from the model.

        Returns:
            numpy.ndarray: Gradient of the loss with respect to the predicted
            probabilities, same shape as ``y_pred``.
        """
        # Number of samples (forward() averages over rows).
        m = y_one_hot.shape[0]
        safe_pred = self._clip(y_pred)
        return -(y_one_hot / safe_pred) / m

        

In [11]:
class SGD:
    """Plain gradient-descent optimizer with a fixed learning rate."""

    def __init__(self, learning_rate=0.01):
        """
        Stores the step size.

        Args:
            learning_rate (float): Factor applied to each gradient before it
                is subtracted from the parameters.
        """
        self.learning_rate = learning_rate

    def step(self, model):
        """
        Applies one in-place update to every parameterised layer of ``model``.

        Args:
            model: Object exposing ``layers``; layers that have a ``weights``
                attribute must also provide ``biases``, ``dL_dW`` and
                ``dL_db``.
        """
        lr = self.learning_rate
        for layer in model.layers:
            if not hasattr(layer, 'weights'):
                # Layers without weights have nothing to update.
                continue
            layer.weights -= lr * layer.dL_dW
            layer.biases -= lr * layer.dL_db


In [12]:
class Model:
    """Sequential container tying together layers, a loss and an optimizer.

    Layers are applied in insertion order by ``forward`` and in reverse order
    by ``backward``. Any object with ``forward(X)`` and ``backward(grad)``
    methods can be used as a layer; layers exposing ``weights``/``biases``
    attributes are treated as trainable by ``save`` and ``load``.
    """

    def __init__(self):
        self.layers = []
        self.loss_fn = None
        self.optimizer = None

    def add_layer(self, layer):
        """
        Adds a layer to the end of the model.

        Args:
            layer: An instance of a layer class (e.g., Linear, ReLU, Softmax).
        """
        self.layers.append(layer)

    def compile(self, loss_fn, optimizer):
        """
        Attaches the loss function and optimizer used by ``train``/``evaluate``.

        Args:
            loss_fn: Object with ``forward(y_true, y_pred)`` and
                ``backward(y_true, y_pred)`` (e.g., CrossEntropyLoss).
            optimizer: Object with ``step(model)`` (e.g., SGD).
        """
        self.loss_fn = loss_fn
        self.optimizer = optimizer

    def forward(self, X):
        """
        Performs the forward pass through every layer in order.

        Args:
            X (numpy.ndarray): Input data.

        Returns:
            numpy.ndarray: Output from the final layer.
        """
        for layer in self.layers:
            X = layer.forward(X)
        return X

    def backward(self, y_one_hot, y_pred):
        """
        Performs the backward pass through the model.

        Args:
            y_one_hot (numpy.ndarray): True labels.
            y_pred (numpy.ndarray): Predicted outputs.
        """
        # Start from the loss gradient and walk the layers last-to-first.
        dL_dout = self.loss_fn.backward(y_one_hot, y_pred)
        for layer in reversed(self.layers):
            dL_dout = layer.backward(dL_dout)

    def train(self, X_train, y_one_hot, epochs=10, batch_size=10):
        """
        Trains the model with mini-batch updates.

        Batches are consecutive slices taken in the order the data is given
        (no shuffling). The loss printed after each epoch is the loss of that
        epoch's final mini-batch only.

        Args:
            X_train (numpy.ndarray): Training input data.
            y_one_hot (numpy.ndarray): Training labels (one-hot encoded).
            epochs (int): Number of training epochs.
            batch_size (int): Size of each training batch (default is 10).
        """
        for epoch in range(epochs):
            for i in range(0, len(X_train), batch_size):
                X_batch = X_train[i:i + batch_size]
                y_batch = y_one_hot[i:i + batch_size]

                # Forward pass
                y_pred = self.forward(X_batch)

                # Compute loss
                loss = self.loss_fn.forward(y_batch, y_pred)

                # Backward pass
                self.backward(y_batch, y_pred)

                # Update parameters using the optimizer
                self.optimizer.step(self)

            print(f"Epoch [{epoch + 1}/{epochs}], Loss: {loss:.4f}")

    def predict(self, X):
        """
        Makes predictions for the input data.

        Args:
            X (numpy.ndarray): Input data.

        Returns:
            numpy.ndarray: Output of the final layer for ``X``.
        """
        return self.forward(X)

    def evaluate(self, X_test, y_test):
        """
        Evaluates the model on the given data.

        Args:
            X_test (numpy.ndarray): Test input data.
            y_test (numpy.ndarray): Test labels (one-hot encoded).

        Returns:
            float: Loss on the test data.
        """
        y_pred = self.forward(X_test)
        loss = self.loss_fn.forward(y_test, y_pred)
        return loss

    def save(self, filename):
        """
        Saves every layer's parameters to CSV files.

        Layer ``i`` is written to ``{filename}_layer_{i}_weights.csv`` and
        ``{filename}_layer_{i}_biases.csv``; layers without those attributes
        are skipped.

        Args:
            filename (str): Base name (prefix) of the files to write.
        """
        for i, layer in enumerate(self.layers):
            if hasattr(layer, 'weights'):
                np.savetxt(f"{filename}_layer_{i}_weights.csv", layer.weights, delimiter=",")
            if hasattr(layer, 'biases'):
                np.savetxt(f"{filename}_layer_{i}_biases.csv", layer.biases, delimiter=",")

    def load(self, filename, X=None, output_path="y_predicted.csv"):
        """
        Restores parameters written by ``save`` and writes predictions to CSV.

        The loaded arrays are assigned back onto the layers (reshaped to each
        layer's current parameter shape, since ``np.loadtxt`` returns 1-D
        arrays for single-row or single-column files). Predictions are then
        computed with the model's regular ``forward`` pass, so every layer
        type takes part, and are written to ``output_path`` preceded by one
        row holding the column indices ``0 .. num_outputs - 1``.

        Args:
            filename (str): The base name of the files for weights and biases.
            X: Data to predict on. When omitted, the notebook-level global
                ``x_train`` is used (the previous behavior).
            output_path (str): Destination CSV for the predictions.

        Raises:
            ValueError: If a stored array does not have as many elements as
                the corresponding layer parameter.
        """
        for i, layer in enumerate(self.layers):
            if hasattr(layer, 'weights'):
                stored = np.loadtxt(f"{filename}_layer_{i}_weights.csv", delimiter=",")
                layer.weights = stored.reshape(np.shape(layer.weights))
            if hasattr(layer, 'biases'):
                stored = np.loadtxt(f"{filename}_layer_{i}_biases.csv", delimiter=",")
                layer.biases = stored.reshape(np.shape(layer.biases))

        if X is None:
            # Backward compatibility: fall back to the notebook's global.
            X = x_train

        y_predicted = np.asarray(self.forward(X))

        # First row of the file is a header-like row of column indices.
        header_row_y_pred = np.arange(y_predicted.shape[1])
        y_predicted = np.insert(y_predicted, 0, header_row_y_pred, axis=0)

        # Save predictions to CSV
        np.savetxt(output_path, y_predicted, delimiter=",")
        print(f"Predictions saved to {output_path}")


In [13]:
# Assemble the network: Linear(784 -> 128), ReLU, Linear(128 -> 10), Softmax.
model = Model()
for layer in (
    Linear(input_size=784, output_size=128),
    ReLU(),
    Linear(input_size=128, output_size=10),
    Softmax(),
):
    model.add_layer(layer)

# Cross-entropy loss, optimised with SGD at a learning rate of 0.01.
model.compile(loss_fn=CrossEntropyLoss(), optimizer=SGD(learning_rate=0.01))

# Run 100 epochs over the training data in mini-batches of 20.
model.train(x_train, y_one_hot, epochs=100, batch_size=20)

# Write each layer's weights and biases to CSV files prefixed 'my_model'.
model.save('my_model')

Epoch [1/100], Loss: 0.5836
Epoch [2/100], Loss: 0.3024
Epoch [3/100], Loss: 0.2254
Epoch [4/100], Loss: 0.2038
Epoch [5/100], Loss: 0.1936
Epoch [6/100], Loss: 0.1821
Epoch [7/100], Loss: 0.1732
Epoch [8/100], Loss: 0.1603
Epoch [9/100], Loss: 0.1530
Epoch [10/100], Loss: 0.1386
Epoch [11/100], Loss: 0.1286
Epoch [12/100], Loss: 0.1229
Epoch [13/100], Loss: 0.1162
Epoch [14/100], Loss: 0.1073
Epoch [15/100], Loss: 0.0994
Epoch [16/100], Loss: 0.0957
Epoch [17/100], Loss: 0.0856
Epoch [18/100], Loss: 0.0797
Epoch [19/100], Loss: 0.0743
Epoch [20/100], Loss: 0.0726
Epoch [21/100], Loss: 0.0686
Epoch [22/100], Loss: 0.0664
Epoch [23/100], Loss: 0.0621
Epoch [24/100], Loss: 0.0586
Epoch [25/100], Loss: 0.0532
Epoch [26/100], Loss: 0.0517
Epoch [27/100], Loss: 0.0487
Epoch [28/100], Loss: 0.0459
Epoch [29/100], Loss: 0.0425
Epoch [30/100], Loss: 0.0395
Epoch [31/100], Loss: 0.0383
Epoch [32/100], Loss: 0.0379
Epoch [33/100], Loss: 0.0363
Epoch [34/100], Loss: 0.0335
Epoch [35/100], Loss: 0

In [14]:
# Read the 'my_model_layer_*' CSV files written by model.save() and write the
# model's predictions for x_train to y_predicted.csv.
model.load('my_model')

Predictions saved to y_predicted.csv


In [27]:
# Read back the probabilities written by model.load(). The first row of that
# file is the inserted row of column indices, which read_csv consumes as the
# header, leaving one row per sample.
pred_df = pd.read_csv('y_predicted.csv')
pred = pred_df.to_numpy()
# Predicted class = index of the largest probability in each row.
# (No pre-allocation is needed: argmax returns a new array of the right length.)
predictions = np.argmax(pred, axis=1)
train_pred_df = pd.DataFrame(predictions)
# Collapse the single-column frame so it can be compared against y_train.
train_pred_df_squeezed = train_pred_df.squeeze()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,3.86e-07,0.999,8.42e-06,1.34e-05,2.1e-05,3.39e-05,7.74e-05,7.08e-06,0.00114,2.06e-07
1,1.0,9.9e-16,2.83e-07,2.56e-10,8.38e-12,1.09e-07,1.34e-07,8.79e-09,6.22e-09,4.05e-05
2,2.7e-10,0.999,1.47e-05,0.000209,1.76e-07,5.3e-06,3.01e-06,1.58e-05,0.00108,4.7e-07
3,3.9e-06,5.79e-09,0.000103,6.99e-07,0.999,1.45e-05,0.000175,2.73e-05,3.75e-05,0.00025
4,1.0,3.46e-16,3.99e-07,1.63e-11,5.37e-15,1.79e-08,1.83e-09,1.58e-10,5.54e-09,1.28e-07


In [79]:
# Element-wise comparison of the true labels with the predicted labels.
correct_predictions = y_train == train_pred_df_squeezed
num_correct = correct_predictions.sum()
# Fraction of training rows whose predicted label matches the true label.
accuracy = num_correct / len(y_train)
print(f"Number of Correct Predictions: {num_correct}")
print(f"Accuracy: {accuracy:.4f}")

Number of Correct Predictions: 41277
Accuracy: 0.9828


In [103]:
# Apply the same sigmoid transform that was applied to x_train, but keep the
# result under a new name: overwriting test_df would make this cell apply the
# transform a second time whenever it is re-run.
x_test = sigmoid(test_df)
# The model's last layer is Softmax, so these values are class probabilities
# (the variable name is kept so any later cell that uses it still works).
y_test_pred_logits = model.predict(x_test)
# Predicted class = index of the largest probability in each row.
test_predictions = np.argmax(y_test_pred_logits, axis=1)
test_pred_df = pd.DataFrame(test_predictions)
np.savetxt("test_predictions.csv", test_pred_df, delimiter=",")