In [81]:
from google.colab import drive
import numpy as np

drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [82]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

data = pd.read_json("/content/drive/MyDrive/News_Category_Dataset_v3.json", lines=True)

top_categories = data['category'].value_counts().head(12).index.tolist()
print(top_categories)
data_top12 = data[data['category'].isin(top_categories)]

X = data_top12['headline']
y = data_top12['category']

label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.125, random_state=42)


tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
print(X_train_tfidf.shape)
X_val_tfidf = tfidf_vectorizer.transform(X_val)
X_test_tfidf = tfidf_vectorizer.transform(X_test)


['POLITICS', 'WELLNESS', 'ENTERTAINMENT', 'TRAVEL', 'STYLE & BEAUTY', 'PARENTING', 'HEALTHY LIVING', 'QUEER VOICES', 'FOOD & DRINK', 'BUSINESS', 'COMEDY', 'SPORTS']
(94684, 5000)


In [83]:
# def oneHotConvert(y,classes):
#   y=y.astype(int)
#   encoded_output = np.zeros((len(y), classes))
#   for i in range(len(y)):
#     # encoded_output[i][y[i]]=1

#   return encoded_output

In [84]:
# y_train=oneHotConvert(y_train,12)
# y_test=oneHotConvert(y_test,12)
# y_val=oneHotConvert(y_val,12)
# print(y_train.shape)

In [85]:
class NeuralNetwork:

    def __init__(self, layers):
        self.layers = layers

    # forward pass
    def forward_pass(self, inputs):
        activations = inputs
        for layer in self.layers:
            activations = layer.forward_pass(activations, saved_weights=None)
        return activations

    # backward pass
    def backprop(self, outputs):
        gradients = outputs
        for layer in reversed(self.layers):
            gradients = layer.backprop(gradients)

    # applying stochastic gradient descent (SGD)
    def apply_sgd(self):
        for layer in self.layers:
            layer.apply_sgd()

    # applying Adam optimizer
    def apply_adam(self):
        for layer in self.layers:
            layer.apply_adam()

    # changing learning rate alpha
    def change_learning_rate(self):
        for layer in self.layers:
            layer.change_learning_rate()

    # saving weights
    def save_parameters(self):
        for index, layer in enumerate(self.layers):
            layer.save_parameters()

    # predicting after loading weights
    def predict(self, inputs):
        activations = inputs
        for layer in self.layers:
            activations = layer.forward_pass(activations, saved_weights=1)
        return activations


In [86]:
class AdamOptimizer:
    def __init__(self, learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8):
        self.learning_rate = learning_rate
        self.beta1 = beta1
        self.beta2 = beta2
        self.epsilon = epsilon
        self.first_moment = None
        self.second_moment = None
        self.time_step = 0

    def update_parameters(self, gradients):
        if self.first_moment is None:
            self.first_moment = np.zeros_like(gradients)
            self.second_moment = np.zeros_like(gradients)

        self.time_step += 1
        self.first_moment = self.beta1 * self.first_moment + (1 - self.beta1) * gradients
        self.second_moment = self.beta2 * self.second_moment + (1 - self.beta2) * (gradients ** 2)
        m_corrected = self.first_moment / (1 - self.beta1 ** self.time_step)
        v_corrected = self.second_moment / (1 - self.beta2 ** self.time_step)
        return self.learning_rate * m_corrected / (np.sqrt(v_corrected) + self.epsilon)


In [87]:
class SoftmaxClassifier:

    def __init__(self, num_classes):
        self.num_classes = num_classes

    def encode_one_hot(self, actual_pred):
        # Example of one-hot encoding with numpy
        one_hot_encoded = np.zeros(self.num_classes)
        one_hot_encoded[actual_pred] = 1
        return one_hot_encoded

    def forward_pass(self, logits, saved_weights=None):
        # Adjust the logits to avoid numerical instability
        shifted_logits = logits - np.max(logits, axis=0, keepdims=True)

        # Compute exponentiated values of the adjusted logits
        exp_values = np.exp(shifted_logits)

        # Normalize the exponentiated values to get softmax probabilities
        self.softmax_probabilities = exp_values / np.sum(exp_values, axis=0, keepdims=True)

        return self.softmax_probabilities

    def backprop(self, target_labels):
        # Compute one-hot encoded labels
        one_hot_labels = self.encode_one_hot(target_labels)

        # Calculate gradient of the loss function
        gradient = (self.softmax_probabilities - one_hot_labels)
        return gradient

    def apply_sgd(self):
        # Placeholder for applying stochastic gradient descent updates
        pass

    def update_learning_rate(self):
        # Placeholder for updating the learning rate
        pass

    def apply_adam(self):
        # Placeholder for applying Adam optimizer updates
        pass

    def save_parameters(self):
        # Placeholder for saving parameters
        pass


In [88]:
class FlattenLayer:

    def __init__(self):
        pass

    def forward_pass(self, input_data, saved_weights=None):
        # Store the original shape of the data for use during backpropagation
        self.original_shape = input_data.shape

        # Flatten the input data into a 1D array
        flattened_data = input_data.flatten()
        return flattened_data

    def backprop(self, gradient):
        # Reshape the gradient to the original input shape during backpropagation
        return gradient.reshape(self.original_shape)

    def apply_sgd(self):
        # Placeholder for applying stochastic gradient descent updates
        pass

    def update_learning_rate(self):
        # Placeholder for updating the learning rate
        pass

    def apply_adam(self):
        # Placeholder for applying Adam optimizer updates
        pass

    def save_parameters(self):
        # Placeholder for saving parameters
        pass


In [89]:
class ReLUActivation:

    def __init__(self):
        pass

    def forward_pass(self, inputs, saved_weights=None):
        self.inputs = inputs
        return np.maximum(0, inputs)

    def derivative(self):
        return np.where(self.inputs > 0, 1, 0)

    def backprop(self, gradient_from_next_layer):
        return gradient_from_next_layer * self.derivative()

    def apply_sgd(self):
        pass

    def update_learning_rate(self):
        pass

    def apply_adam(self):
        pass

    def save_parameters(self):
        pass


In [90]:
class LinearLayer:

    def __init__(self, input_dim, output_dim, learning_rate=0.01, layer_index=0, regularization=None, regularization_penalty=0):
        self.weights = np.random.randn(input_dim, output_dim) / (input_dim * output_dim)
        self.biases = np.zeros((output_dim,))
        self.weight_optimizer = AdamOptimizer(learning_rate=learning_rate)
        self.bias_optimizer = AdamOptimizer(learning_rate=learning_rate)
        self.learning_rate = learning_rate
        self.layer_index = layer_index
        self.regularization = regularization
        self.regularization_penalty = regularization_penalty

    def forward_pass(self, input_data, saved_weights=None):
        if saved_weights is not None:
            saved_data = np.load(f'/content/drive/MyDrive/Colab Notebooks/Saved_Models/Linear_layer{self.layer_index}.npz')
            self.weights = saved_data['arr1']
            self.biases = saved_data['arr2']

        self.input_data = input_data
        self.z = np.dot(input_data, self.weights) + self.biases
        return self.z

    def backprop(self, grad_previous):
        batch_size = self.input_data.shape[0]
        self.grad_weights = np.dot(self.input_data.reshape(-1, 1), grad_previous.reshape(1, -1))
        self.grad_biases = grad_previous.sum(axis=0) / batch_size
        self.grad_input = np.dot(grad_previous, self.weights.T)

        if self.regularization == 'l1':
            grad_weights += self.regularization_penalty * np.sign(self.weights)
            grad_biases += self.regularization_penalty * np.sign(self.biases)
        elif self.regularization == 'l2':
            grad_weights += 2 * self.regularization_penalty * self.weights
            grad_biases += 2 * self.regularization_penalty * self.biases
        elif self.regularization == 'elastic':
            grad_weights += self.regularization_penalty * (0.5 * np.sign(self.weights) + 0.5 * self.weights)
            grad_biases += self.regularization_penalty * (0.5 * np.sign(self.biases) + 0.5 * self.biases)

        return self.grad_input

    def apply_sgd(self):
        self.weights -= self.learning_rate * self.grad_weights
        self.biases -= self.learning_rate * self.grad_biases

    def apply_adam(self):
        self.weights -= self.weight_optimizer.update(self.grad_weights)
        self.biases -= self.bias_optimizer.update(self.grad_biases)

    def update_learning_rate(self):
        self.learning_rate /= 5

    def save_parameters(self):
        np.savez(f'/content/drive/MyDrive/Colab Notebooks/Saved_Models/Linear_layer{self.layer_index}.npz', arr1=self.weights, arr2=self.biases)


In [91]:
class SelfAttentionLayer:

    def __init__(self, dim, key_dim, learning_rate=0.01, layer_index=0, regularization=None, regularization_penalty=0):
        self.key_dim = key_dim
        self.weights_key = np.random.randn(dim, key_dim) / (dim * key_dim)
        print(self.weights_key.shape)
        self.weights_query = np.random.randn(dim, key_dim) / (dim * key_dim)
        self.weights_value = np.random.randn(dim, key_dim) / (dim * key_dim)
        self.optimizer_key = AdamOptimizer(learning_rate=learning_rate)
        self.optimizer_query = AdamOptimizer(learning_rate=learning_rate)
        self.optimizer_value = AdamOptimizer(learning_rate=learning_rate)
        self.learning_rate = learning_rate
        self.layer_index = layer_index
        self.regularization = regularization
        self.regularization_penalty = regularization_penalty

    def compute_softmax(self, X):
        shift = X - np.max(X, axis=1, keepdims=True)
        exps = np.exp(shift)
        output = exps / np.sum(exps, axis=1, keepdims=True)
        return output

    def forward_pass(self, X, saved_weights=None):
        if saved_weights is not None:
            saved_data = np.load(f'/content/drive/MyDrive/Colab Notebooks/Saved_Models/SelfAttention_layer{self.layer_index}.npz')
            self.weights_key = saved_data['arr1']
            self.weights_query = saved_data['arr2']
            self.weights_value = saved_data['arr3']
        self.X=X
        # print(X.shape)
        # print(self.weights_query.shape)
        self.Q = X @self.weights_query
        # self.Q = np.matmul(X, self.weights_query)
        self.K = np.matmul(X, self.weights_key)
        self.V = np.matmul(X, self.weights_value)

        scores = np.matmul(self.Q, self.K.T) / np.sqrt(self.key_dim)
        self.attention_weights = self.compute_softmax(scores)
        output = np.matmul(self.attention_weights, self.V)

        return output

    def backprop(self, gradient_previous):
        self.grad_weights_value = np.matmul(np.matmul(self.X.T, self.attention_weights.T), gradient_previous)
        t1 = np.multiply(self.attention_weights, np.matmul(gradient_previous, self.V.T))
        t2 = self.Q - np.matmul(self.attention_weights, self.Q)
        self.grad_weights_key = (1/np.sqrt(self.key_dim)) * np.matmul(np.matmul(self.X.T, t1), t2)

        t3 = np.sum(np.multiply(self.attention_weights, np.matmul(gradient_previous, self.V.T)), axis=1)
        t4 = np.multiply(t3, self.attention_weights)
        t5 = np.multiply(self.attention_weights, np.matmul(gradient_previous, self.V.T)) - t4
        self.grad_weights_query = (1/np.sqrt(self.key_dim)) * np.matmul(np.matmul(self.X.T, t5), self.K)

        return 0

    def apply_sgd(self):
        self.weights_key -= self.learning_rate * self.grad_weights_key
        self.weights_query -= self.learning_rate * self.grad_weights_query
        self.weights_value -= self.learning_rate * self.grad_weights_value

    def apply_adam(self):
        self.weights_key -= self.optimizer_key.update(self.grad_weights_key)
        self.weights_query -= self.optimizer_query.update(self.grad_weights_query)
        self.weights_value -= self.optimizer_value.update(self.grad_weights_value)

    def update_learning_rate(self):
        self.learning_rate /= 5

    def save_parameters(self):
        np.savez(f'/content/drive/MyDrive/Colab Notebooks/Saved_Models/SelfAttention_layer{self.layer_index}.npz', arr1=self.weights_key, arr2=self.weights_query, arr3=self.weights_value)


In [92]:
class CrossEntropyLoss:

    def _init_(self):
        pass
    def compute(self, A, Y):
        ce_loss = - np.log(A[Y])
        return ce_loss

In [93]:
# class PCA:
#     def _init_(self, n_components=125):

#         self.n_components = n_components
#         self.mean = None
#         self.components = None
#         self.explained_variance_ratio = None

#     def fit(self, X):

#         n_samples, n_features = X.shape

#         # Subtract the mean from the data
#         self.mean = np.mean(X, axis=0)
#         X_centered = X - self.mean

#         # Calculate the covariance matrix
#         covariance_matrix = np.cov(X_centered.T)

#         # Calculate the eigenvalues and eigenvectors
#         eigenvalues, eigenvectors = np.linalg.eig(covariance_matrix)

#         eigenvectors = eigenvectors.T

#         idxs = np.argsort(eigenvalues)[::-1]
#         eigenvalues = eigenvalues[idxs]
#         eigenvectors = eigenvectors[idxs]

#         self.components = eigenvectors[:self.n_components]

#         # Sort the eigenvalues and eigenvectors in descending order

#         self.explained_variance_ratio = eigenvalues[idxs[:self.n_components]] / np.sum(eigenvalues)

#         return self

#     def transform(self, X):

#         if self.components is None:
#             raise ValueError("You must fit the PCA model first.")

#         X_centered = X - self.mean
#         X_projected = np.dot(X_centered, self.components.T)

#         return X_projected

#     def fit_transform(self, X):

#         self.fit(X)
#         X_projected = self.transform(X)

#         return X_projected

In [94]:
def positional_encoding(position, d_model):
    """
    Args:
    position: int, the length of the sequence.
    d_model: int, the dimensionality of the model's output.

    Returns:
    A numpy array shape (1, position, d_model) containing the positional encodings.
    """
    # Create an array of positions (0, 1, ..., position-1) and reshape it to use broadcasting
    angle_rads = np.arange(position)[:, np.newaxis]

    # Compute the frequencies
    angle_rates = 1 / np.power(10000, (2 * (np.arange(d_model)[np.newaxis, :] // 2)) / d_model)

    # Apply sin to even indices in the array; 2i
    angle_rads[:, 0::2] = np.sin(angle_rads[:, 0::2] * angle_rates[:, 0::2])

    # Apply cos to odd indices in the array; 2i+1
    angle_rads[:, 1::2] = np.cos(angle_rads[:, 1::2] * angle_rates[:, 1::2])

    # Add a new axis for batch size at the beginning
    pos_encoding = angle_rads[np.newaxis, ...]

    return pos_encoding

In [95]:
def zeroMean(X):
    X-=np.mean(X, axis=0)
    return X

def PCA(X, k):
    X=zeroMean(X)
    V= np.cov(X.T)
    print(X.shape)
    print(V.shape)
    eig_value, eig_vec=np.linalg.eig(V)
    sorted_eig_values=np.argsort(eig_value)[::-1]
    vectors=eig_vec[:, sorted_eig_values[:k]]
    W=vectors
    print(W.shape)
    # print(W[0])
    Z=np.dot(X, W)
    print(Z.shape)
    return Z

In [96]:
x_train_transform=X_train_tfidf.reshape(X_train_tfidf.shape[0], -1)
# abc=zeroMean(x_train_transform)
xyz=PCA(x_train_transform,100)

(94684, 5000)
(5000, 5000)
(5000, 100)
(94684, 100)


In [97]:
print(xyz.shape)

(94684, 100)


In [98]:
X_test_tfidf_dense = X_test_tfidf.toarray()
X_test_tfidf = X_test_tfidf_dense[:, :, np.newaxis]


In [99]:
encoding=positional_encoding(100,1)

In [100]:
X_train_tfidf_dense = np.asarray(xyz)
X_train_tfidf = X_train_tfidf_dense[:, :, np.newaxis]

In [101]:
X_train_tfidf +=encoding

In [102]:
print(X_train_tfidf.shape)

(94684, 100, 1)


In [106]:
completeNetwork = NeuralNetwork([
    SelfAttentionLayer(dim=1, key_dim=20, learning_rate=0.2),
    FlattenLayer(),
    LinearLayer(input_dim=2000, output_dim=256, learning_rate=0.2),
    ReLUActivation(),
    LinearLayer(input_dim=256, output_dim=64, learning_rate=0.2),
    ReLUActivation(),
    LinearLayer(input_dim=64, output_dim=12, learning_rate=0.2),
    SoftmaxClassifier(num_classes=12)
])
crossEntropyLoss = CrossEntropyLoss()
total_loss = 0
num_epochs = 5
num_images = 500

(1, 20)


In [107]:
for epoch in range(num_epochs):
    total_loss = 0
    predicted_labels = []
    print(X_train_tfidf.shape)
    for index, data in enumerate(X_train_tfidf):
        # print("before ,",data.shape)
        # if index %1000==0:
        #   print(index)
        probabilities = completeNetwork.forward_pass(data)
        predicted_labels.append(np.argmax(probabilities))
        # k=np.argmax(y_train[index])
        total_loss += crossEntropyLoss.compute(probabilities, y_train[index])
        completeNetwork.backprop(k)
        completeNetwork.apply_sgd()
    accuracy = np.mean(predicted_labels == y_train)
    average_loss = total_loss /X_train_tfidf.shape[0]
    print(f"Epoch {epoch+1}: Accuracy is {accuracy:.20%}, Loss is {average_loss:.20f}")

(94684, 100, 1)
Epoch 1: Accuracy is 7.17333445988762541390%, Loss is 12.82558694215131112060
(94684, 100, 1)
Epoch 2: Accuracy is 7.17122217058848399063%, Loss is 14.23224168024199265403
(94684, 100, 1)
Epoch 3: Accuracy is 7.17122217058848399063%, Loss is 14.75449891162117133092
(94684, 100, 1)
Epoch 4: Accuracy is 7.17122217058848399063%, Loss is 15.09275575986468354017
(94684, 100, 1)
Epoch 5: Accuracy is 7.17122217058848399063%, Loss is 15.34386915552108199279
