In [15]:
import os
import pandas as pd

import copy, math
import numpy as np
import matplotlib.pyplot as plt

df = pd.read_csv('multi_classification_train.csv')


X_train = df.iloc[1:4000,1:-2]
y_train = df.iloc[1:4000,-1]

def one_hot_encoding(y):
    unique_labels = np.unique(y)
    return unique_labels

print(one_hot_encoding(y_train))

for i in range(len(one_hot_encoding(y_train))):
    df[f'new_column_{i}'] = np.where(df['Class'] == i, 1, 0)

OHE_columns = df.iloc[1:4000, -5:]

def zscore_normalize_features(X):
    """
    computes  X, zcore normalized by column

    Args:
      X (ndarray (m,n))     : input data, m examples, n features

    Returns:
      X_norm (ndarray (m,n)): input normalized by column
      mu (ndarray (n,))     : mean of each feature
      sigma (ndarray (n,))  : standard deviation of each feature
    """
    # find the mean of each column/feature
    mu     = np.mean(X, axis=0)                 # mu will have shape (n,)
    # find the standard deviation of each column/feature
    sigma  = np.std(X, axis=0)                  # sigma will have shape (n,)
    # element-wise, subtract mu for that column from each example, divide by std for that column
    X_norm = (X - mu) / sigma

    return X_norm

X_train1 = zscore_normalize_features(X_train)

def sigmoid(z):
    return 1 / (1 + np.exp(-z))


def compute_cost(X, y, w, b):
  m = X.shape[0]  # Number of examples
  n = OHE_columns.shape[1]  # Number of classes (from your OHE_columns)

  b = b.reshape(1, -1)

  # Calculate the scores (z) for all examples
  z = np.dot(X, w.T) + b # Assuming w is a column vector

  # Calculate the sigmoid for all examples
  h = sigmoid(z + 0.001)  # Assuming sigmoid is already defined

  # Calculate the cost using element-wise multiplication and sum
  cost = -np.sum(OHE_columns * np.log(h)) / m

  return cost


def compute_gradient_softmax(X, y, w, b):
  m = X.shape[0]

  b = b.reshape(1, -1)

  # Calculate scores and probabilities
  z = np.dot(X, w.T) + b
  h = sigmoid(z)

  # Calculate gradients
  dj_dw = (1/m) * X.T @ (OHE_columns * h*(1 - h) + np.log(h))
  dj_db = (1/m) * np.sum(OHE_columns *h* (1 - h) + np.log(h),axis=0).to_numpy()

  return dj_dw.T, dj_db


def gradient_descent(X, y, w_in, b_in, alpha, num_iters):
    """
    Performs batch gradient descent to learn theta. Updates theta by taking
    num_iters gradient steps with learning rate alpha

    Args:
      X :    (array_like Shape (m, n)
      y :    (array_like Shape (m,))
      w_in : (array_like Shape (n, k))  Initial values of parameters of the model (weight matrix)
      b_in : (array_like Shape (k,))   Initial value of parameter of the model (bias vector)
      alpha : (float)                 Learning rate
      num_iters : (int)              number of iterations to run gradient descent

    Returns:
      w : (array_like Shape (n, k)) Updated values of parameters of the model after
          running gradient descent (weight matrix)
      b : (array_like Shape (k,))   Updated value of parameter of the model after
          running gradient descent (bias vector)
    """

    # number of training examples
    m = len(X)

    # An array to store cost J and w's at each iteration primarily for graphing later
    cost_history = []
    w_history = []

    w = copy.deepcopy(w_in)  #avoid modifying global w within function
    b = b_in

    for i in range(num_iters):

        # Calculate the gradient and update the parameters
        dj_dw, dj_db = compute_gradient_softmax(X, y, w, b)

        # Update Parameters using gradient descent
        w = w - alpha * dj_dw
        b = b - alpha * dj_db

        # Save cost J at each iteration
        if i<100000:      # prevent resource exhaustion
            cost =  compute_cost(X, y, w, b)
            cost_history.append(cost)

        # Print cost every at intervals 10 times or as many iterations if < 10
        if i% math.ceil(num_iters/10) == 0:
            w_history.append(w)


    return w, b, cost_history #return w and J,w history for graphing


# initialize parameters
initial_w = np.random.rand(len(one_hot_encoding(y_train)), len(X_train.columns))
initial_b = np.random.rand(len(one_hot_encoding(y_train)))
# some gradient descent settings
iterations = 4000
alpha = 5.0e-6
# run gradient descent
w_final, b_final, J_hist = gradient_descent(X_train1, y_train, initial_w, initial_b,
                                                    alpha, iterations)
print(f"b found by gradient descent: {b_final}")
print(f"w found by gradient descent: \n{w_final}")






[0 1 2 3 4]


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  return reduction(axis=axis, out=out, **passkwargs)
  return reduction(axis=axis, out=out, **passkwargs)
  return reduction(axis=axis, out=out, **passkwargs)
  return reduction(axis=axis, out=out, **passkwargs)
  return reduction(axis=axis, out=out, **passkwargs)
  return reduction(axis=axis, out=out, **passkwargs)
  return reduction(axis=axis, out=out, **passkwargs)
  return reduction(axis=axis, out=out, **passkwargs)
  return reduction(axis=axis, out=out, **passkwargs)
  return reduction(axis=axis, out=out, **passkwargs)
  return reduction(axis=axis, out=out, **passkwargs)
  return reduction(axis=axis, out=out, **passkwargs)
  return reduction(axis=axis, out=out, **passkwargs)
  return reduction(axis=axis, out=out, **passkwargs)
  return reduction(axis=axis, out=out, **passkwargs)
  return reduction(axis=axis, out=out, **passkwargs)
  return reduction(axis=axis, out=out, **passkwargs)
  return reduction(axis=axis, out=

b found by gradient descent: [0.34162952 0.2173062  0.70169814 0.8856784  0.54308429]
w found by gradient descent: 
              Feature_1  Feature_2  Feature_3  Feature_4  Feature_5  \
new_column_0   0.739481   0.184493   0.262320   0.264505   0.990217   
new_column_1   0.308049   0.782948   0.404819   0.654449   0.176641   
new_column_2   0.250160   0.582116   0.255697   0.676932   0.863369   
new_column_3   0.821945   0.456156   0.076042   0.577391   0.973486   
new_column_4   0.438816   0.210061   0.784030   0.928969   0.777950   

              Feature_6  Feature_7  Feature_8  Feature_9  Feature_10  \
new_column_0   0.070160   0.791444   0.734243   0.431352    0.782789   
new_column_1   0.161071   0.466801   0.030657   0.160545    0.115603   
new_column_2   0.183715   0.198659   0.864716   0.725849    0.248161   
new_column_3   0.455129   0.944169   0.470270   0.893470    0.415264   
new_column_4   0.706259   0.927731   0.962724   0.330471    0.001955   

              Feature_11

  return reduction(axis=axis, out=out, **passkwargs)
  return reduction(axis=axis, out=out, **passkwargs)
  return reduction(axis=axis, out=out, **passkwargs)


In [18]:
# CALCULATING THE PROBABILITY

probability = np.zeros((len(X_train),5))


# Initialize sum_ with zeros to store the accumulated probabilities
sum_ = np.zeros((X_train.shape[0], len(w_final)))

# Iterate over each set of weights and biases
for i in range(len(w_final)):
    # Calculate the sigmoid of the dot product for the current class
    sum_[:, i] = np.exp(np.dot(X_train, w_final.iloc[i]) + b_final[i])

# Calculate probabilities for each class by normalizing over rows
for i in range(len(one_hot_encoding(y_train))):
    probability[:, i] = sum_[:, i] / np.sum(sum_, axis=1)

print(probability[:30,:])


[[2.21830399e-090 1.66932158e-072 1.65070273e-123 7.23578083e-116
  1.00000000e+000]
 [1.44860757e-032 1.31696624e-053 4.85436755e-020 1.00000000e+000
  2.69102923e-035]
 [4.30707715e-102 1.81835193e-108 1.89532683e-054 1.00000000e+000
  1.13808551e-011]
 [1.29345137e-032 1.12512141e-035 9.49311986e-095 1.51628898e-036
  1.00000000e+000]
 [1.94604389e-043 4.84322185e-043 3.20768089e-021 2.35455923e-005
  9.99976454e-001]
 [1.35961742e-050 4.20862334e-006 4.62619314e-066 1.59527777e-055
  9.99995791e-001]
 [2.73771204e-143 1.58533411e-030 4.70755134e-118 6.93789156e-116
  1.00000000e+000]
 [4.55005308e-020 1.54055730e-101 1.28085086e-073 4.58037479e-064
  1.00000000e+000]
 [2.24461690e-131 2.70311126e-081 3.27573277e-074 2.71992881e-073
  1.00000000e+000]
 [1.19533895e-165 2.16551012e-119 1.11749602e-102 2.34908840e-164
  1.00000000e+000]
 [5.34970108e-033 1.73844907e-109 3.56883721e-019 1.00000000e+000
  4.08098048e-017]
 [3.49512644e-085 6.82101195e-033 3.85703328e-098 3.52063533e-053

  sum_[:, i] = np.exp(np.dot(X_train, w_final.iloc[i]) + b_final[i])
  probability[:, i] = sum_[:, i] / np.sum(sum_, axis=1)


In [19]:
for i in range(probability.shape[0]):
    # Find the index of the maximum element in the current row
    max_col_index = np.argmax(probability[i])

    # Get the maximum element in the current row
    max_element = probability[i, max_col_index]

    # Print the maximum element and its column index
    print(f"Row {i}: Max element = {max_element}, Column index = {max_col_index}, Target value : { y_train.iloc[i]}")

Row 0: Max element = 1.0, Column index = 4, Target value : 2
Row 1: Max element = 1.0, Column index = 3, Target value : 4
Row 2: Max element = 0.9999999999886191, Column index = 3, Target value : 2
Row 3: Max element = 1.0, Column index = 4, Target value : 2
Row 4: Max element = 0.9999764544076546, Column index = 4, Target value : 1
Row 5: Max element = 0.9999957913766585, Column index = 4, Target value : 2
Row 6: Max element = 1.0, Column index = 4, Target value : 3
Row 7: Max element = 1.0, Column index = 4, Target value : 1
Row 8: Max element = 1.0, Column index = 4, Target value : 4
Row 9: Max element = 1.0, Column index = 4, Target value : 2
Row 10: Max element = 1.0, Column index = 3, Target value : 2
Row 11: Max element = 1.0, Column index = 4, Target value : 2
Row 12: Max element = 1.0, Column index = 4, Target value : 3
Row 13: Max element = 1.0, Column index = 3, Target value : 4
Row 14: Max element = 1.0, Column index = 4, Target value : 1
Row 15: Max element = 0.99999999999

In [20]:
# calculating accuracy

count = 0

for i in range(probability.shape[0]):
  if max_col_index == y_train.iloc[i]:
    count += 1
print(f" correct pred: {count}, total prediction: {probability.shape[0]}")

 correct pred: 454, total prediction: 3999


In [None]:
def compute_gradient_softmax(X, y, w, b):

  dj_dw = np.zeros_like(w)
  dj_db = np.zeros_like(b)

  m = X_train.shape[0]

  p,q = w.shape

  for k in range(p):
    for j in range(q):

      for i in range(m):
        dj_dw[i,j]+= sigmoid(np.dot(X_train.iloc[i,:],w[k,:])+b[k])*np.exp(np.dot(X_train.iloc[i,:],w[k,:])+b[k])*X_train.iloc[i,j]

  for i in range(m):
    for j in range(q):
      dj_db[i]+= sigmoid(np.dot(X_train.iloc[i,:],w[j,:])+b[j])*np.exp(np.dot(X_train.iloc[i,:],w[j,:])+b[j])
  return dj_dw/m, dj_db/m