Importing the MNIST dataset

In [245]:
#imports.
import numpy as np
import matplotlib.pyplot as plt
import urllib
from collections import Counter

In [246]:
#retriving the data set from the source.
url = 'https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz'

urllib.request.urlretrieve(url, 'mnist.npz')

with np.load('mnist.npz') as data:
    train_images = data['x_train']
    train_labels = data['y_train']
    test_images = data['x_test']
    test_labels = data['y_test']

In [247]:
image_zero = []
image_one = []
image_two = []

for img in range(len(train_images)):
  if(train_labels[img]==0):
    image_zero.append(train_images[img])
  elif(train_labels[img]==1):
    image_one.append(train_images[img])
  elif(train_labels[img]==2):
    image_two.append(train_images[img])
  else:
    continue

In [248]:
#Vectorizing the Images of Label 0
image_zero = np.array(image_zero)
image_zero_dataset = (image_zero.reshape(len(image_zero),784)).T

#Vectorizing the Images of Label 1
image_one = np.array(image_one)
image_one_dataset = (image_one.reshape(len(image_one),784)).T

#Vectorizing the images of Label 2
image_two = np.array(image_two)
image_two_dataset = (image_two.reshape(len(image_two),784)).T

print(image_zero_dataset.shape)
print(image_one_dataset.shape)
print(image_two_dataset.shape)

(784, 5923)
(784, 6742)
(784, 5958)


In [249]:
def principal_component_analysis(dataset,p):
  #Calculating the mean of the dataset and centralizing the dataset
  mean = np.mean(dataset,axis=1,keepdims=True)
  centralized_dataset = dataset - mean

  #calculating covariance matrix , eigenvectors and respective eigenvalues.
  unbiased_cov = np.matmul(centralized_dataset,centralized_dataset.T)/((dataset.shape)[1]-1)
  eigenvalues , eigenvectors = np.linalg.eigh(unbiased_cov)

  #Normalizing the eigenvalues and sorting according the maximum variance
  norms = np.linalg.norm(eigenvectors, axis=1, keepdims=True)
  eigenvectors = eigenvectors/norms
  sorted_indices = np.argsort(eigenvalues)[::-1]
  sorted_eigenvalues = eigenvalues[sorted_indices]
  sorted_eigenvectors = eigenvectors[:, sorted_indices]

  #Calculating the reduced datamatrix
  principal_components = sorted_eigenvectors[:, :p]
  return np.matmul(principal_components.T,dataset),principal_components

In [250]:
#Number of dimensions p
p = 10

Y_zero,U_zero = principal_component_analysis(image_zero_dataset,p)
Y_one,U_one = principal_component_analysis(image_one_dataset,p)
Y_two,U_two = principal_component_analysis(image_two_dataset,p)

print(Y_zero.shape)

(10, 5923)


In [251]:
Y_zero = Y_zero.T
Y_one = Y_one.T
Y_two = Y_two.T

(5923, 10)
(6742, 10)
(5958, 10)
5923
[ 426.70405666 1227.79011258 1932.6713399   237.25823074  487.4988891
 -261.508726    140.17682899  -41.96176503  343.32886898  113.0417364 ]


In [261]:
dataset = []
label = []

for row in range(len(Y_zero)):
  dataset.append(Y_zero[row])

for row in range(len(Y_one)):
  dataset.append(Y_one[row])

for row in range(len(Y_two)):
  dataset.append(Y_two[row])


dataset = np.array(dataset)

for sz in range(len(Y_zero)):
  label.append(0)
for sz in range(len(Y_one)):
  label.append(1)
for sz in range(len(Y_two)):
  label.append(2)

label = np.array(label)



In [254]:
def find_best_split_numerical(X, y):
    best_gini = float('inf')
    best_feature = None
    best_threshold = None

    for feature in range(X.shape[1]):  # Iterate over each feature
        unique_values = np.unique(X[:, feature])
        for i in range(1, len(unique_values)):  # Iterate over unique values as potential thresholds
            threshold = (unique_values[i - 1] + unique_values[i]) / 2.0
            left_indices = X[:, feature] <= threshold
            right_indices = X[:, feature] > threshold
            gini = calculate_gini_impurity(y[left_indices], y[right_indices])
            if gini < best_gini:
                best_gini = gini
                best_feature = feature
                best_threshold = threshold

    return best_feature, best_threshold

def calculate_gini_impurity(labels_left, labels_right):
    # Calculate Gini impurity for a binary split
    total_samples = len(labels_left) + len(labels_right)
    gini_left = gini_impurity(labels_left) * (len(labels_left) / total_samples)
    gini_right = gini_impurity(labels_right) * (len(labels_right) / total_samples)
    return gini_left + gini_right

def gini_impurity(labels):
    # Calculate Gini impurity for a set of labels
    total_samples = len(labels)
    if total_samples == 0:
        return 0  # If there are no samples, Gini impurity is 0

    # Count the occurrences of each unique label
    label_counts = np.bincount(labels)

    # Calculate the probabilities of each label
    probabilities = label_counts / total_samples

    # Calculate Gini impurity using the formula: 1 - sum(p_i^2)
    gini_impurity = 1 - np.sum(probabilities ** 2)

    return gini_impurity


# Example usage:
best_feature, best_threshold = find_best_split_numerical(dataset, label)
print("Best split feature:", best_feature)
print("Best split threshold:", best_threshold)


Best split feature: 2
Best split threshold: -25.977254274933387


In [256]:
# Assuming X and y are your dataset and labels, and best_feature and best_threshold are obtained from find_best_split_numerical function

# Get the feature values for the best feature
best_feature_values = dataset[:, best_feature]

# Create boolean arrays to identify samples that belong to the left and right subsets
left_indices = best_feature_values <= best_threshold
right_indices = ~left_indices  # Invert the left indices to get the right indices

# Split the dataset and labels into left and right subsets
dataset_left = dataset[left_indices]
label_left = label[left_indices]
dataset_right = dataset[right_indices]
label_right = label[right_indices]


print(len(label_left))
print(len(label_right))

12700
5923


In [None]:
most_common_label_right = Counter(label_right).most_common(1)[0][0]
print(most_common_label_right)
print(label_right)
print(label_left)


In [None]:
best_feature_left , best_threshold_left = find_best_split_numerical(dataset_left,label_left)

print(best_feature_left)
print(best_threshold_left)

In [None]:
# Assuming X and y are your dataset and labels, and best_feature and best_threshold are obtained from find_best_split_numerical function

# Get the feature values for the best feature
best_feature_values = dataset_left[:, best_feature_left]

# Create boolean arrays to identify samples that belong to the left and right subsets
left_indices = best_feature_values <= best_threshold_left
right_indices = ~left_indices  # Invert the left indices to get the right indices

# Split the dataset and labels into left and right subsets
dataset_left1 = dataset_left[left_indices]
label_left1 = label_left[left_indices]
dataset_right1 = dataset_left[right_indices]
label_right1 = label_left[right_indices]

print(len(label_right1))
print(len(label_left1))

In [None]:
most_common_label_left1 = Counter(label_left1).most_common(1)[0][0]
most_common_label_right1 = Counter(label_right1).most_common(1)[0][0]
print(most_common_label_left1)
print(most_common_label_right1)

In [None]:
test_zero = []
test_one = []
test_two = []


for img in range(len(test_images)):
  if(test_labels[img]==0):
    test_zero.append(test_images[img])
  elif(test_labels[img]==1):
    test_one.append(test_images[img])
  elif(test_labels[img]==2):
    test_two.append(test_images[img])
  else:
    continue

In [262]:
#Vectorizing the Images of Label 0
test_zero = np.array(test_zero)
test_zero_dataset = (test_zero.reshape(len(test_zero),784)).T

#Vectorizing the Images of Label 1
test_one = np.array(test_one)
test_one_dataset = (test_one.reshape(len(test_one),784)).T

#Vectorizing the images of Label 2
test_two = np.array(test_two)
test_two_dataset = (test_two.reshape(len(test_two),784)).T

print(test_zero_dataset.shape)
print(test_one_dataset.shape)
print(test_two_dataset.shape)

(784, 980)
(784, 1135)
(784, 1032)


In [None]:
#Number of dimensions p
p = 10

test_Y_zero,test_U_zero = principal_component_analysis(test_zero_dataset,p)
test_Y_one,test_U_one = principal_component_analysis(test_one_dataset,p)
test_Y_two,test_U_two = principal_component_analysis(test_two_dataset,p)

In [None]:
test_Y_zero = test_Y_zero.T
test_Y_one = test_Y_one.T
test_Y_two = test_Y_two.T

In [None]:
print(test_Y_zero.shape)
print(test_Y_one.shape)
print(len(test_Y_two))

In [None]:
test_dataset = []
test_label = []

for row in range(len(test_Y_zero)):
  test_dataset.append(test_Y_zero[row])

for row in range(len(test_Y_one)):
  test_dataset.append(test_Y_one[row])

for row in range(len(test_Y_two)):
  test_dataset.append(test_Y_two[row])


test_dataset = np.array(test_dataset)

for sz in range(len(test_Y_zero)):
  test_label.append(0)
for sz in range(len(test_Y_one)):
  test_label.append(1)
for sz in range(len(test_Y_two)):
  test_label.append(2)

test_label = np.array(test_label)


In [257]:
def classifier(X):
  print(X[2])
  if(X[2]>-25.9):
    return 0
  else:
    if(X[1]<=344.696):
      return 1
    else:
      return 2

In [258]:
correct = 0
total = len(test_dataset)
correct0 = 0
correct1 = 0
correct2 = 0

In [259]:
for i in range(len(test_dataset)):
  sample = test_dataset[i]
  if(classifier(sample)==test_label[i]):
    correct+=1
    if(test_label[i]==0):
      correct0+=1
    elif(test_label[i]==1):
      correct1 += 1
    else:
      correct2+=1

print("accuracy : ",correct/total)
print(correct)
print(correct0)
print(correct1)
print(correct2)

2588.4586598998694
1814.5196525756123
2127.1044580918933
2629.3982346001517
2376.705995953687
1775.5667504300018
1300.2894955130132
2458.818103296443
1755.6299632531336
1310.905266062142
1905.3892328790407
1890.3404042168506
1856.9048524291943
1664.1464844696102
2065.320456418766
1123.9453551394893
2146.044123552536
2046.3148244746756
1682.0393260651945
1169.4807400341242
2407.270755715885
1853.93156239631
2313.431222545463
1130.3214652779216
978.2020525884495
3200.3786426689057
2361.152265543866
1223.7478665606054
2345.4440500807727
1557.4361858597435
1435.9163831177957
1154.8841686492462
794.5723493831855
2274.196850639714
1755.1082153706914
2996.141220859903
2044.9915387768715
703.6539841633888
1838.7029770936117
1846.1031632034817
912.5319059962767
1443.8056132039035
1742.9210740586461
1286.5868575498182
1739.7237891631544
2179.6573730261143
1062.3081890330575
1663.7991352436843
1383.5716306941736
1791.463546797691
1522.9566075732725
2505.4933848502847
1384.7237489408212
1305.50925