In [14]:
#Library
import pandas as pd
import numpy as np

#data visualization
import matplotlib.pyplot as plt
import seaborn as sns

#Machine Learning Modules
import statsmodels.api as sm 
from scipy import stats
from sklearn import linear_model
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier  
from sklearn.neighbors import KNeighborsClassifier
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier  
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix,accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split


In [2]:
# Load the dataset
df = pd.read_csv('letters.csv')

In [3]:
df.shape

(42000, 46)

In [4]:
df.head()

Unnamed: 0,label,pixel43,pixel44,pixel92,pixel124,pixel125,pixel126,pixel127,pixel128,pixel129,...,pixel329,pixel351,pixel410,pixel411,pixel412,pixel413,pixel414,pixel415,pixel416,pixel417
0,1,0,0,0,0,0,0,0,0,0,...,0,254,0,0,0,0,0,0,0,0
1,0,0,0,0,137,137,192,86,72,1,...,254,0,0,75,254,254,254,17,0,0
2,1,0,0,0,3,141,139,3,0,0,...,0,184,0,0,0,0,0,0,0,0
3,4,0,0,0,0,0,0,0,0,0,...,0,0,94,255,69,0,0,0,0,0
4,0,0,0,0,155,254,254,254,157,30,...,253,0,0,0,223,253,253,253,129,0


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42000 entries, 0 to 41999
Data columns (total 46 columns):
 #   Column    Non-Null Count  Dtype
---  ------    --------------  -----
 0   label     42000 non-null  int64
 1   pixel43   42000 non-null  int64
 2   pixel44   42000 non-null  int64
 3   pixel92   42000 non-null  int64
 4   pixel124  42000 non-null  int64
 5   pixel125  42000 non-null  int64
 6   pixel126  42000 non-null  int64
 7   pixel127  42000 non-null  int64
 8   pixel128  42000 non-null  int64
 9   pixel129  42000 non-null  int64
 10  pixel130  42000 non-null  int64
 11  pixel131  42000 non-null  int64
 12  pixel132  42000 non-null  int64
 13  pixel133  42000 non-null  int64
 14  pixel134  42000 non-null  int64
 15  pixel135  42000 non-null  int64
 16  pixel136  42000 non-null  int64
 17  pixel137  42000 non-null  int64
 18  pixel138  42000 non-null  int64
 19  pixel146  42000 non-null  int64
 20  pixel147  42000 non-null  int64
 21  pixel148  42000 non-null  int64
 22

In [6]:
df.isnull().sum()

label       0
pixel43     0
pixel44     0
pixel92     0
pixel124    0
pixel125    0
pixel126    0
pixel127    0
pixel128    0
pixel129    0
pixel130    0
pixel131    0
pixel132    0
pixel133    0
pixel134    0
pixel135    0
pixel136    0
pixel137    0
pixel138    0
pixel146    0
pixel147    0
pixel148    0
pixel149    0
pixel150    0
pixel151    0
pixel152    0
pixel153    0
pixel154    0
pixel155    0
pixel156    0
pixel157    0
pixel158    0
pixel159    0
pixel160    0
pixel327    0
pixel328    0
pixel329    0
pixel351    0
pixel410    0
pixel411    0
pixel412    0
pixel413    0
pixel414    0
pixel415    0
pixel416    0
pixel417    0
dtype: int64

In [7]:
# Separate features (X) and labels (y)
x = df.drop('label', axis=1)
y = df['label']

In [29]:
## Compute the number of examples of each digit
digits, counts = np.unique(y, return_counts=True)
print("Data set distribution:")
print(dict(zip(digits, counts)))

Data set distribution:
{0: 4132, 1: 4684, 2: 4177, 3: 4351, 4: 4072, 5: 3795, 6: 4137, 7: 4401, 8: 4063, 9: 4188}


In [11]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# K-NN

In [15]:
# Function to build and evaluate KNN model
def build_knn_model(k):
    # Create the KNN classifier
    knn = KNeighborsClassifier(n_neighbors=k)

    # Train the model
    knn.fit(X_train, y_train)

    # Predict the labels for the test set
    y_pred = knn.predict(X_test)

    # Calculate the accuracy of the model
    accuracy = accuracy_score(y_test, y_pred)

    return accuracy

# Test the KNN model with different values of k
k_values = [3, 5]  # You can change these values or add more

for k in k_values:
    accuracy = build_knn_model(k)
    print(f'Accuracy with k={k}: {accuracy}')


Accuracy with k=3: 0.6270238095238095
Accuracy with k=5: 0.6503571428571429


# Random Forests

In [16]:
# Function to build and evaluate Random Forest model
def build_random_forest_model(num_trees):
    # Create the Random Forest classifier
    rf = RandomForestClassifier(n_estimators=num_trees, random_state=43)

    # Train the model
    rf.fit(X_train, y_train)

    # Predict the labels for the test set
    y_pred = rf.predict(X_test)

    # Calculate the accuracy of the model
    accuracy_rf = accuracy_score(y_test, y_pred)

    return accuracy_rf

# Test the Random Forest model with different values of the number of trees
num_tree = [100, 200]  # You can change these values or add more

for num_trees in num_tree:
    accuracy = build_random_forest_model(num_trees)
    print(f'Accuracy with {num_trees} trees: {accuracy}')


Accuracy with 100 trees: 0.6976190476190476
Accuracy with 200 trees: 0.7013095238095238


# Neural Network

In [17]:
#Activation Function 1 - RELU
def relu(x, derivative=False):
    if derivative:
        return 1. * (x > 0)
    return np.maximum(0, x)

In [18]:
#Activation Function 2 - Sigmoid
def sigmoid(x, derivative=False):
    if derivative:
        return sigmoid(x) * (1 - sigmoid(x))
    return 1 / (1 + np.exp(-x))


In [27]:
activation_functions = [relu, sigmoid]  # Test two activation functions: ReLU and sigmoid

for activation in activation_functions:
    np.random.seed(42)  # Set a random seed for reproducibility
    num_features = X_train.shape[1]
    num_classes = 10  # Assuming 10 classes (numbers 0-9)
    # Initialize weights and biases
    W1 = np.random.randn(num_features, 64) * 0.01
    b1 = np.zeros((1, 64))
    W2 = np.random.randn(64, 32) * 0.01
    b2 = np.zeros((1, 32))
    W3 = np.random.randn(32, num_classes) * 0.01
    b3 = np.zeros((1, num_classes))

    learning_rate = 0.01
    num_epochs = 10
    batch_size = 32

    for epoch in range(num_epochs):
        for i in range(0, len(X_train), batch_size):
            # Mini-batch training
            X_batch = X_train[i:i+batch_size]
            y_batch = y_train[i:i+batch_size]

            # Forward propagation
            Z1 = np.dot(X_batch, W1) + b1
            A1 = activation(Z1)
            Z2 = np.dot(A1, W2) + b2
            A2 = activation(Z2)
            Z3 = np.dot(A2, W3) + b3
            A3 = np.exp(Z3) / np.sum(np.exp(Z3), axis=1, keepdims=True)

            # Compute loss
            num_examples = X_batch.shape[0]
            loss = -np.sum(np.log(A3[range(num_examples), y_batch])) / num_examples

            # Backward propagation
            dZ3 = A3
            dZ3[range(num_examples), y_batch] -= 1
            dZ3 /= num_examples

            dW3 = np.dot(A2.T, dZ3)
            db3 = np.sum(dZ3, axis=0, keepdims=True)

            dA2 = np.dot(dZ3, W3.T)
            dZ2 = dA2 * activation(Z2, derivative=True)
            dW2 = np.dot(A1.T, dZ2)
            db2 = np.sum(dZ2, axis=0, keepdims=True)

            dA1 = np.dot(dZ2, W2.T)
            dZ1 = dA1 * activation(Z1, derivative=True)
            dW1 = np.dot(X_batch.T, dZ1)
            db1 = np.sum(dZ1, axis=0, keepdims=True)

            # Update parameters
            W3 -= learning_rate * dW3
            b3 -= learning_rate * db3
            W2 -= learning_rate * dW2
            b2 -= learning_rate * db2
            W1 -= learning_rate * dW1
            b1 -= learning_rate * db1
    # Forward pass on test set
    Z1 = np.dot(X_test, W1) + b1
    A1 = activation(Z1)
    Z2 = np.dot(A1, W2) + b2
    A2 = activation(Z2)
    Z3 = np.dot(A2, W3) + b3
    A3 = np.exp(Z3) / np.sum(np.exp(Z3), axis=1, keepdims=True)

    # Predict labels for test set
    y_pred = np.argmax(A3, axis=1)

    # Calculate accuracy
    accuracy = np.mean(y_pred == y_test)

    print(f"Accuracy with activation='{activation.__name__}': {accuracy}")


Accuracy with activation='relu': 0.6721428571428572
Accuracy with activation='sigmoid': 0.37154761904761907


In [28]:
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
knn_predictions = knn.predict(X_test)

rf = RandomForestClassifier(n_estimators=100)
rf.fit(X_train, y_train)
rf_predictions = rf.predict(X_test)

print("KNN Classification Report:")
print(classification_report(y_test, knn_predictions))

print("Random Forest Classification Report:")
print(classification_report(y_test, rf_predictions))

# Neural Network
activation_functions = [relu, sigmoid]  # Test two activation functions: ReLU and sigmoid

for activation in activation_functions:
    # Training and evaluation code here...

    # Forward pass on test set
    Z1 = np.dot(X_test, W1) + b1
    A1 = activation(Z1)
    Z2 = np.dot(A1, W2) + b2
    A2 = activation(Z2)
    Z3 = np.dot(A2, W3) + b3
    A3 = np.exp(Z3) / np.sum(np.exp(Z3), axis=1, keepdims=True)

    # Predict labels for test set
    nn_predictions = np.argmax(A3, axis=1)
    nn_report = classification_report(y_test, nn_predictions)

    print(f"Neural Network Classification Report with activation='{activation.__name__}':")
    print(nn_report)

KNN Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.88      0.86       861
           1       0.78      0.96      0.86       971
           2       0.64      0.64      0.64       817
           3       0.55      0.54      0.54       834
           4       0.63      0.55      0.59       802
           5       0.62      0.56      0.59       744
           6       0.85      0.88      0.86       821
           7       0.47      0.55      0.51       914
           8       0.59      0.47      0.52       789
           9       0.47      0.41      0.44       847

    accuracy                           0.65      8400
   macro avg       0.64      0.64      0.64      8400
weighted avg       0.64      0.65      0.64      8400

Random Forest Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.90      0.88       861
           1       0.85      0.95      0.90       971
           2  

  A3 = np.exp(Z3) / np.sum(np.exp(Z3), axis=1, keepdims=True)
  A3 = np.exp(Z3) / np.sum(np.exp(Z3), axis=1, keepdims=True)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
