In [12]:
#Load libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import time
import sklearn 
%matplotlib inline

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, classification_report, f1_score, roc_auc_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier

In [13]:
# Load dataset from csv file

data = pd.read_csv("letters.csv")

In [14]:
# Understand dataset structure

print(data.shape)
print(data.dtypes)

(42000, 46)
label       int64
pixel43     int64
pixel44     int64
pixel92     int64
pixel124    int64
pixel125    int64
pixel126    int64
pixel127    int64
pixel128    int64
pixel129    int64
pixel130    int64
pixel131    int64
pixel132    int64
pixel133    int64
pixel134    int64
pixel135    int64
pixel136    int64
pixel137    int64
pixel138    int64
pixel146    int64
pixel147    int64
pixel148    int64
pixel149    int64
pixel150    int64
pixel151    int64
pixel152    int64
pixel153    int64
pixel154    int64
pixel155    int64
pixel156    int64
pixel157    int64
pixel158    int64
pixel159    int64
pixel160    int64
pixel327    int64
pixel328    int64
pixel329    int64
pixel351    int64
pixel410    int64
pixel411    int64
pixel412    int64
pixel413    int64
pixel414    int64
pixel415    int64
pixel416    int64
pixel417    int64
dtype: object


In [15]:
# Check for missing values in the dataset
data.isna().sum()

label       0
pixel43     0
pixel44     0
pixel92     0
pixel124    0
pixel125    0
pixel126    0
pixel127    0
pixel128    0
pixel129    0
pixel130    0
pixel131    0
pixel132    0
pixel133    0
pixel134    0
pixel135    0
pixel136    0
pixel137    0
pixel138    0
pixel146    0
pixel147    0
pixel148    0
pixel149    0
pixel150    0
pixel151    0
pixel152    0
pixel153    0
pixel154    0
pixel155    0
pixel156    0
pixel157    0
pixel158    0
pixel159    0
pixel160    0
pixel327    0
pixel328    0
pixel329    0
pixel351    0
pixel410    0
pixel411    0
pixel412    0
pixel413    0
pixel414    0
pixel415    0
pixel416    0
pixel417    0
dtype: int64

In [16]:
# Creating test and train datasets
# Taking 'Total Property Value' as the target variable and 'X' are the features
X = data.drop('label', axis=1)
y = data['label']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [17]:
# Standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [18]:
# Create KNN classifier
knn = KNeighborsClassifier()

# Performing a grid search to choose the best n-neighbors value
param_grid = {'n_neighbors': [1, 3, 5, 7, 10, 15, 20]}

# Cross-validation
grid_search_cv = GridSearchCV(knn, param_grid, cv=5)

# Fit to training data
grid_search_cv.fit(X_train_scaled, y_train)

# Print the best parameters and corresponding accuracy
print("Best Parameters (Grid Search):", grid_search_cv.best_params_)
print("Best Accuracy (Grid Search):", grid_search_cv.best_score_)

Best Parameters (Grid Search): {'n_neighbors': 10}
Best Accuracy (Grid Search): 0.6467857142857143


In [30]:
# Build a KNN model
knn_model = KNeighborsClassifier(n_neighbors=10)
knn_model.fit(X_train_scaled, y_train)

# Make predictions on the test set
y_pred_knn = knn_model.predict(X_test_scaled)

In [31]:
# Evaluate KNN model
accuracy_knn = accuracy_score(y_test, y_pred_knn)
print(f'KNN Accuracy: {accuracy_knn}')
print('KNN Classification Report:')
print(classification_report(y_test, y_pred_knn))

KNN Accuracy: 0.6572619047619047
KNN Classification Report:
              precision    recall  f1-score   support

           0       0.81      0.89      0.85       861
           1       0.77      0.96      0.85       971
           2       0.66      0.63      0.64       817
           3       0.59      0.54      0.57       834
           4       0.66      0.51      0.58       802
           5       0.66      0.54      0.60       744
           6       0.83      0.85      0.84       821
           7       0.48      0.62      0.54       914
           8       0.62      0.50      0.55       789
           9       0.48      0.46      0.47       847

    accuracy                           0.66      8400
   macro avg       0.66      0.65      0.65      8400
weighted avg       0.66      0.66      0.65      8400



In [33]:
# Build neural network models
activation_functions = ['tanh', 'logistic', 'relu']
for activation_func in activation_functions:
    # Build neural network model
    nn_model = MLPClassifier(hidden_layer_sizes=(100,), activation=activation_func, max_iter=10000)
    nn_model.fit(X_train_scaled, y_train)

    # Make predictions on the test set
    y_pred_nn = nn_model.predict(X_test_scaled)
    
    # Evaluate neural network model
    accuracy_nn = accuracy_score(y_test, y_pred_nn)
    print(f'Neural Network with {activation_func.capitalize()} Activation Accuracy: {accuracy_nn}')
    print(f'Neural Network with {activation_func.capitalize()} Activation Classification Report:')
    print(classification_report(y_test, y_pred_nn))

    

Neural Network with Tanh Activation Accuracy: 0.695
Neural Network with Tanh Activation Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.87      0.87       861
           1       0.86      0.94      0.90       971
           2       0.70      0.65      0.67       817
           3       0.65      0.57      0.61       834
           4       0.78      0.56      0.65       802
           5       0.62      0.67      0.64       744
           6       0.86      0.88      0.87       821
           7       0.51      0.67      0.58       914
           8       0.62      0.59      0.61       789
           9       0.50      0.49      0.50       847

    accuracy                           0.69      8400
   macro avg       0.70      0.69      0.69      8400
weighted avg       0.70      0.69      0.69      8400

Neural Network with Logistic Activation Accuracy: 0.6892857142857143
Neural Network with Logistic Activation Classification Report

In [36]:
# Evaluate models
accuracy_knn = accuracy_score(y_test, y_pred_knn)
accuracy_nn = accuracy_score(y_test, y_pred_nn)

# Classification report metrics
report_knn = classification_report(y_test, y_pred_knn, output_dict=True)
report_nn = classification_report(y_test, y_pred_nn, output_dict=True)

# Create a matrix with results
results_matrix = pd.DataFrame({
    'Model': ['KNN', 'Neural Network'],
    'Accuracy': [accuracy_knn, accuracy_nn],
    'Precision (weighted)': [report_knn['weighted avg']['precision'], report_nn['weighted avg']['precision']],
    'Recall (weighted)': [report_knn['weighted avg']['recall'], report_nn['weighted avg']['recall']],
    'F1-Score (weighted)': [report_knn['weighted avg']['f1-score'], report_nn['weighted avg']['f1-score']]
})

# Display the results matrix
print(results_matrix.transpose())

                             0               1
Model                      KNN  Neural Network
Accuracy              0.657262        0.695119
Precision (weighted)  0.655607        0.698404
Recall (weighted)     0.657262        0.695119
F1-Score (weighted)   0.652008        0.692509
