In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import * 
import matplotlib.pyplot as plt

In [2]:
# Fetch the UCI letter-recognition data set straight from the archive.
# header=None: the first line of the file is treated as data, not as column names.
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/letter-recognition/letter-recognition.data'
dataset = pd.read_csv(filepath_or_buffer=url, header=None)

In [10]:


# Separate the target from the inputs: the first column holds the letter label,
# every remaining column is a feature.
label_column = dataset.columns[0]
y = dataset[label_column].values                 # all rows, label column only
X = dataset.drop(columns=label_column).values    # all rows, every column except the label



In [11]:


# Peek at the first and last raw labels, one per line.
print(y[0], y[-1], sep='\n')


T
A


In [12]:
# Turn the letter labels into integer class indices.
# The fitted encoder is kept so predictions can be mapped back to letters later.
label_encoder = LabelEncoder()
label_encoder.fit(y)
y = label_encoder.transform(y)

In [14]:
# Same two samples as before, now shown as their encoded class indices.
print(y[0], y[-1], sep='\n')

19
0


In [15]:
# Hold out 20% of the samples for testing; random_state pins the shuffle so the
# split is the same on every run.
split_parts = train_test_split(X, y, random_state=1, test_size=0.2)
X_train, X_test, y_train, y_test = split_parts

In [16]:
# Scale both feature matrices by the same constant divisor (15.0).
# NOTE(review): presumably 15 is the maximum raw feature value, mapping inputs to [0, 1] - confirm.
X_train, X_test = (features / 15.0 for features in (X_train, X_test))

In [17]:


# Sequential model: layers are stacked one after another, and the output of each
# layer is fed as the input to the next.
model = Sequential()

# Declare the input explicitly: every sample is a vector of 16 features.
# Keras recommends an Input object as the first entry of a Sequential model;
# passing `input_shape=` to the first Dense layer instead triggers a warning.
model.add(tf.keras.Input(shape=(16,)))

# First hidden layer: 128 ReLU units, followed by dropout (rate 0.5).
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))

# Second hidden layer: 64 ReLU units, followed by dropout (rate 0.5).
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))

# Output layer: one unit per class (26); softmax turns the outputs into a
# probability distribution over the classes.
model.add(Dense(26, activation='softmax'))



  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [18]:
# Configure training.
# sparse_categorical_crossentropy is the multi-class cross-entropy loss for targets
# given as integer class indices (as opposed to one-hot vectors).
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

In [19]:
# batch_size: number of samples processed before the model weights are updated.
# verbose: how much training progress is printed (0 prints nothing, 1 shows a
# progress bar per epoch).
# NOTE(review): the test split is also passed as validation data, so the validation
# metrics and the later test evaluation are computed on the same samples.
history = model.fit(
    X_train,
    y_train,
    epochs=50,
    batch_size=12,
    validation_data=(X_test, y_test),
    verbose=1,
)

Epoch 1/50
[1m1334/1334[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 762us/step - accuracy: 0.1256 - loss: 3.0022 - val_accuracy: 0.5523 - val_loss: 1.7736
Epoch 2/50
[1m1334/1334[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 663us/step - accuracy: 0.3861 - loss: 1.9588 - val_accuracy: 0.6233 - val_loss: 1.3811
Epoch 3/50
[1m1334/1334[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 653us/step - accuracy: 0.4808 - loss: 1.6373 - val_accuracy: 0.6780 - val_loss: 1.1961
Epoch 4/50
[1m1334/1334[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 644us/step - accuracy: 0.5278 - loss: 1.4996 - val_accuracy: 0.6942 - val_loss: 1.0866
Epoch 5/50
[1m1334/1334[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 661us/step - accuracy: 0.5577 - loss: 1.3938 - val_accuracy: 0.7048 - val_loss: 1.0414
Epoch 6/50
[1m1334/1334[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 687us/step - accuracy: 0.5684 - loss: 1.3499 - val_accuracy: 0.7260 - val_loss: 0.9698
Epoc

In [20]:
# Score the trained model on the held-out split; evaluate() returns the loss
# followed by the compiled metrics (accuracy).
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test accuracy: {accuracy}", f"Test loss: {loss}", sep="\n")

[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 476us/step - accuracy: 0.8412 - loss: 0.5596
Test accuracy: 0.8414999842643738
Test loss: 0.552692174911499


In [21]:


# Persist the trained model to 'ocr_model.h5' (path is relative to the working directory).
model.save(filepath='ocr_model.h5')





In [22]:
# Read the saved model back from disk; this rebinds `model` to the loaded copy.
from tensorflow.keras.models import load_model

model = load_model(filepath='ocr_model.h5')



In [23]:
# Take the first 1000 records of the test split as the batch to classify.
sample_records = X_test[0:1000]

In [24]:


# Run the model on the sampled records; the result has one row of class scores per record.
predictions = model.predict(x=sample_records)



[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step 


In [25]:
# Pick the highest-scoring class for each record, then map class indices back to letters.
predicted_labels = np.argmax(predictions, axis=1)
predicted_letters = label_encoder.inverse_transform(predicted_labels)

# Decode the ground truth only for the records that were actually predicted, so that
# predicted_letters and actual_letters have the same length and line up one-to-one.
# (Previously all of y_test was decoded, leaving the two arrays with different lengths.)
actual_letters = label_encoder.inverse_transform(y_test[:len(predicted_labels)])

In [26]:


# Fraction of the sampled test records that were classified correctly.
# The predictions were made on the leading rows of X_test, so they must be compared
# with the matching rows of y_test. Comparing with y (the labels of the full, unsplit
# dataset) pairs each prediction with an unrelated sample and gives a meaningless score.
accuracy = np.mean(predicted_labels == y_test[:len(predicted_labels)])



In [27]:


# Two-column listing: predicted letter next to the true letter, one row per prediction.
print("Predicted Labels\tActual Labels")
for i, predicted in enumerate(predicted_letters):
    print(f"{predicted}\t\t\t{actual_letters[i]}")



Predicted Labels	Actual Labels
D			D
D			D
V			V
B			B
H			H
N			N
E			E
Q			Q
X			R
N			N
Q			Q
O			O
N			N
D			D
J			I
M			M
U			U
S			S
O			O
A			A
X			X
A			A
K			K
S			S
O			O
V			V
Y			Y
J			J
D			D
V			V
H			D
Y			V
R			K
F			F
N			N
I			I
G			G
T			T
H			H
K			K
J			J
O			O
M			M
T			T
B			B
Q			Q
Z			Z
C			C
D			D
X			X
K			C
G			G
Q			Q
F			F
N			N
Y			Y
Z			Z
K			K
S			C
K			T
M			M
W			V
Q			G
O			M
D			D
T			T
H			H
P			P
N			N
C			C
O			O
X			X
D			X
Q			Q
V			V
F			P
V			V
G			G
W			W
H			H
P			P
O			O
I			I
G			G
X			X
Y			X
Y			P
W			W
O			Q
D			D
B			N
G			Q
T			T
T			T
V			V
Y			Y
T			T
V			V
J			J
U			U
D			D
L			L
M			M
Y			Y
E			E
M			M
A			A
Y			Y
K			K
Y			Y
U			U
U			U
U			U
K			C
Q			Q
I			I
A			A
E			E
D			D
T			T
B			P
Q			Q
Z			Z
G			G
X			X
G			G
F			F
M			M
S			S
E			E
E			E
K			K
Z			Z
X			X
O			O
E			E
U			U
I			I
R			R
O			O
M			M
J			J
F			F
K			K
M			M
N			N
O			T
D			D
H			H
R			R
L			L
Z			Z
F			F
V			V
W			W
E			E
T			T
W			W
Y			Y
Q			Q
L			L
F		