In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report, accuracy_score
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
import matplotlib.pyplot as plt

# Load dataset
df = pd.read_csv("Alphabets_data.csv")

# Explore dataset
print("Dataset shape:", df.shape)
print("Columns:", df.columns)
print("Missing values:\n", df.isnull().sum())
print("Class distribution:\n", df.iloc[:, -1].value_counts())  # Assuming last column is the target


Dataset shape: (20000, 17)
Columns: Index(['letter', 'xbox', 'ybox', 'width', 'height', 'onpix', 'xbar', 'ybar',
       'x2bar', 'y2bar', 'xybar', 'x2ybar', 'xy2bar', 'xedge', 'xedgey',
       'yedge', 'yedgex'],
      dtype='object')
Missing values:
 letter    0
xbox      0
ybox      0
width     0
height    0
onpix     0
xbar      0
ybar      0
x2bar     0
y2bar     0
xybar     0
x2ybar    0
xy2bar    0
xedge     0
xedgey    0
yedge     0
yedgex    0
dtype: int64
Class distribution:
 yedgex
8     8047
7     3472
9     2358
6     1827
10    1578
5      992
11     868
4      478
12     137
3      130
13      49
2       30
1       17
14      13
15       2
0        2
Name: count, dtype: int64


In [2]:
df

Unnamed: 0,letter,xbox,ybox,width,height,onpix,xbar,ybar,x2bar,y2bar,xybar,x2ybar,xy2bar,xedge,xedgey,yedge,yedgex
0,T,2,8,3,5,1,8,13,0,6,6,10,8,0,8,0,8
1,I,5,12,3,7,2,10,5,5,4,13,3,9,2,8,4,10
2,D,4,11,6,8,6,10,6,2,6,10,3,7,3,7,3,9
3,N,7,11,6,6,3,5,9,4,6,4,4,10,6,10,2,8
4,G,2,1,3,1,1,8,6,6,6,6,5,9,1,7,5,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,D,2,2,3,3,2,7,7,7,6,6,6,4,2,8,3,7
19996,C,7,10,8,8,4,4,8,6,9,12,9,13,2,9,3,7
19997,T,6,9,6,7,5,6,11,3,7,11,9,5,2,12,2,4
19998,S,2,3,4,2,1,8,7,2,6,10,6,8,1,9,5,8


In [4]:
# Separate features and target
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

# Handle missing values
df.dropna(inplace=True)

# Convert categorical features to numeric
X_encoded = pd.get_dummies(X)

# Normalize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_encoded)

# Encode target labels
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# Split into train and test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded)


In [5]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.utils import to_categorical

# Optional: if you want categorical output for softmax
num_classes = len(np.unique(y_encoded))

# Build basic ANN model
model = Sequential()
model.add(Dense(64, activation='relu', input_shape=(X_train.shape[1],)))  # Input layer
model.add(Dense(32, activation='relu'))                                   # Hidden layer
model.add(Dense(num_classes, activation='softmax'))                       # Output layer

# Compile model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model
history = model.fit(X_train, y_train, epochs=20, batch_size=32, validation_split=0.2, verbose=1)


Epoch 1/20


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.3671 - loss: 1.9462 - val_accuracy: 0.5147 - val_loss: 1.3022
Epoch 2/20
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.5440 - loss: 1.2320 - val_accuracy: 0.5469 - val_loss: 1.1669
Epoch 3/20
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.5813 - loss: 1.1092 - val_accuracy: 0.5763 - val_loss: 1.0955
Epoch 4/20
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.5976 - loss: 1.0342 - val_accuracy: 0.6003 - val_loss: 1.0355
Epoch 5/20
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.6255 - loss: 0.9679 - val_accuracy: 0.5881 - val_loss: 1.0179
Epoch 6/20
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.6280 - loss: 0.9477 - val_accuracy: 0.6153 - val_loss: 0.9834
Epoch 7/20
[1m400/400[0m [32m━━━━━━━

In [7]:
# DROP missing values first
df.dropna(inplace=True)

# Split features and labels
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

# Label encode after dropping rows
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# Convert categorical features to numeric
X_encoded = pd.get_dummies(X)

# Normalize features
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_encoded)

# Train-test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)


In [9]:
print(classification_report(y_test, y_pred))



              precision    recall  f1-score   support

           1       0.40      0.67      0.50         3
           2       0.00      0.00      0.00         6
           3       0.43      0.12      0.18        26
           4       0.54      0.64      0.59        96
           5       0.62      0.41      0.49       198
           6       0.56      0.62      0.59       365
           7       0.57      0.60      0.59       694
           8       0.81      0.80      0.80      1610
           9       0.58      0.55      0.57       472
          10       0.51      0.60      0.55       316
          11       0.61      0.63      0.62       174
          12       0.20      0.15      0.17        27
          13       0.67      0.20      0.31        10
          14       0.00      0.00      0.00         3

    accuracy                           0.66      4000
   macro avg       0.46      0.43      0.43      4000
weighted avg       0.66      0.66      0.66      4000



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [10]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam

# Function for tuning model
def build_model(neurons=64, activation='relu', learning_rate=0.001):
    model = Sequential()
    model.add(Dense(neurons, activation=activation, input_shape=(X_train.shape[1],)))
    model.add(Dense(neurons // 2, activation=activation))
    model.add(Dense(len(np.unique(y_train)), activation='softmax'))
    
    optimizer = Adam(learning_rate=learning_rate)
    model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model


In [13]:
pip install scikeras


Note: you may need to restart the kernel to use updated packages.


In [14]:
from scikeras.wrappers import KerasClassifier
from sklearn.model_selection import RandomizedSearchCV

# Wrap model for sklearn compatibility
keras_clf = KerasClassifier(model=build_model, verbose=0, epochs=20, batch_size=32)

# Define tuning parameters
param_dist = {
    'model__neurons': [32, 64, 128],
    'model__activation': ['relu', 'tanh'],
    'model__learning_rate': [0.001, 0.01, 0.1]
}

# Randomized search
random_search = RandomizedSearchCV(estimator=keras_clf,
                                   param_distributions=param_dist,
                                   n_iter=5, cv=3, verbose=1, n_jobs=-1)

# Fit on training data
random_search.fit(X_train, y_train)

# Best parameters
print("Best Hyperparameters Found:", random_search.best_params_)


Fitting 3 folds for each of 5 candidates, totalling 15 fits


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Best Hyperparameters Found: {'model__neurons': 128, 'model__learning_rate': 0.001, 'model__activation': 'relu'}


In [15]:
# Best tuned model
best_model = random_search.best_estimator_

# Predict
y_pred_tuned = best_model.predict(X_test)
y_pred_tuned = y_pred_tuned.astype(int)  # just to be safe

# Evaluate
from sklearn.metrics import classification_report, accuracy_score

print("\nEvaluation for Tuned ANN Model:")
print(classification_report(y_test, y_pred_tuned))
print("Tuned Accuracy:", accuracy_score(y_test, y_pred_tuned))



Evaluation for Tuned ANN Model:
              precision    recall  f1-score   support

           1       0.50      0.33      0.40         3
           2       0.33      0.17      0.22         6
           3       0.42      0.42      0.42        26
           4       0.68      0.38      0.48        96
           5       0.50      0.62      0.55       198
           6       0.59      0.51      0.55       365
           7       0.64      0.62      0.63       694
           8       0.81      0.86      0.83      1610
           9       0.67      0.51      0.58       472
          10       0.55      0.71      0.62       316
          11       0.65      0.54      0.59       174
          12       0.29      0.33      0.31        27
          13       0.50      0.40      0.44        10
          14       0.50      0.33      0.40         3

    accuracy                           0.69      4000
   macro avg       0.54      0.48      0.50      4000
weighted avg       0.69      0.69      0.68    

In [16]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


In [17]:
# Basic Model Metrics
acc_basic = accuracy_score(y_test, y_pred)
prec_basic = precision_score(y_test, y_pred, average='weighted', zero_division=0)
rec_basic = recall_score(y_test, y_pred, average='weighted', zero_division=0)
f1_basic = f1_score(y_test, y_pred, average='weighted', zero_division=0)

# Tuned Model Metrics
acc_tuned = accuracy_score(y_test, y_pred_tuned)
prec_tuned = precision_score(y_test, y_pred_tuned, average='weighted', zero_division=0)
rec_tuned = recall_score(y_test, y_pred_tuned, average='weighted', zero_division=0)
f1_tuned = f1_score(y_test, y_pred_tuned, average='weighted', zero_division=0)


In [20]:
print(" Model Comparison:")
print(f"{'Metric':<15}{'Basic ANN':<15}{'Tuned ANN'}")
print("-" * 40)
print(f"{'Accuracy':<15}{acc_basic:<15.4f}{acc_tuned:.4f}")
print(f"{'Precision':<15}{prec_basic:<15.4f}{prec_tuned:.4f}")
print(f"{'Recall':<15}{rec_basic:<15.4f}{rec_tuned:.4f}")
print(f"{'F1-Score':<15}{f1_basic:<15.4f}{f1_tuned:.4f}")


 Model Comparison:
Metric         Basic ANN      Tuned ANN
----------------------------------------
Accuracy       0.6605         0.6883
Precision      0.6619         0.6878
Recall         0.6605         0.6883
F1-Score       0.6585         0.6837


ANN effectively classified the alphabet data with decent accuracy.

Data preprocessing (encoding + scaling) was essential.

Basic ANN achieved reasonable performance.

After tuning hyperparameters like neurons, activation functions, and learning rate, accuracy improved significantly.

RandomizedSearchCV was efficient for trying multiple configurations.

