In [1]:
!pip install keras



In [2]:
!pip install tensorflow



In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from scikeras.wrappers import KerasClassifier
from sklearn.compose import ColumnTransformer

# 1. Data Exploration and Preprocessing

In [4]:
df = pd.read_csv("Alphabets_data.csv")

In [5]:
df

Unnamed: 0,letter,xbox,ybox,width,height,onpix,xbar,ybar,x2bar,y2bar,xybar,x2ybar,xy2bar,xedge,xedgey,yedge,yedgex
0,T,2,8,3,5,1,8,13,0,6,6,10,8,0,8,0,8
1,I,5,12,3,7,2,10,5,5,4,13,3,9,2,8,4,10
2,D,4,11,6,8,6,10,6,2,6,10,3,7,3,7,3,9
3,N,7,11,6,6,3,5,9,4,6,4,4,10,6,10,2,8
4,G,2,1,3,1,1,8,6,6,6,6,5,9,1,7,5,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,D,2,2,3,3,2,7,7,7,6,6,6,4,2,8,3,7
19996,C,7,10,8,8,4,4,8,6,9,12,9,13,2,9,3,7
19997,T,6,9,6,7,5,6,11,3,7,11,9,5,2,12,2,4
19998,S,2,3,4,2,1,8,7,2,6,10,6,8,1,9,5,8


In [6]:
df.shape

(20000, 17)

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 17 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   letter  20000 non-null  object
 1   xbox    20000 non-null  int64 
 2   ybox    20000 non-null  int64 
 3   width   20000 non-null  int64 
 4   height  20000 non-null  int64 
 5   onpix   20000 non-null  int64 
 6   xbar    20000 non-null  int64 
 7   ybar    20000 non-null  int64 
 8   x2bar   20000 non-null  int64 
 9   y2bar   20000 non-null  int64 
 10  xybar   20000 non-null  int64 
 11  x2ybar  20000 non-null  int64 
 12  xy2bar  20000 non-null  int64 
 13  xedge   20000 non-null  int64 
 14  xedgey  20000 non-null  int64 
 15  yedge   20000 non-null  int64 
 16  yedgex  20000 non-null  int64 
dtypes: int64(16), object(1)
memory usage: 2.6+ MB


In [8]:
 df['yedgex'].nunique()

16

In [9]:
target_column_name = 'yedgex'

In [10]:
y = df[target_column_name].squeeze()
X = df.drop(columns=[target_column_name])

In [11]:
X

Unnamed: 0,letter,xbox,ybox,width,height,onpix,xbar,ybar,x2bar,y2bar,xybar,x2ybar,xy2bar,xedge,xedgey,yedge
0,T,2,8,3,5,1,8,13,0,6,6,10,8,0,8,0
1,I,5,12,3,7,2,10,5,5,4,13,3,9,2,8,4
2,D,4,11,6,8,6,10,6,2,6,10,3,7,3,7,3
3,N,7,11,6,6,3,5,9,4,6,4,4,10,6,10,2
4,G,2,1,3,1,1,8,6,6,6,6,5,9,1,7,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,D,2,2,3,3,2,7,7,7,6,6,6,4,2,8,3
19996,C,7,10,8,8,4,4,8,6,9,12,9,13,2,9,3
19997,T,6,9,6,7,5,6,11,3,7,11,9,5,2,12,2
19998,S,2,3,4,2,1,8,7,2,6,10,6,8,1,9,5


In [12]:
y

0         8
1        10
2         9
3         8
4        10
         ..
19995     7
19996     7
19997     4
19998     8
19999     8
Name: yedgex, Length: 20000, dtype: int64

In [13]:
print("Unique values in target column:", y.unique())
print("Number of unique classes (alphabets):", y.nunique())

Unique values in target column: [ 8 10  9  7  6 11  4  5  3 12 13 14  1  2 15  0]
Number of unique classes (alphabets): 16


In [14]:
print(X.isnull().sum()[X.isnull().sum() > 0]) # Show only columns with missing values
print("Missing values in target:", y.isnull().sum())

Series([], dtype: int64)
Missing values in target: 0


In [15]:
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

In [16]:
print("\nOriginal target classes:", label_encoder.classes_)
print("Encoded target values (first 5):", y_encoded[:5])


Original target classes: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15]
Encoded target values (first 5): [ 8 10  9  8 10]


In [17]:
numerical_cols = X.select_dtypes(include=['number']).columns
categorical_cols = X.select_dtypes(include=['object', 'category']).columns

In [18]:
print(f"\nNumerical columns in X: {list(numerical_cols)}")
print(f"Categorical columns in X: {list(categorical_cols)}")


Numerical columns in X: ['xbox', 'ybox', 'width', 'height', 'onpix', 'xbar', 'ybar', 'x2bar', 'y2bar', 'xybar', 'x2ybar', 'xy2bar', 'xedge', 'xedgey', 'yedge']
Categorical columns in X: ['letter']


In [19]:
if not categorical_cols.empty:
    print("\nApplying One-Hot Encoding to categorical feature columns.")
    # Use OneHotEncoder for categorical features in X
    # It's better to use ColumnTransformer for robust preprocessing pipelines
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), numerical_cols),
            ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
        ],
        remainder='passthrough' # Keep other columns (if any) that are not specified
    )
    X_processed = preprocessor.fit_transform(X)
else:
    print("\nNo categorical feature columns found in X. Applying StandardScaler directly.")
    scaler = StandardScaler()
    X_processed = scaler.fit_transform(X)


Applying One-Hot Encoding to categorical feature columns.


# 2. Model Implementation

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X_processed, y_encoded, test_size=0.2, random_state=42)

In [21]:
def create_model(hidden_layers=1, neurons_per_layer=64, activation='relu', learning_rate=0.001):
    model = Sequential()
    # Input layer and first hidden layer
    model.add(Dense(neurons_per_layer, input_dim=X_train.shape[1], activation=activation))
    # Additional hidden layers
    for _ in range(hidden_layers - 1):
        model.add(Dense(neurons_per_layer, activation=activation))
    # Output layer (number of units should be equal to the number of unique classes)
    model.add(Dense(y.nunique(), activation='softmax'))

    optimizer = Adam(learning_rate=learning_rate)
    model.compile(loss='sparse_categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    return model

In [84]:
import warnings
warnings.filterwarnings('ignore')

In [86]:
initial_model = create_model()
initial_model 

<Sequential name=sequential_2, built=True>

In [88]:
initial_model.fit(X_train, y_train, epochs=10, batch_size=32, verbose=1)

Epoch 1/10
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - accuracy: 0.3706 - loss: 1.9065
Epoch 2/10
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.5160 - loss: 1.2997
Epoch 3/10
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.5532 - loss: 1.1906
Epoch 4/10
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.5777 - loss: 1.1186
Epoch 5/10
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.5949 - loss: 1.0697
Epoch 6/10
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.6080 - loss: 1.0433
Epoch 7/10
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.6181 - loss: 1.0048
Epoch 8/10
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.6357 - loss: 0.9643
Epoch 9/10
[1m500/500[0m [32m━━━━━━━━

<keras.src.callbacks.history.History at 0x1f4669852b0>

In [90]:
y_pred_initial_probs = initial_model.predict(X_test)
y_pred_initial_probs


[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step


array([[4.8229954e-04, 6.2038461e-03, 3.5556657e-03, ..., 6.7603425e-05,
        2.9067305e-05, 1.5353963e-04],
       [1.7229369e-04, 9.4188011e-04, 7.9819569e-03, ..., 5.7276473e-03,
        5.6708427e-03, 1.8570320e-03],
       [5.4621026e-08, 6.8526029e-09, 4.4631663e-09, ..., 1.5102887e-06,
        2.3509708e-07, 5.0299189e-08],
       ...,
       [2.1985888e-06, 4.7988128e-06, 7.2290745e-06, ..., 4.2324793e-04,
        7.2619616e-05, 1.7030819e-05],
       [2.8210797e-04, 1.6135825e-03, 1.6955610e-03, ..., 9.3351863e-04,
        9.1992842e-04, 4.6685772e-04],
       [3.6376016e-06, 1.1889207e-06, 1.0705663e-06, ..., 3.5004597e-08,
        2.0163347e-08, 6.0789608e-07]], dtype=float32)

In [92]:
y_pred_initial = y_pred_initial_probs.argmax(axis=1)
y_pred_initial

array([7, 6, 8, ..., 8, 5, 8], dtype=int64)

In [94]:
keras_model = KerasClassifier(model=create_model, verbose=0)
keras_model 

# 3. Hyperparameter Tuning

In [96]:
param_grid = {
    'model__hidden_layers': [1, 2],
    'model__neurons_per_layer': [32, 64, 128],
    'model__activation': ['relu', 'tanh'],
    'model__learning_rate': [0.001, 0.01],
    'batch_size': [32, 64],
    'epochs': [10, 20]
}

# 4. Evaluation

In [98]:
print("\n--- Evaluation of Initial Model ---")
accuracy_initial = accuracy_score(y_test, y_pred_initial)




--- Evaluation of Initial Model ---


In [100]:
precision_initial = precision_score(y_test, y_pred_initial, average='weighted')


In [102]:
recall_initial = recall_score(y_test, y_pred_initial, average='weighted')


In [104]:
f1_initial = f1_score(y_test, y_pred_initial, average='weighted')


In [106]:
print(f"Initial Model Accuracy: {accuracy_initial:.4f}")
print(f"Initial Model Precision: {precision_initial:.4f}")
print(f"Initial Model Recall: {recall_initial:.4f}")
print(f"Initial Model F1-Score: {f1_initial:.4f}")

Initial Model Accuracy: 0.6295
Initial Model Precision: 0.6132
Initial Model Recall: 0.6295
Initial Model F1-Score: 0.6164
