In [1]:
import os 
# Reduce TensorFlow's C++ log noise. The variable TensorFlow reads is
# TF_CPP_MIN_LOG_LEVEL (the previous key had a typo, so it had no effect).
# Level '2' hides INFO and WARNING messages; errors are still shown.
# This must be set before `import tensorflow` for it to take effect.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

In [2]:
import tensorflow as tf
from tensorflow import keras #Keras offers a user-friendly, high-level API for building and training neural networks. It allows you to quickly prototype your models without worrying too much about low-level implementation details.
from tensorflow.keras import layers # The layers module in Keras provides a wide range of pre-implemented layers that you can easily stack together to build complex neural network architectures.
from tensorflow.keras.datasets import mnist

In [3]:
# Load the MNIST digits, already split into training and test sets
# (images in x_*, integer labels in y_*).
(x_train, y_train), (x_test, y_test) = mnist.load_data()

# Show the array shapes, one per line: training images, training labels, test images.
print(x_train.shape, y_train.shape, x_test.shape, sep="\n")

(60000, 28, 28)
(60000,)
(10000, 28, 28)


In [4]:
# Prepare the images for the Dense layers:
#   * reshape(-1, 28 * 28) flattens every 28x28 image into a 784-value row;
#     the -1 tells NumPy to infer the number of rows (one per image).
#   * astype("float32") converts the pixel values to floats.
#   * dividing by 255.0 rescales MNIST's 0-255 pixel intensities into [0, 1].
x_train, x_test = (
    images.reshape(-1, 28 * 28).astype("float32") / 255.0
    for images in (x_train, x_test)
)

# Alternative: convert the NumPy array to a tensor explicitly
# (not required here; Keras accepts NumPy arrays directly).
# x_train = tf.convert_to_tensor(x_train)

In [5]:
# Fully connected classifier built with the Sequential API.
# Dense layers connect every neuron to every neuron of the previous layer.
# Input: a flattened 28x28 image (784 values).
# Hidden: two ReLU Dense layers with 512 and 256 units.
# Output: 10 raw scores (logits), one per digit class; no activation is applied.
model = keras.Sequential(
    [
        keras.Input(shape=(784,)),
        *(layers.Dense(width, activation="relu") for width in (512, 256)),
        layers.Dense(10),
    ]
)

The ReLU (Rectified Linear Unit) activation function is commonly used in hidden layers of neural networks due to its simplicity and effectiveness.
ReLU replaces negative values with zero and leaves positive values unchanged, introducing non-linearity to the network.
Alternatives to ReLU include sigmoid, tanh, and Leaky ReLU.

The number of neurons in each layer (512 and 256 in this case) is a hyperparameter that you can adjust based on the complexity of the problem and the size of your dataset.
Adding more neurons can increase the capacity of the model to learn complex patterns but may also increase the risk of overfitting, especially if the dataset is small.
Conversely, using fewer neurons can lead to a simpler model with lower capacity but may underfit the data if it's too complex.

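Softmax is an activation function commonly used in the output layer of a neural network for multi-class classification problems. It converts the raw output scores (logits) of the network into probabilities that sum up to 1, representing the likelihood of each class. Note that the model above does not apply softmax in its last layer: `layers.Dense(10)` outputs raw logits, and the loss is created with `from_logits=True` so that the softmax normalization is handled inside the loss function instead.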

In [6]:
# Training configuration.
#
# loss: sparse categorical cross-entropy, for multi-class problems where each
#   input belongs to exactly one class and the labels are plain integers
#   (not one-hot encoded). from_logits=True tells the loss that the model
#   outputs raw, un-normalized scores, so the loss applies the softmax
#   normalization internally.
# optimizer: Adam, an adaptive algorithm combining ideas from AdaGrad and
#   RMSProp, used during training to minimize the loss.
# metrics: classification accuracy is reported alongside the loss.
model.compile(
    loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    optimizer=keras.optimizers.Adam(learning_rate=0.001),
    metrics=["accuracy"],
)

ALTERNATIVES TO ADAM (HYPERPARAMETER TUNING)

# optimizer=keras.optimizers.SGD(learning_rate=0.001,momentum=0.9)
SGD with momentum introduces inertia into the update process, allowing the optimizer to continue moving in the same direction even when the gradient changes direction.

# optimizer = keras.optimizers.RMSprop(learning_rate=0.001, rho=0.9)
RMSprop uses an exponentially decaying average of past squared gradients to adaptively adjust the learning rate for each parameter. Rho controls how quickly this average decays over time.

# optimizer = keras.optimizers.Adagrad(learning_rate=0.01)

# optimizer = keras.optimizers.Adadelta(learning_rate=1.0, rho=0.95)

# optimizer = keras.optimizers.Nadam(learning_rate=0.002, beta_1=0.9, beta_2=0.999)
Beta_1 and beta_2 are hyperparameters used in the Adam and Nadam optimizers. They control the exponential decay rates for the first and second moments of the gradients, respectively.

In [7]:
# summary() prints the layer table itself and returns None, so it is called
# directly; wrapping it in print() only adds a stray "None" line to the output.
model.summary()

None


In [8]:
# Train on the training set: 32 samples per gradient update, 7 passes over
# the data, one summary line per epoch (verbose=2).
model.fit(
    x_train,
    y_train,
    batch_size=32,
    epochs=7,
    verbose=2,
)

# Measure performance on the held-out test set. As the last expression of the
# cell, the returned [loss, accuracy] list is displayed as the cell output.
model.evaluate(
    x_test,
    y_test,
    batch_size=32,
    verbose=2,
)

Epoch 1/7
1875/1875 - 5s - 2ms/step - accuracy: 0.9428 - loss: 0.1849
Epoch 2/7
1875/1875 - 4s - 2ms/step - accuracy: 0.9762 - loss: 0.0772
Epoch 3/7
1875/1875 - 4s - 2ms/step - accuracy: 0.9827 - loss: 0.0537
Epoch 4/7
1875/1875 - 4s - 2ms/step - accuracy: 0.9862 - loss: 0.0426
Epoch 5/7
1875/1875 - 4s - 2ms/step - accuracy: 0.9886 - loss: 0.0340
Epoch 6/7
1875/1875 - 4s - 2ms/step - accuracy: 0.9915 - loss: 0.0262
Epoch 7/7
1875/1875 - 4s - 2ms/step - accuracy: 0.9923 - loss: 0.0242
313/313 - 0s - 833us/step - accuracy: 0.9814 - loss: 0.0773


[0.0773410052061081, 0.9814000129699707]

batch_size=32: This parameter specifies the number of samples per gradient update. In each iteration (or batch), the optimizer will process 32 samples and update the model's parameters accordingly.

epochs=7: This parameter specifies the number of times the entire dataset will be passed forward and backward through the model during training. In this case, the model will be trained for 7 epochs.

verbose=2: This parameter controls the verbosity of the training output. Setting verbose=2 means that training progress will be displayed for each epoch, including the loss and any specified metrics.

In [9]:
# Specifying the input shape up front (via keras.Input) lets the model be
# summarized right after construction, before any data is passed through it.
model=keras.Sequential(
    [
        keras.Input(shape=(28*28,)),
        layers.Dense(512, activation='relu'),
        layers.Dense(256, activation='relu'),
        layers.Dense(10),
    ]
)
# summary() prints the table itself and returns None; wrapping it in print()
# only added a stray "None" line to the output.
model.summary()

None


Also we can write it like this

"model = keras.Sequential()

model.add(keras.Input(shape=(784,)))

model.add(layers.Dense(512, activation="relu"))

model.add(layers.Dense(256, activation="relu", name="my_layer"))

model.add(layers.Dense(10))"

# =================================================================================
import sys

# sys.exit() is used to exit from Python programs. It raises the SystemExit
# exception, which can be caught and handled if needed; if it is not handled,
# it terminates the program.
# The cell imports sys and describes sys.exit(), so that is what is called here
# rather than the interactive exit() helper.
# NOTE(review): presumably placed here to stop execution before the Functional
# API section below; confirm whether this cell should remain in the notebook.
sys.exit()


# Functional API

In [24]:
# Same architecture built with the Functional API: each layer is called on the
# tensor produced by the previous one, and the model is defined by its
# input and output tensors.
inputs = keras.Input(shape=(28 * 28,))

# Hidden layers. A layer can also be given an explicit label by passing
# name="..." to Dense (for example name="first_layer", name="second_layer").
x = layers.Dense(512, activation="relu")(inputs)
x = layers.Dense(256, activation="relu")(x)

# The output layer applies softmax here, so the model emits class probabilities
# rather than raw logits.
outputs = layers.Dense(10, activation="softmax")(x)

model = keras.Model(inputs=inputs, outputs=outputs)

In [25]:
# summary() prints the layer table itself and returns None, so it is called
# directly; wrapping it in print() only adds a stray "None" line to the output.
model.summary()

None


In [26]:
# Training configuration for the Functional model.
# from_logits=False because this model's last layer already applies softmax,
# so the loss receives probabilities instead of raw logits.
model.compile(
    loss=keras.losses.SparseCategoricalCrossentropy(
        from_logits=False,
    ),
    optimizer=keras.optimizers.Adam(
        learning_rate=0.001,
    ),
    metrics=["accuracy"],
)

In [27]:
# Train with batch_size=1: one gradient update per sample, i.e. 60000 steps
# per epoch, for 5 epochs. Every epoch still covers the full training set;
# only the number of samples per update changes.
model.fit(
    x_train,
    y_train,
    batch_size=1,
    epochs=5,
    verbose=2,
)

# Evaluate on the test set one sample at a time. As the last expression of the
# cell, the returned [loss, accuracy] list is displayed as the cell output.
model.evaluate(
    x_test,
    y_test,
    batch_size=1,
    verbose=2,
)

Epoch 1/5
60000/60000 - 688s - 11ms/step - accuracy: 0.9288 - loss: 0.2668
Epoch 2/5
60000/60000 - 128s - 2ms/step - accuracy: 0.9589 - loss: 0.1864
Epoch 3/5
60000/60000 - 122s - 2ms/step - accuracy: 0.9646 - loss: 0.1623
Epoch 4/5
60000/60000 - 134s - 2ms/step - accuracy: 0.9676 - loss: 0.1475
Epoch 5/5
60000/60000 - 133s - 2ms/step - accuracy: 0.9709 - loss: 0.1419
10000/10000 - 5s - 495us/step - accuracy: 0.9568 - loss: 0.2682


[0.2681621313095093, 0.9567999839782715]

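Batch size does not change how many samples the model trains on in each epoch: every epoch always covers all 60,000 training samples. It only controls how many samples are used for each gradient update, and therefore how many steps an epoch takes. With batch_size=32 there are 60000 / 32 = 1875 steps per epoch (the `1875/1875` in the earlier training log counts batches, not samples), while with batch_size=1 there are 60,000 steps per epoch. In the runs above, batch_size=1 was much slower (more than 120 s per epoch versus about 4 s) and ended with lower test accuracy (0.9568 versus 0.9814). If you want a balance between speed and accuracy, you can experiment with different batch sizes.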