**Importing required libraries**

In [1]:
from datasets import load_dataset
from huggingface_hub import login
from collections import Counter
import pandas as pd 
import traceback
from PIL import Image
import matplotlib.pyplot as plt
from PIL import UnidentifiedImageError
import numpy as np
from transformers import AutoImageProcessor, ResNetModel
from datasets import Dataset, DatasetDict
import ast
import pickle
import json


import tensorflow as tf
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import *
from tensorflow.keras.optimizers import *
from tensorflow.keras.applications import *
from tensorflow.keras.callbacks import *
from tensorflow.keras.initializers import *
from tensorflow.keras.preprocessing.image import ImageDataGenerator

  from .autonotebook import tqdm as notebook_tqdm
2024-07-14 20:10:37.295599: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-07-14 20:10:37.430795: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-07-14 20:10:37.434735: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda-12.2/lib64
2024-07-14 20:10:

**Checking GPU**

In [None]:
# Ask TensorFlow which GPU devices it can see and report how many there are.
gpu_devices = tf.config.experimental.list_physical_devices('GPU')
print("Num GPUs Available: ", len(gpu_devices))

**Loading images for Augmentation**

In [None]:
# Authenticate against the Hugging Face Hub and download the dataset.
#
# The access token is read from the HF_TOKEN environment variable so that no
# secret is stored in the notebook. When the variable is unset, `token=None`
# lets `login` fall back to its interactive prompt.
# NOTE(review): the token that was previously hard-coded in this cell has been
# exposed through the notebook and should be revoked.
import os

login(token=os.environ.get("HF_TOKEN"))
x = load_dataset('OmidAghili/Image_Classification')

**Visualizing some images**

In [None]:
# Look at one training example: print its class name and image size, then
# render the image itself as the cell output.
n = 2223
sample = x['train'][n]
image, label = sample['image'], sample['label']
class_names = x['train'].features['label'].names
print(class_names[label])
print(image.size)
image

**Checking the distribution**

In [None]:
# Bar chart of how many training samples carry each integer label id.
label_counts = Counter(x['train']['label'])
label_counts_series = pd.Series(label_counts)
label_counts_series.plot.bar(edgecolor='black')

**Removing invalid images**

In [None]:
# Indices of rows whose image cannot be accessed.
BadImages = set()

for i in range(len(x['train'])):
    try:
        # Accessing the field is what triggers the failure for a broken image.
        x['train'][i]['image']
    except Exception as err:
        # `Exception` rather than a bare `except:` so that interrupting the
        # kernel (KeyboardInterrupt) still stops this long-running scan.
        print(i, repr(err))
        BadImages.add(i)

# Keep only the readable rows.
# NOTE: this reassigns x['train'], so every index used further down refers to
# positions in the filtered split, not in the originally downloaded one.
x['train'] = x['train'].select(
    (
        i for i in range(len(x['train']))
        if i not in BadImages
    )
)


**Cleaning noisy labels**

We ran the code below on Kaggle to produce "accuracy_scores.csv", because of GPU limitations on our local machine.

![Sample Image](clean.png)

In [None]:
# Load the per-sample predictions. Column '0' is later parsed with
# ast.literal_eval and read through its 'label' key.
scores_path = "accuracy_scores.csv"
scores = pd.read_csv(scores_path)
scores

In [None]:
# Collect the indices where the dataset label and the predicted label differ.
#
# NOTE(review): rows are matched purely by position, i.e. row i of `scores`
# is compared with row i of x['train']. This presumes `scores` was computed
# on the split as it is at this point in the notebook -- confirm.
wrongIndexList = []
label_names = x['train'].features['label'].names

# Iterate over the label column only: indexing x['train'][i] would also load
# the image of every row just to read one integer.
for i, label in enumerate(x['train']['label']):
    dataset_label = label_names[label]
    # Each cell of column '0' is a Python-literal string with a 'label' key.
    predicted_label = ast.literal_eval(scores['0'][i])['label']
    if dataset_label != predicted_label:
        print("predicted wrong", i)
        wrongIndexList.append(i)


In [None]:
# Number of samples whose dataset label disagrees with the predicted label.
len(wrongIndexList)

**Let's check some of the images predicted to have wrong labels**

In [None]:
# Inspect one flagged sample: print the dataset label next to the predicted
# label, then the image size, and render the image as the cell output.
n = wrongIndexList[2889]
sample = x['train'][n]
image, label = sample['image'], sample['label']
dataset_name = x['train'].features['label'].names[label]
predicted_name = ast.literal_eval(scores['0'][n])['label']
print("dataset ", dataset_name)
print("predicted ", predicted_name)
print(image.size)
image

**Here we remove noisy data and then check the distribution**

In [None]:
# Drop every sample flagged as mislabeled.
# Membership is tested against a set: checking `i not in wrongIndexList`
# scans the whole list for every row of the split.
# NOTE: this cell reassigns x['train'] and is not idempotent -- running it a
# second time would remove rows at the same positions from the already
# filtered split.
wrong_indices = set(wrongIndexList)

x['train'] = x['train'].select(
    (
        i for i in range(len(x['train']))
        if i not in wrong_indices
    )
)


In [None]:
# Show how many samples are left after cleaning. A bare expression is only
# displayed when it is the last statement of a cell, so print it explicitly.
print(len(x['train']))
# x.push_to_hub('OmidAghili/food22Cleaned')

# Save the cleaned dataset object locally with pickle.
with open('clean_dataset.pickle', 'wb') as f:
    pickle.dump(x, f)

In [None]:
# Read the pickled dataset back and display its 'train' split.
# NOTE: unpickling can execute arbitrary code -- only load a pickle file that
# was produced by this notebook.
with open('clean_dataset.pickle', 'rb') as pickle_file:
    loaded_dataset = pickle.load(pickle_file)

loaded_dataset['train']

In [None]:
# Same per-label bar chart as before, now for the cleaned training split.
label_counts = Counter(x['train']['label'])
label_counts_series = pd.Series(label_counts)
label_counts_series.plot.bar(edgecolor='black')

**Preprocessing images**

In [None]:
# Run every image through the image processor published with
# "microsoft/resnet-50" and collect the results as small tf.data datasets,
# one per chunk of `batch_size` rows.
image_processor = AutoImageProcessor.from_pretrained("microsoft/resnet-50")

n = len(x['train'])
batch_size = 32
data = []

# Logs the device of each TensorFlow op; very verbose, useful only to confirm
# where the work is placed.
tf.debugging.set_log_device_placement(True)
with tf.device("/GPU:0"):
    for i in range(0, n, batch_size):
        print(i)
        # Slice the split once and read both columns from the result; slicing
        # separately for images and labels loads the same rows twice.
        rows = x['train'][i:i+batch_size]
        batch = rows['image']
        batch_labels = rows['label']
        processed = image_processor(batch, return_tensors='tf')
        # Move axis 1 to the end -- presumably channels-first to channels-last,
        # which is the layout the Keras model below is built for (224, 224, 3).
        reshaped_images = tf.transpose(processed['pixel_values'], perm=[0, 2, 3, 1])
        # from_tensor_slices yields one (image, label) pair per row.
        batch_dataset = tf.data.Dataset.from_tensor_slices((reshaped_images, batch_labels))
        data.append(batch_dataset)

data

**Splitting into train and validation + one-hot encoding**

In [None]:
# Chain the per-chunk datasets into one. Starting from data[0] also works when
# preprocessing produced a single chunk.
full_dataset = data[0]
for dataset in data[1:]:
    full_dataset = full_dataset.concatenate(dataset)

# 80 / 20 train / validation split.
dataset_size = full_dataset.cardinality().numpy()
train_size = int(0.8 * dataset_size)
val_size = dataset_size - train_size
# reshuffle_each_iteration=False freezes the shuffled order, so take() and
# skip() always see the same sequence and the two splits never overlap.
full_dataset = full_dataset.shuffle(buffer_size=dataset_size, reshuffle_each_iteration=False)
train_dataset = full_dataset.take(train_size)
val_dataset = full_dataset.skip(train_size)

def one_hot_encode(image, label):
    """Return the image unchanged and its label as a one-hot vector of depth 22."""
    label = tf.one_hot(label, depth=22)
    return image, label

# The datasets yield single (image, label) pairs, but Keras treats each
# element of a tf.data dataset as a batch. Batch them so the model receives
# (batch, 224, 224, 3) inputs. `batch_size` is set in the preprocessing cell.
train_dataset = train_dataset.map(one_hot_encode).batch(batch_size)
val_dataset = val_dataset.map(one_hot_encode).batch(batch_size)

**Data augmentation (if needed)**

In [None]:
# Placeholder: no augmentation is applied; training uses the datasets built in
# the previous section as they are.
# (A lone ")" in this cell raised a SyntaxError and stopped "Run All".)

**Model initialization**

In [None]:
# ResNet50 backbone with ImageNet weights and no classification top, followed
# by a three-stage dense head and a softmax classifier.
input_shape = (224, 224, 3)
number_of_classes = 22

base_model = ResNet50(weights='imagenet', include_top=False, input_shape=input_shape)

# Mark every backbone layer as trainable so the whole network is fine-tuned.
for backbone_layer in base_model.layers:
    backbone_layer.trainable = True

X = Flatten()(base_model.output)

# Each head stage is Dense -> Dropout -> BatchNormalization -> ReLU.
for hidden_units in (512, 128, 16):
    X = Dense(hidden_units, kernel_initializer='he_uniform')(X)
    X = Dropout(0.4)(X)
    X = BatchNormalization()(X)
    X = Activation('relu')(X)

# One softmax output per class.
output = Dense(number_of_classes, activation='softmax')(X)

model1 = Model(inputs=base_model.input, outputs=output)

**Training hyperparameters**

In [None]:
# Training configuration: Adam with a small learning rate, categorical
# cross-entropy loss, and two callbacks that both watch the validation loss.
n_epoch = 50

optimizer = Adam(learning_rate=1e-5)
model1.compile(
    optimizer=optimizer,
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# Stop after 20 epochs without improvement and restore the best weights.
early_stop = EarlyStopping(
    monitor='val_loss',
    patience=20,
    verbose=1,
    mode='auto',
    restore_best_weights=True,
)

# Multiply the learning rate by 0.1 after 5 epochs without improvement.
reduce_lr = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.1,
    patience=5,
    verbose=1,
    mode='auto',
)

**Training the model**

In [None]:
# Train the model with early stopping and learning-rate reduction on plateau.
#
# `shuffle`, `use_multiprocessing` and `workers` are not passed:
#   * `shuffle` is ignored when the input is a tf.data.Dataset;
#   * `workers` / `use_multiprocessing` apply only to generator or
#     keras.utils.Sequence input, and newer Keras releases reject them.
# The epoch count comes from `n_epoch`, set in the hyperparameter cell,
# instead of repeating the literal.
history1 = model1.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=n_epoch,
    callbacks=[early_stop, reduce_lr],
    verbose=1,
)
