In [None]:
# Mount Google Drive at /content/drive so the CSV and image folder used below are readable.
# `google.colab` is provided by the Colab runtime; this cell is not expected to run elsewhere.
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Input locations on the mounted Drive (both live under MyDrive/Lectures).
# Folder that holds the face images; the trailing slash is part of the value.
images_folder: str = '/content/drive/MyDrive/Lectures/Train select/'
# CSV label table (columns: file, age, gender, race, service_test).
csv_path: str = '/content/drive/MyDrive/Lectures/fairface_filtered_10000.csv'


In [None]:
import pandas as pd

# Read the label table from Drive into a DataFrame; every later cell works on `data`.
data = pd.read_csv(csv_path)

# Sanity check: print the column index and the first five rows.
print("Columns in CSV:", data.columns)
print(data.head())


Columns in CSV: Index(['file', 'age', 'gender', 'race', 'service_test'], dtype='object')
    file  age  gender  race  service_test
0  27327    1       1     2         False
1  63584    0       0     1         False
2  35860    6       0     4          True
3  83767    4       1     2          True
4  66866    1       0     3         False


In [None]:
# Normalise the 'file' column in a single chained pass:
#   1. cast to str (the CSV stores bare numeric ids),
#   2. trim surrounding whitespace,
#   3. turn any literal '.jpg.jpg' into '.jpg' (plain substring match, not a regex).
data['file'] = (
    data['file']
    .astype(str)
    .str.strip()
    .str.replace('.jpg.jpg', '.jpg', regex=False)
)

# Show the first few cleaned names
print("Updated file names in CSV:")
print(data['file'].head())


Updated file names in CSV:
0    27327
1    63584
2    35860
3    83767
4    66866
Name: file, dtype: object


In [None]:
import os

# List the entries of the image folder (names only; os.listdir returns them in arbitrary order)
image_files = os.listdir(images_folder)
# Show ten names to see the naming pattern, plus the total entry count
print(f"Sample files in folder: {image_files[:10]}")
print(f"Number of files in folder: {len(image_files)}")


Sample files in folder: ['43148.jpg', '47963.jpg', '61664.jpg', '74849.jpg', '79333.jpg', '71931.jpg', '16303.jpg', '15517.jpg', '30797.jpg', '77430.jpg']
Number of files in folder: 8652


In [None]:
# Flag each CSV row by whether <images_folder>/<file> exists on disk.
# One filesystem lookup per row, in row order.
data['exists_in_folder'] = [
    os.path.exists(os.path.join(images_folder, name))
    for name in data['file']
]

# Tally how many rows were found vs. not found
print(data['exists_in_folder'].value_counts())


exists_in_folder
False    8781
Name: count, dtype: int64


In [None]:
# Distinct file names listed in the CSV, in order of first appearance
csv_files = data['file'].unique()
# Compare these against the folder listing to spot naming differences
print(f"Sample file names in CSV: {csv_files[:10]}")
print(f"Total unique files in CSV: {len(csv_files)}")


Sample file names in CSV: ['27327' '63584' '35860' '83767' '66866' '46466' '77714' '19719' '81363'
 '82832']
Total unique files in CSV: 8781


In [None]:
# Entry names present in the image folder (names only, no directory prefix),
# printed next to the CSV sample so the two naming schemes can be compared
folder_files = os.listdir(images_folder)
print(f"Sample file names in folder: {folder_files[:10]}")
print(f"Total files in folder: {len(folder_files)}")


Sample file names in folder: ['43148.jpg', '47963.jpg', '61664.jpg', '74849.jpg', '79333.jpg', '71931.jpg', '16303.jpg', '15517.jpg', '30797.jpg', '77430.jpg']
Total files in folder: 8652


In [None]:
# Add the '.jpg' extension so the CSV names match the files on disk.
# Only names that do not already end in '.jpg' are changed, which makes the cell
# safe to re-run: a second execution no longer produces '27327.jpg.jpg'.
file_names = data['file'].astype(str)
data['file'] = file_names.where(file_names.str.endswith('.jpg'), file_names + '.jpg')

# Display a sample of the updated file names
print(data['file'].head())


0    27327.jpg
1    63584.jpg
2    35860.jpg
3    83767.jpg
4    66866.jpg
Name: file, dtype: object


In [None]:
# Repeat the on-disk check now that the names carry the '.jpg' extension:
# first build each candidate path, then test it.
candidate_paths = data['file'].map(lambda name: os.path.join(images_folder, name))
data['exists_in_folder'] = candidate_paths.map(os.path.exists)

# Tally how many rows were found vs. not found
print(data['exists_in_folder'].value_counts())


exists_in_folder
True     7686
False    1095
Name: count, dtype: int64


In [None]:
# Rows whose image was not found on disk
is_missing = ~data['exists_in_folder']
missing_files = data.loc[is_missing]

# Write only the 'file' column of those rows to a log on Drive (no index column)
missing_files.to_csv(
    '/content/drive/MyDrive/Lectures/missing_files_log.csv',
    columns=['file'],
    index=False,
)
print("Missing files logged to 'missing_files_log.csv'")


Missing files logged to 'missing_files_log.csv'


In [None]:
# Keep only the rows whose image is on disk, then renumber them 0..n-1
rows_on_disk = data.loc[data['exists_in_folder']]
valid_data = rows_on_disk.reset_index(drop=True)

# Report how many rows survived and preview them
print(f"Number of valid rows: {len(valid_data)}")
print(valid_data.head())


Number of valid rows: 7686
        file  age  gender  race  service_test  exists_in_folder
0  27327.jpg    1       1     2         False              True
1  63584.jpg    0       0     1         False              True
2  35860.jpg    6       0     4          True              True
3  83767.jpg    4       1     2          True              True
4  66866.jpg    1       0     3         False              True


In [None]:
# Attach the full on-disk path of every image (folder + file name)
valid_data['file_path'] = [
    os.path.join(images_folder, name) for name in valid_data['file']
]

# Preview the columns that feed the training pipeline
print(valid_data[['file_path', 'age', 'gender', 'race']].head())


                                           file_path  age  gender  race
0  /content/drive/MyDrive/Lectures/Train select/2...    1       1     2
1  /content/drive/MyDrive/Lectures/Train select/6...    0       0     1
2  /content/drive/MyDrive/Lectures/Train select/3...    6       0     4
3  /content/drive/MyDrive/Lectures/Train select/8...    4       1     2
4  /content/drive/MyDrive/Lectures/Train select/6...    1       0     3


In [None]:
import tensorflow as tf

# Inputs for the tf.data pipeline: one path and one [age, gender, race] row per image
image_paths = valid_data['file_path'].values
labels = valid_data[['age', 'gender', 'race']].values


def load_image(file_path, label):
    """Read one JPEG from disk and return it as a 224x224 RGB float tensor.

    Pixel values are deliberately left in the [0, 255] range: the Keras
    EfficientNet models used in this notebook include their own input
    rescaling and expect unscaled pixels, so dividing by 255 here would
    scale the input twice.

    Args:
        file_path: scalar string tensor holding the path of a JPEG file.
        label: label tensor for that image; returned unchanged.

    Returns:
        (img, label), where img is a float32 tensor of shape (224, 224, 3).
    """
    img = tf.io.read_file(file_path)
    img = tf.image.decode_jpeg(img, channels=3)  # force 3 channels (RGB)
    img = tf.image.resize(img, (224, 224))  # bilinear resize; result is float32
    return img, label


# Create a TensorFlow dataset. Images are decoded in parallel; map() keeps the
# element order deterministic by default, so batches are unchanged.
dataset = tf.data.Dataset.from_tensor_slices((image_paths, labels))
dataset = (
    dataset
    .map(load_image, num_parallel_calls=tf.data.AUTOTUNE)
    .batch(32)
    .prefetch(tf.data.AUTOTUNE)
)


In [None]:
from sklearn.model_selection import train_test_split

# 80/20 hold-out split of paths and labels, with a fixed seed so the split is repeatable
train_paths, val_paths, train_labels, val_labels = train_test_split(
    image_paths, labels, random_state=42, test_size=0.2
)


def _to_batched_dataset(paths, targets):
    """Slice (paths, targets), decode each image with load_image, batch by 32 and prefetch."""
    slices = tf.data.Dataset.from_tensor_slices((paths, targets))
    return slices.map(load_image).batch(32).prefetch(tf.data.AUTOTUNE)


# NOTE(review): train_dataset / val_dataset are rebuilt from one-hot labels in a later cell
# before any training happens.
train_dataset = _to_batched_dataset(train_paths, train_labels)
val_dataset = _to_batched_dataset(val_paths, val_labels)


In [None]:
import tensorflow as tf
from tensorflow.keras.applications import EfficientNetB0
from tensorflow.keras import layers, models

# ImageNet-pretrained EfficientNetB0 without its classifier head; global average
# pooling turns the last feature map into one vector per image.
base_model = EfficientNetB0(
    weights='imagenet',
    include_top=False,
    pooling='avg',
    input_shape=(224, 224, 3),
)

# Keep the pretrained weights fixed while the new head is trained
base_model.trainable = False

# Stack a small classification head on top of the frozen backbone
model = models.Sequential()
model.add(base_model)
model.add(layers.Dense(256, activation='relu'))
model.add(layers.Dropout(0.5))
# NOTE(review): a single 3-way softmax does not match the (age, gender, race) label
# triple built earlier; the following cell rebinds `model` to a three-head network.
model.add(layers.Dense(3, activation='softmax'))

# Adam optimizer, categorical cross-entropy loss, accuracy metric
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Print layers, output shapes and parameter counts
model.summary()


Downloading data from https://storage.googleapis.com/keras-applications/efficientnetb0_notop.h5
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 efficientnetb0 (Functional  (None, 1280)              4049571   
 )                                                               
                                                                 
 dense (Dense)               (None, 256)               327936    
                                                                 
 dropout (Dropout)           (None, 256)               0         
                                                                 
 dense_1 (Dense)             (None, 3)                 771       
                                                                 
Total params: 4378278 (16.70 MB)
Trainable params: 328707 (1.25 MB)
Non-trainable params: 4049571 (15.45 MB)
_________________________________________________

In [None]:
from tensorflow.keras import Model, Input
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.applications import EfficientNetB0

# Define the input
inputs = Input(shape=(224, 224, 3))

# Backbone (EfficientNetB0). Keep a handle on the model object itself so it can be
# frozen for the first training phase; the later cell that sets layer.trainable = True
# then really unfreezes it for fine-tuning.
backbone = EfficientNetB0(include_top=False, input_shape=(224, 224, 3), pooling='avg', weights='imagenet')
backbone.trainable = False  # phase 1: train only the three new heads

# training=False keeps the backbone's BatchNormalization layers in inference mode,
# including after it is unfrozen, as recommended for transfer learning.
# `base_model` keeps its original meaning in this cell: the pooled feature tensor.
base_model = backbone(inputs, training=False)

# One softmax head per task, all reading the same pooled features.
# NOTE(review): the label summary at the end of the notebook shows age values 0-7 only;
# the unit counts here must stay in sync with the num_classes passed to to_categorical.
age_output = Dense(10, activation='softmax', name='age_output')(base_model)  # Adjust for 10 age groups
gender_output = Dense(2, activation='softmax', name='gender_output')(base_model)  # Binary classification
race_output = Dense(7, activation='softmax', name='race_output')(base_model)  # Adjust for 7 race categories

# Create the multi-output model; output order is (age, gender, race)
model = Model(inputs=inputs, outputs=[age_output, gender_output, race_output])

# Compile: one categorical cross-entropy loss and one accuracy metric per head,
# keyed by the head's layer name.
model.compile(
    optimizer='adam',
    loss={
        'age_output': 'categorical_crossentropy',
        'gender_output': 'categorical_crossentropy',
        'race_output': 'categorical_crossentropy',
    },
    metrics={
        'age_output': 'accuracy',
        'gender_output': 'accuracy',
        'race_output': 'accuracy',
    }
)

# Display the model summary
model.summary()


Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_2 (InputLayer)        [(None, 224, 224, 3)]        0         []                            
                                                                                                  
 efficientnetb0 (Functional  (None, 1280)                 4049571   ['input_2[0][0]']             
 )                                                                                                
                                                                                                  
 age_output (Dense)          (None, 10)                   12810     ['efficientnetb0[0][0]']      
                                                                                                  
 gender_output (Dense)       (None, 2)                    2562      ['efficientnetb0[0][0]']  

In [None]:
from tensorflow.keras.utils import to_categorical

# Width of each one-hot encoding; must match the unit count of the corresponding model head
NUM_CLASSES = {'age': 10, 'gender': 2, 'race': 7}

# One-hot encode every task column of valid_data (dict order: age, gender, race)
age_labels, gender_labels, race_labels = (
    to_categorical(valid_data[column], num_classes=width)
    for column, width in NUM_CLASSES.items()
)

# Per-task label arrays collected in model-output order
labels = [age_labels, gender_labels, race_labels]


In [None]:
# Group the three one-hot vectors of each image into one (age, gender, race) tuple,
# so a single train/validation split keeps them together with their image path.
paired_labels = [
    (age_vec, gender_vec, race_vec)
    for age_vec, gender_vec, race_vec in zip(age_labels, gender_labels, race_labels)
]


In [None]:
from sklearn.model_selection import train_test_split

# 80/20 hold-out split of image paths and their (age, gender, race) label tuples,
# using a fixed seed so the split is repeatable.
split_parts = train_test_split(image_paths, paired_labels, random_state=42, test_size=0.2)
train_paths, val_paths, train_labels, val_labels = split_parts


In [None]:
# Transpose the per-sample (age, gender, race) tuples into three per-task sequences
train_age_labels, train_gender_labels, train_race_labels = zip(*train_labels)
val_age_labels, val_gender_labels, val_race_labels = zip(*val_labels)

# Rebind train_labels / val_labels as [age, gender, race], each a list of one-hot vectors.
# NOTE(review): this overwrites the split output, so the cell should not be run twice in a row.
train_labels = [list(task) for task in (train_age_labels, train_gender_labels, train_race_labels)]
val_labels = [list(task) for task in (val_age_labels, val_gender_labels, val_race_labels)]


In [None]:
def load_image(file_path, labels):
    """Read one JPEG from disk and return it as a 224x224 RGB float tensor.

    Pixel values are deliberately left in the [0, 255] range: the Keras
    EfficientNetB0 backbone includes its own input rescaling and expects
    unscaled pixels, so dividing by 255 here would scale the input twice.

    Args:
        file_path: scalar string tensor holding the path of a JPEG file.
        labels: tuple of per-task label tensors (age, gender, race) for
            that image; returned unchanged.

    Returns:
        (img, labels), where img is a float32 tensor of shape (224, 224, 3).
    """
    img = tf.io.read_file(file_path)
    img = tf.image.decode_jpeg(img, channels=3)  # force 3 channels (RGB)
    img = tf.image.resize(img, (224, 224))  # bilinear resize; result is float32
    return img, labels

# Training pipeline: shuffle the cheap (path, labels) records before decoding so every
# epoch sees a new order, then decode in parallel, batch and prefetch.
train_dataset = tf.data.Dataset.from_tensor_slices((train_paths, tuple(train_labels)))
train_dataset = (
    train_dataset
    .shuffle(len(train_paths))
    .map(load_image, num_parallel_calls=tf.data.AUTOTUNE)
    .batch(32)
    .prefetch(tf.data.AUTOTUNE)
)

# Validation pipeline: same steps without shuffling, so evaluation order is stable.
val_dataset = tf.data.Dataset.from_tensor_slices((val_paths, tuple(val_labels)))
val_dataset = (
    val_dataset
    .map(load_image, num_parallel_calls=tf.data.AUTOTUNE)
    .batch(32)
    .prefetch(tf.data.AUTOTUNE)
)


In [None]:
# First training run: up to 40 passes over the training set, evaluating on the
# validation set after every epoch. The per-epoch metrics are kept in `history`.
INITIAL_EPOCHS = 40
history = model.fit(train_dataset, validation_data=val_dataset, epochs=INITIAL_EPOCHS)


Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40

KeyboardInterrupt: 

In [None]:
# Re-print the architecture of the current `model` (layers, output shapes, parameter counts)
model.summary()


Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_2 (InputLayer)        [(None, 224, 224, 3)]        0         []                            
                                                                                                  
 efficientnetb0 (Functional  (None, 1280)                 4049571   ['input_2[0][0]']             
 )                                                                                                
                                                                                                  
 age_output (Dense)          (None, 10)                   12810     ['efficientnetb0[0][0]']      
                                                                                                  
 gender_output (Dense)       (None, 2)                    2562      ['efficientnetb0[0][0]']  

In [None]:
# Persist the model in its current state to Drive as a single .h5 file, so the
# weights reached before training was interrupted are not lost.
model.save('/content/drive/MyDrive/Lectures/interrupted_model.h5')
print("Model saved successfully!")


  saving_api.save_model(


Model saved successfully!


In [None]:
# Recompile with a much smaller Adam learning rate (1e-5), keeping one categorical
# cross-entropy loss and one accuracy metric per output head.
# NOTE(review): the model is compiled again with identical settings after the layers
# are marked trainable below, so this call looks redundant.
head_names = ('age_output', 'gender_output', 'race_output')
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5),
    loss={head: 'categorical_crossentropy' for head in head_names},
    metrics={head: 'accuracy' for head in head_names},
)


In [None]:
# Mark every top-level layer of the model as trainable (unfreeze for fine-tuning).
# The model is recompiled in the next cell so that the change takes effect.
for layer in model.layers:
    layer.trainable = True


In [None]:
# Compile for fine-tuning now that all layers are trainable: Adam with a small
# learning rate (1e-5), and the same loss / metric for each of the three heads.
output_heads = ['age_output', 'gender_output', 'race_output']
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5),
    loss=dict.fromkeys(output_heads, 'categorical_crossentropy'),
    metrics=dict.fromkeys(output_heads, 'accuracy'),
)


In [None]:
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

# Both callbacks watch the validation loss:
#   - ReduceLROnPlateau multiplies the learning rate by 0.1 after 3 epochs without improvement;
#   - EarlyStopping halts after 5 such epochs and restores the best weights seen.
reduce_lr = ReduceLROnPlateau(monitor='val_loss', patience=3, factor=0.1)
early_stopping = EarlyStopping(monitor='val_loss', restore_best_weights=True, patience=5)
fine_tune_callbacks = [early_stopping, reduce_lr]

# Fine-tuning run: at most 10 epochs
history = model.fit(
    train_dataset,
    epochs=10,
    callbacks=fine_tune_callbacks,
    validation_data=val_dataset,
)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
# Persist the fine-tuned model to Drive as a single .h5 (HDF5) file
model.save('/content/drive/MyDrive/Lectures/fine_tuned_model_epoch10.h5')
print("Model saved as HDF5 format!")


Model saved as HDF5 format!


In [None]:
import pickle

# Capture the two pieces needed to rebuild the network: its architecture as a JSON
# string and its weights as a list of arrays.
architecture_json = model.to_json()
weight_arrays = model.get_weights()
model_data = dict(model_config=architecture_json, model_weights=weight_arrays)

# Serialise both into one .pkl file on Drive.
# NOTE: pickle files should only ever be loaded from a trusted source.
with open('/content/drive/MyDrive/Lectures/fine_tuned_model_epoch10.pkl', 'wb') as pkl_file:
    pickle.dump(model_data, pkl_file)

print("Model saved as a .pkl file!")


Model saved as a .pkl file!


In [None]:
# Class balance of the age and race labels over all valid rows
# (valid_data holds the training and validation rows together)
for column in ('age', 'race'):
    print(valid_data[column].value_counts())


age
3    991
4    970
1    968
7    966
5    962
6    956
2    944
0    929
Name: count, dtype: int64
race
1    1120
3    1111
6    1105
5    1105
0    1104
2    1098
4    1043
Name: count, dtype: int64


In [None]:
# Preview the final table
print(valid_data.head())

# List which label values actually occur for age and race
for caption, column in (("Unique age labels:", 'age'), ("Unique race labels:", 'race')):
    print(caption, valid_data[column].unique())


        file  age  gender  race  service_test  exists_in_folder  \
0  27327.jpg    1       1     2         False              True   
1  63584.jpg    0       0     1         False              True   
2  35860.jpg    6       0     4          True              True   
3  83767.jpg    4       1     2          True              True   
4  66866.jpg    1       0     3         False              True   

                                           file_path  
0  /content/drive/MyDrive/Lectures/Train select/2...  
1  /content/drive/MyDrive/Lectures/Train select/6...  
2  /content/drive/MyDrive/Lectures/Train select/3...  
3  /content/drive/MyDrive/Lectures/Train select/8...  
4  /content/drive/MyDrive/Lectures/Train select/6...  
Unique age labels: [1 0 6 4 7 2 5 3]
Unique race labels: [2 1 4 3 0 6 5]
