In [None]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

import tensorflow as tf
from tensorflow import keras
from keras.preprocessing.image import load_img, img_to_array
from keras.layers import Input, Dense, Dropout, Conv2D, MaxPool2D, Flatten
from keras.models import Model
from keras.optimizers import Adadelta, Adam, SGD
from keras.losses import binary_crossentropy
from keras.regularizers import l1
from keras.backend import clear_session
from keras.callbacks import TensorBoard
from keras.initializers import glorot_normal
from keras.preprocessing.image import ImageDataGenerator

In [None]:
# Root folder holding groundtruth.tsv and the imgs/ sub-folder. Set the
# AD_CLICK_DATA_DIR environment variable to run the notebook on another machine;
# when it is unset the original author's local folder is used.
data_dir = os.environ.get(
    'AD_CLICK_DATA_DIR',
    r'C:\Users\marno\ibs\GreenfoxIBS\Deep Learning Home Assignment\Data',
)

# Tab-separated ground-truth table.
df = pd.read_csv(os.path.join(data_dir, 'groundtruth.tsv'), sep='\t')

# Image folder as a string WITH a trailing separator: a later cell builds file
# paths by plain string concatenation (images + log_id + '.png').
images = os.path.join(data_dir, 'imgs', '')

## The groundtruth.tsv labels

#### The (tab-delimited) groundtruth.tsv file has 4 columns:


* **user_id:** (string) Participant's ID.

* **ad_clicked:** (int) Whether the participant clicked on the ad (1) or not (0).

* **attention:** (int) Self-reported attention score on a 1-5 Likert-type scale (1 denotes no attention).

* **log_id:** (string) Mouse tracking log ID.

In [None]:
# Preview the first rows of the ground-truth table.
df.head()

In [None]:
# Print the summary (columns, dtypes, non-null counts) of the ground-truth table.
df.info()

In [None]:
# Collect the log IDs for which an image exists in the imgs folder.
# Only *.png entries are kept and the extension is removed with splitext, so
# entries with another extension (or no extension) can no longer produce
# truncated, bogus IDs the way a blind `name[:-4]` slice did.
img_files = [
    stem
    for stem, ext in map(os.path.splitext, os.listdir(images))
    if ext.lower() == '.png'
]

# Convert df log_id to str so it can be compared with the file-name stems
df['log_id'] = df['log_id'].astype(str)

In [None]:
# Keep only the rows whose log_id has a matching image file in the imgs folder
has_image = df['log_id'].isin(img_files)
df = df[has_image]

In [None]:

# Preview three sampled "clicked" and three sampled "not clicked" images in a
# 2x3 grid. `images` is the path of the folder that contains the PNG files.

plt.figure(figsize=(15, 10))
clicked_images = df[df['ad_clicked'] == 1].sample(3, random_state=42)
not_clicked_images = df[df['ad_clicked'] == 0].sample(3, random_state=42)
preview_rows = pd.concat([clicked_images, not_clicked_images])

for position, (row_label, row) in enumerate(preview_rows.iterrows(), start=1):
    plt.subplot(2, 3, position)

    # Image file name is the log ID plus the .png extension
    file_name = str(row['log_id']) + '.png'
    plt.imshow(plt.imread(os.path.join(images, file_name)))

    plt.title(f"Log ID: {row['log_id']}\nAd Clicked: {row['ad_clicked']}", fontsize=14)
    plt.axis('off')

plt.show()

In [None]:
# Copy the ad_clicked column out of the frame as a standalone numpy array
labels = df['ad_clicked'].to_numpy(copy=True)

In [None]:
# Add a column with the full path of each row's image (folder + log_id + '.png').
# `assign` returns a new frame instead of writing into `df` in place: `df` is the
# result of a boolean filter in an earlier cell, and in-place column assignment
# on such a frame can raise pandas' SettingWithCopyWarning.
df = df.assign(image_path=images + df['log_id'] + '.png')

In [None]:
# Write the current table to a new CSV file (row index omitted)
output_csv = 'groundtruth_with_images.csv'
df.to_csv(output_csv, index=False)

In [None]:
'''from keras.preprocessing.image import ImageDataGenerator
from sklearn.model_selection import train_test_split

# Assuming 'df' is your DataFrame with 'image', and 'ad_clicked' columns

# Split the data into training, validation, and test sets
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
train_df, val_df = train_test_split(train_df, test_size=0.2, random_state=42)

# Convert numerical labels to strings
train_df['ad_clicked'] = train_df['ad_clicked'].astype(str)
val_df['ad_clicked'] = val_df['ad_clicked'].astype(str)
test_df['ad_clicked'] = test_df['ad_clicked'].astype(str)


from keras.applications.resnet50 import preprocess_input


datagen = ImageDataGenerator(
    rescale=1./255,
    preprocessing_function=preprocess_input,
    validation_split=0.2,
    rotation_range=20,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.1,
    horizontal_flip=True,
    fill_mode='nearest'
)

# Flow from DataFrame for training data
train_generator = datagen.flow_from_dataframe(
    dataframe=train_df,
    directory=images,  # Path to the folder containing images
    x_col='image_path',  # Column containing image filenames
    y_col='ad_clicked',  # Column containing labels
    target_size=(224, 224),  # Resize images to (224, 224)
    batch_size=16,
    class_mode='binary',  # 'binary' for binary classification
    shuffle=True,
    seed=42,
    subset='training'  # Specify training subset
)

# Flow from DataFrame for validation data
val_generator = datagen.flow_from_dataframe(
    dataframe=train_df,  # Use the same DataFrame as training, as it uses the 'subset' parameter
    directory=images,
    x_col='image_path',
    y_col='ad_clicked',
    target_size=(224, 224),
    batch_size=16,
    class_mode='binary',
    shuffle=False,
    seed=43,    # Use different seed for validation
    subset='validation'  # Specify validation subset
)

# Flow from DataFrame for testing data
test_generator = datagen.flow_from_dataframe(
    dataframe=test_df,
    directory=images,
    x_col='image_path',
    y_col='ad_clicked',
    target_size=(224, 224),
    batch_size=16,
    class_mode='binary',
    shuffle=False
)
'''

In [None]:
'''from keras.applications import MobileNetV2
from keras.models import Sequential
from keras.layers import Dense, Dropout, GlobalAveragePooling2D
from keras.optimizers import Adam
from keras.callbacks import TensorBoard
from keras.preprocessing.image import ImageDataGenerator

# Assuming you have train_generator, val_generator, and test_generator as defined earlier

def create_mobilenet_model():
    base_model = MobileNetV2(weights='imagenet', include_top=False, input_shape=(224, 224, 3))

    model = Sequential()
    model.add(base_model)
    model.add(GlobalAveragePooling2D())
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(1, activation='sigmoid'))

    for layer in base_model.layers:
        layer.trainable = False

    model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])

    return model

mobilenet_model = create_mobilenet_model()

# Create TensorBoard callback
tensorboard = TensorBoard(log_dir='logs')

# Fit the model with TensorBoard callback
history_mobilenet = mobilenet_model.fit(
    x=train_generator,
    steps_per_epoch=len(train_generator),
    epochs=12,
    validation_data=val_generator,
    validation_steps=len(val_generator),
    callbacks=[tensorboard]
)

# Evaluate the model on the test set
test_loss, test_accuracy = mobilenet_model.evaluate_generator(
    generator=test_generator,
    steps=len(test_generator)
)

print(f'Test Loss: {test_loss:.4f}')
print(f'Test Accuracy: {test_accuracy * 100:.2f}%')
'''

In [None]:
from keras.preprocessing.image import ImageDataGenerator
from sklearn.model_selection import train_test_split
from keras.applications.resnet50 import preprocess_input

# `df` must provide the 'image_path' and 'ad_clicked' columns.

# Split the data into training and test sets (80 % / 20 %)
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# flow_from_dataframe with class_mode='binary' needs string labels. `assign`
# builds new frames rather than writing into the frames returned by
# train_test_split, which can trigger pandas' SettingWithCopyWarning.
train_df = train_df.assign(ad_clicked=train_df['ad_clicked'].astype(str))
test_df = test_df.assign(ad_clicked=test_df['ad_clicked'].astype(str))

# Pixel preprocessing shared by BOTH generators so that train and test inputs
# are scaled identically.
# NOTE(review): preprocess_input (ResNet50) followed by rescale=1/255 stacks two
# normalisations; kept exactly as before so training inputs are unchanged —
# confirm this combination is intended for the custom CNN.
pixel_preprocessing = dict(
    rescale=1./255,
    preprocessing_function=preprocess_input,
)

# Training data: shared preprocessing plus random augmentation.
datagen = ImageDataGenerator(
    **pixel_preprocessing,
    rotation_range=20,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.1,
    horizontal_flip=True,
    fill_mode='nearest'
)

# Test data: shared preprocessing only. Reusing the augmenting generator here
# would evaluate the model on randomly rotated/shifted/flipped images, making
# the test metrics and the confusion matrix noisy and non-reproducible.
test_datagen = ImageDataGenerator(**pixel_preprocessing)

# Flow from DataFrame for training data
train_generator = datagen.flow_from_dataframe(
    dataframe=train_df,
    directory=images,  # Path to the folder containing images
    x_col='image_path',  # Column containing image file paths
    y_col='ad_clicked',  # Column containing labels
    target_size=(224, 224),  # Resize images to (224, 224)
    batch_size=16,
    class_mode='binary',  # 'binary' for binary classification
    shuffle=True,
    seed=42
)

# Flow from DataFrame for testing data (unshuffled, so predictions line up
# with test_generator.classes)
test_generator = test_datagen.flow_from_dataframe(
    dataframe=test_df,
    directory=images,
    x_col='image_path',
    y_col='ad_clicked',
    target_size=(224, 224),
    batch_size=16,
    class_mode='binary',
    shuffle=False
)


In [None]:
# Create keras cnn model with architecture
from keras import layers
from keras.layers import Input
from keras.layers import Dense
from keras.layers import Dropout
from keras.applications.resnet50 import ResNet50
from keras.models import Model
from keras.optimizers import Adam
from keras.losses import binary_crossentropy
from keras.regularizers import l1
from keras.backend import clear_session
from keras.callbacks import TensorBoard
from keras.initializers import glorot_normal
from keras.layers import MaxPooling2D

def create_model(input_shape=(224, 224, 3), learning_rate=0.001):
    """Build and compile a small CNN for binary (clicked / not clicked) classification.

    Architecture: two Conv2D + MaxPooling2D stages, dropout, a flatten layer,
    two fully connected layers (128 and 64 units) each followed by dropout,
    and a single sigmoid output unit.

    Args:
        input_shape: Shape of one input image, (height, width, channels).
            Defaults to (224, 224, 3).
        learning_rate: Learning rate passed to the Adam optimizer.
            Defaults to 0.001.

    Returns:
        The compiled keras Sequential model (Adam optimizer,
        binary cross-entropy loss, accuracy metric).
    """
    model = keras.Sequential()
    # Convolutional layers
    model.add(Conv2D(32, (3, 3), activation='relu', input_shape=input_shape))
    model.add(MaxPooling2D((2, 2)))

    #model.add(Conv2D(32, (3, 3), activation='relu'))
    #model.add(MaxPooling2D((2, 2)))

    model.add(Conv2D(16, (3, 3), activation='relu'))
    model.add(MaxPooling2D((2, 2)))
    model.add(Dropout(0.3))  # Optional dropout for regularization

    # All layers stay trainable. The former "freeze" loop here only set
    # trainable = True on layers that were already trainable, so it was a
    # no-op that contradicted its own comment and has been removed.

    # Flatten layer to transition from convolutional to fully connected layers
    model.add(Flatten())

    # Fully connected layers
    model.add(Dense(128, activation='relu'))
    model.add(Dropout(0.3))

    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.5))

    # Output layer
    model.add(Dense(1, activation='sigmoid'))  # Binary classification: Clicked or Not Clicked

    # Compile model
    model.compile(
        optimizer=Adam(learning_rate=learning_rate),
        loss=binary_crossentropy,
        metrics=['accuracy'],
    )

    return model

from sklearn.metrics import confusion_matrix

model = create_model()

# Create TensorBoard callback
tensorboard = TensorBoard(log_dir='logs')

# Fit the model with TensorBoard callback.
# Model.fit accepts generators directly; fit_generator / evaluate_generator /
# predict_generator are deprecated and no longer exist in current Keras.
history = model.fit(
    train_generator,
    steps_per_epoch=len(train_generator),
    epochs=10,
    callbacks=[tensorboard]
)

# Evaluate the model on the test set
test_loss, test_accuracy = model.evaluate(
    test_generator,
    steps=len(test_generator)
)

# Predict on the test set and print the confusion matrix.
# The sigmoid output has shape (n, 1); threshold at 0.5 and flatten to a 1-D
# integer label vector so it matches the layout of test_generator.classes.
y_prob = model.predict(test_generator, steps=len(test_generator))
y_pred = (y_prob > 0.5).astype(int).ravel()
y_true = test_generator.classes
cm = confusion_matrix(y_true, y_pred)
print(cm)


print(f'Test Loss: {test_loss:.4f}')
print(f'Test Accuracy: {test_accuracy * 100:.2f}%')



In [None]:
from keras.layers import GlobalAveragePooling2D
from keras.applications import ResNet50
from keras.callbacks import TensorBoard


'''def create_resnet_model():
    base_model = ResNet50(weights='imagenet', include_top=False, input_shape=(224, 224, 3))

    model = keras.Sequential()
    model.add(base_model)
    model.add(GlobalAveragePooling2D())  # Adjust this layer based on your requirements
    model.add(Dense(128, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(1, activation='sigmoid'))

    # Freeze the layers of the pre-trained ResNet50
    for layer in base_model.layers:
        layer.trainable = False

    model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])

    return model

resnet_model = create_resnet_model()

# Create TensorBoard callback
tensorboard = TensorBoard(log_dir='logs')
# Fit the model with TensorBoard callback
history_resnet = resnet_model.fit(
    x=train_generator,  # Input data (features and labels) from the generator
    steps_per_epoch=len(train_generator),
    epochs=8,
    validation_data=val_generator,
    validation_steps=len(val_generator),
    callbacks=[tensorboard]
)


# Evaluate the model on the test set
test_loss, test_accuracy = resnet_model.evaluate_generator(
    generator=test_generator,
    steps=len(test_generator)
)

print(f'Test Loss: {test_loss:.4f}')
print(f'Test Accuracy: {test_accuracy * 100:.2f}%')'''

In [None]:
# Print the layer-by-layer summary of the model built by create_model().
model.summary()

In [None]:
# Show model history
def plot_history(history):
    """Plot per-epoch accuracy and loss curves side by side.

    Args:
        history: Object whose ``history`` attribute is a dict mapping metric
            names to per-epoch value lists. 'accuracy' and 'loss' are
            required; 'val_accuracy' and 'val_loss' are plotted only when
            present (they are absent when training ran without validation
            data). If ``history.history`` is not a dict but itself exposes a
            ``history`` dict (presumably the case for a fitted Keras model),
            it is unwrapped one level.

    Raises:
        KeyError: If 'accuracy' or 'loss' is missing from the metrics dict.
    """
    metrics = history.history
    if not isinstance(metrics, dict):
        # A wrapper (e.g. a model) was passed instead of the History object.
        metrics = metrics.history

    fig, ax = plt.subplots(1, 2, figsize=(15, 5))

    ax[0].plot(metrics['accuracy'], label='Train Accuracy')
    if 'val_accuracy' in metrics:
        ax[0].plot(metrics['val_accuracy'], label='Validation Accuracy')
    ax[0].set_title('Accuracy')
    ax[0].set_xlabel('Epoch')
    ax[0].set_ylim([0, 1])  # Set y-axis to start from 0
    ax[0].legend()

    loss_curves = [metrics['loss']]
    ax[1].plot(metrics['loss'], label='Train Loss')
    if 'val_loss' in metrics:
        loss_curves.append(metrics['val_loss'])
        ax[1].plot(metrics['val_loss'], label='Validation Loss')
    ax[1].set_title('Loss')
    ax[1].set_xlabel('Epoch')
    # Upper limit covers every plotted curve so validation loss is never clipped
    ax[1].set_ylim([0, max(max(curve) for curve in loss_curves)])
    ax[1].legend()

    plt.show()

# Pass the History object returned by the training call, not the model itself:
# plot_history reads the per-epoch metric dict through `.history[...]`.
plot_history(history)