# ParikhSamuolisReclassificationNN Final Project File 3
## date last modified: Dec 4, 2024
### how to link to github --> https://saturncloud.io/blog/how-to-add-jupyter-notebook-to-github/

# Loading images and necessary functions - run each time

In [None]:
import os
from PIL import Image
import numpy as np
import random
import shutil

# Dataset layout on disk:
#   <root_dir>resized_images/{train,validation}/{baldeagle,nonbaldeagle}
root_dir = '/projectnb/ds340/projects/Samuolis_Parikh_Image_Data/'

# split-level folders (root_dir already ends with a slash)
train_dir = f"{root_dir}resized_images/train"
validation_dir = f"{root_dir}resized_images/validation"

# class-level folders: "baldeagle" is the target class, "nonbaldeagle" the non-target class
train_target, train_nontarget = f"{train_dir}/baldeagle", f"{train_dir}/nonbaldeagle"
val_target, val_nontarget = f"{validation_dir}/baldeagle", f"{validation_dir}/nonbaldeagle"

def load_images_from_folders(folder1, folder2, img_size=(224, 224)):
    """Load two folders of images as a single labelled dataset.

    Every file in ``folder1`` gets label 1 and every file in ``folder2`` gets
    label 0. Each image is converted to RGB and resized to ``img_size`` before
    being turned into a NumPy array. Files that cannot be loaded are reported
    on stdout and skipped, so one bad file does not abort the whole load.

    Files are visited in sorted filename order. ``os.listdir`` returns entries
    in arbitrary order, which would make sample indices (and anything seeded
    that selects by index) differ between machines or runs.

    Args:
        folder1: Directory whose images are labelled 1.
        folder2: Directory whose images are labelled 0.
        img_size: ``(width, height)`` passed to ``Image.resize``.

    Returns:
        ``(images, labels)`` as NumPy arrays of equal length, with all
        ``folder1`` samples first, followed by all ``folder2`` samples.

    Raises:
        OSError: If either folder cannot be listed (e.g. it does not exist).
    """
    images = []
    labels = []

    for folder, label in ((folder1, 1), (folder2, 0)):
        for filename in sorted(os.listdir(folder)):
            img_path = os.path.join(folder, filename)
            try:
                with Image.open(img_path) as img:
                    pixels = np.array(img.convert('RGB').resize(img_size))
            except Exception as e:
                # Deliberately broad: any unreadable or non-image file is
                # reported and skipped rather than stopping the load.
                print(f"Could not load image {filename} from {folder}: {e}")
                continue
            # Append image and label together so the two lists stay aligned.
            images.append(pixels)
            labels.append(label)

    # convert lists to NumPy arrays
    return np.array(images), np.array(labels)

# Build the training and validation arrays: the target (baldeagle) folder is
# passed first and gets label 1, the non-target folder second and gets label 0.
images_train, label_train = load_images_from_folders(
    folder1=train_target,
    folder2=train_nontarget,
)
images_val, label_val = load_images_from_folders(
    folder1=val_target,
    folder2=val_nontarget,
)

In [None]:
## debugging aid: sanity-check the freshly loaded training data
print(f"{images_train.shape} {label_train.shape} {type(images_train)}")
# raw pixel range; expected output is "0 255" (normalization happens later)
print(np.min(images_train), np.max(images_train))
# class balance before any labels are changed
print(
    f"Initial eagle count: {np.sum(label_train == 1)}",
    f"Initial noneagle count: {np.sum(label_train == 0)}",
    sep="\n",
)

In [None]:
def change_labels(labels, percentage):
    """Flip a reproducible random subset of the label-1 entries to 0.

    The subset is drawn with a NumPy generator seeded with 340, so the same
    ``labels`` and ``percentage`` always select the same indices. (Seeding
    Python's ``random`` module does not affect NumPy sampling.) The generator
    is local, so NumPy's global random state is left untouched.

    Args:
        labels: 1-D NumPy array of class labels. Modified in place.
        percentage: Share of the label-1 entries to flip, from 0 to 100. The
            resulting count is truncated to an integer.

    Returns:
        ``(labels, indices_to_change)``: the same ``labels`` array after
        flipping, and the array of indices that were set to 0.

    Raises:
        ValueError: If ``percentage`` is not within ``[0, 100]``.
    """
    if not 0 <= percentage <= 100:
        raise ValueError(f"percentage must be between 0 and 100, got {percentage}")

    label_one_indices = np.where(labels == 1)[0]

    n = int(len(label_one_indices) * (percentage / 100))

    # Fixed-seed local generator makes the selection deterministic.
    rng = np.random.default_rng(340)
    indices_to_change = rng.choice(label_one_indices, size=n, replace=False)

    labels[indices_to_change] = 0

    return labels, indices_to_change

# Percentage of label-1 training samples to flip to label 0 (e.g. 20 flips 20%).
# It is 0 here and the change_labels call below is left commented out, so this
# cell does not change any labels.
percentage = 0  
# label_train, changed_indices = change_labels(label_train, percentage)

In [None]:
import tensorflow as tf
from tensorflow.keras.applications import VGG16
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Flatten
from tensorflow.keras.preprocessing.image import ImageDataGenerator
# Fix the random seed at 340 and ask TensorFlow to use deterministic ops;
# intended to make repeated training runs reproducible.
tf.keras.utils.set_random_seed(340)
tf.config.experimental.enable_op_determinism()

In [None]:
# Number of training epochs (presumably passed to model.fit later -- confirm).
epochs = 15

# Early stopping watches val_loss with patience=3 (lowered from 4).
# restore_best_weights=True makes the final model the best one seen during
# training instead of the one from the last epoch.
callbacks = [
    tf.keras.callbacks.EarlyStopping(
        restore_best_weights=True,
        patience=3,
        monitor='val_loss',
    ),
]

In [None]:
# Reload the training set so label flipping starts from the labels on disk
# (change_labels modifies the label array it is given).
images_train, label_train = load_images_from_folders(
    folder1=train_target,
    folder2=train_nontarget,
)

# Flip 20% of the label-1 samples to 0 and keep track of which indices changed.
percentage = 20
label_train, changed_indices = change_labels(labels=label_train, percentage=percentage)

In [None]:
## debugging aid: class balance after the label flip
print(
    f"New eagle count: {np.sum(label_train == 1)}",
    f"New noneagle count: {np.sum(label_train == 0)}",
    sep="\n",
)

In [None]:
from tensorflow.keras.layers import Input, Dropout, Concatenate
# Per-sample confidence in the training label, shaped (N, 1):
# 1 for samples currently labelled 1, 0.35 for samples currently labelled 0.
# reshape(-1, 1) follows the actual number of samples instead of assuming 5200.
confidence_init = np.where(label_train < 0.5, 0.35, 1.0).reshape(-1, 1)

# Rejected alternative: giving a different confidence to the flipped indices
# (changed_indices) than to the original 0 labels. That does not work, because
# we do not know beforehand which labels we are unsure about -- in particular
# we would not know which labels had been changed.

# print data statistics
print(f"New eagle count: {np.sum(label_train == 1)}")
print(f"New noneagle count: {np.sum(label_train == 0)}")
print(f"Confidence values: {confidence_init[:10].flatten()}")

In [None]:
# Rebuild the model: a VGG16 image branch plus a small branch for one extra
# scalar input per sample, merged into a single sigmoid output.
# Original author note on the extra (confidence) input: we have full confidence
# if it is a 1; the lower the number, the more confident you are in the 0 class
# -- .999999 vs .00004.
# NOTE(review): the wording of that note is ambiguous -- confirm the intended
# meaning against how the confidence values are built.

# VGG16 convolutional base with ImageNet weights and without its own classifier
# head (include_top=False), for 224x224 RGB inputs.
base_model = VGG16(weights='imagenet', include_top=False, input_shape=(224, 224, 3))

# Mark every VGG16 layer as trainable, so the whole base is fine-tuned rather
# than frozen.
for layer in base_model.layers:
    layer.trainable = True


# add new fully connected layers for binary classification
# Image branch: flatten the VGG16 feature maps into one vector per sample.
image_input = base_model.input
x = base_model.output
x = Flatten()(x)

# Extra-input branch: one scalar per sample, passed through a 64-unit ReLU
# layer and dropout with rate 0.1.
additional_input = Input(shape=(1,), name="additional_input") 
y = Dense(64, activation='relu')(additional_input) 
y = Dropout(0.1)(y) 

# Join the two branches, then apply a 256-unit ReLU layer and a single sigmoid
# unit for the binary prediction.
combined = Concatenate()([x, y]) # joins the flattened image features with the 64 extra-input features
combined = Dense(256, activation='relu')(combined)
combined = Dense(1, activation='sigmoid')(combined) 

# Two-input model; inputs must be supplied in the order [image, additional_input].
model = Model(inputs=[image_input, additional_input], outputs=combined)

# Adam optimizer with binary cross-entropy loss, reporting accuracy.
# NOTE(review): jit_compile=False is presumably there to stay compatible with
# the op-determinism setting -- confirm.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'], jit_compile=False)