# ParikhSamuolisReclassificationNN Final Project File 3
## date last modified: December 7, 2024
### how to link to github --> https://saturncloud.io/blog/how-to-add-jupyter-notebook-to-github/

# Loading images and necessary functions - run each time

In [1]:
import os
from PIL import Image
import numpy as np
import random
import shutil

# define directories for train and validation sets
root_dir = '/projectnb/ds340/projects/Samuolis_Parikh_Image_Data/'

train_dir = root_dir +"resized_images/train"
validation_dir = root_dir + "resized_images/validation"

train_target = train_dir +"/baldeagle"
train_nontarget = train_dir +"/nonbaldeagle"

val_target = validation_dir +"/baldeagle"
val_nontarget = validation_dir +"/nonbaldeagle"

def load_images_from_folders(folder1, folder2, img_size = (224,224)):
    images = []
    labels = []
    
    # load images from the first folder
    for filename in os.listdir(folder1):
        img_path = os.path.join(folder1, filename)
        try:
            with Image.open(img_path) as img:
                img = img.convert('RGB')
                img = img.resize(img_size)
                images.append(np.array(img))  # convert image to array
                labels.append(1)  # class label for folder1
        except Exception as e:
            print(f"Could not load image {filename} from {folder1}: {e}")

    # load images from the second folder
    for filename in os.listdir(folder2):
        img_path = os.path.join(folder2, filename)
        try:
            with Image.open(img_path) as img:
                img = img.convert('RGB')
                img = img.resize(img_size)
                images.append(np.array(img))
                labels.append(0)  # class label for folder2
        except Exception as e:
            print(f"Could not load image {filename} from {folder2}: {e}")

    # convert lists to NumPy arrays
    
    images = np.array(images)
    labels = np.array(labels)
    
    return images, labels

images_train, label_train = load_images_from_folders(train_target, train_nontarget)
images_val, label_val = load_images_from_folders(val_target, val_nontarget)

In [2]:
## for debugging:
print(images_train.shape, label_train.shape, type(images_train))
print(images_train.min(), images_train.max())  # expected: 0 255, later will normalize
print(f"Initial eagle count: {np.sum(label_train == 1)}")
print(f"Initial noneagle count: {np.sum(label_train == 0)}")

(5200, 224, 224, 3) (5200,) <class 'numpy.ndarray'>
0 255
Initial eagle count: 1300
Initial noneagle count: 3900


In [3]:
def change_labels(labels, percentage):
    random.seed(340)
    label_one_indices = np.where(labels == 1)[0]
    
    n = int(len(label_one_indices) * (percentage / 100))
    
    indices_to_change = np.random.choice(label_one_indices, size=n, replace=False)
    
    labels[indices_to_change] = 0
    
    return labels, indices_to_change

# for example, change 20% of label 1s to label 0
percentage = 0  
# changed_indices
# label_train, changed_indices = change_labels(label_train, percentage)

In [4]:
import os
import warnings
import tensorflow as tf
from tensorflow.keras.applications import VGG16
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Flatten
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import matplotlib.pyplot as plt

# suppress TensorFlow and CUDA logs
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'  # show only errors
warnings.filterwarnings('ignore', category=UserWarning)

# configure Absl logging
import absl.logging
absl.logging.set_verbosity(absl.logging.ERROR)

# set deterministic operations and random seed
tf.keras.utils.set_random_seed(340)
tf.config.experimental.enable_op_determinism()

2024-12-04 19:20:09.920211: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-12-04 19:20:10.837628: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1733358011.321330 2154163 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1733358011.383307 2154163 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-12-04 19:20:12.135864: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

In [5]:
callbacks = [
    tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
]
epochs = 15
# restore best weights make the model be the one that was the best instead of last one
# patience changed from 4-->3

In [6]:
# reload images
images_train, label_train = load_images_from_folders(train_target, train_nontarget)
percentage = 20  
# changed_indices
label_train, changed_indices = change_labels(label_train, percentage)

In [7]:
## for debugging:
print(f"New eagle count: {np.sum(label_train == 1)}")
print(f"New noneagle count: {np.sum(label_train == 0)}")

New eagle count: 1040
New noneagle count: 4160


In [8]:
from tensorflow.keras.layers import Input, Dropout, Concatenate
confidence_init = confidence_init = np.array([.35 if x<.5 else 1 for x in label_train]).reshape(5200,1)
# start with all 1s for confidence
# this doesn't work --- we don't know before hand which indices we aren't confident about, we especially don't know 
# to specifically be less confident for the labels that we changed
# confidence_init[label_train == 0] = 0  # Set confidence to 0 for original 0 labels
# confidence_init[changed_indices] = 0.35  # Set confidence to 0.35 for flipped labels
# confidence_init = confidence_init.reshape(-1, 1)  # Reshape to (N, 1)

# print data statistics
print(f"New eagle count: {np.sum(label_train == 1)}")
print(f"New noneagle count: {np.sum(label_train == 0)}")
print(f"Confidence values: {confidence_init[:10].flatten()}")

New eagle count: 1040
New noneagle count: 4160
Confidence values: [0.35 1.   1.   1.   1.   1.   1.   0.35 1.   1.  ]


In [9]:
# remake models
# we have full confidence if it is a 1, the lower the number the more confident you are in the 0 class -- .999999 vs .00004

base_model = VGG16(weights='imagenet', include_top=False, input_shape=(224, 224, 3))

for layer in base_model.layers:
    layer.trainable = True


# add new fully connected layers for binary classification
image_input = base_model.input
x = base_model.output
x = Flatten()(x)

additional_input = Input(shape=(1,), name="additional_input") 
y = Dense(64, activation='relu')(additional_input) 
y = Dropout(0.1)(y) 

combined = Concatenate()([x, y]) # 2 channels
combined = Dense(256, activation='relu')(combined)
combined = Dense(1, activation='sigmoid')(combined) 

model = Model(inputs=[image_input, additional_input], outputs=combined)

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'], jit_compile=False)

I0000 00:00:1733358036.672705 2154163 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 14784 MB memory:  -> device: 0, name: Tesla V100-SXM2-16GB, pci bus id: 0000:af:00.0, compute capability: 7.0


In [12]:
iteration_list = [] # update done in code
adjusted_indices_list = [] # update done in code
avg_confidences_list = [] # update done in code
totalnumeagles_modelbelievesnoteagles = [] # update done in code
totalnumimages_modelbelievesnoteagles = [] # update done in code
change_ratio_list = [] # update done in code
more_confident_list = [] # update done in code


n_percentage = 5  # % of least confident eagle predictions to adjust
max_iterations = 10 
convergence_tolerance = 0  # stop if change ratio is this
high_conf_threshold = 0.8  # threshold for confident eagle flips
confidence_init = confidence_init = np.array([.35 if x<.5 else 1 for x in label_train]).reshape(5200,1)

num_incorrectly_modified = 0
num_modified = 0
for iteration in range(max_iterations):
    print(f"Iteration {iteration + 1}...")
    iteration_list.append(iteration+1) # updating the lists

    # step 1: train model
    history = model.fit(
        [images_train, confidence_init],
        label_train,
        batch_size=100, # ok this might seem crazy but im wondering if w batch=32 it wasn't encountering enough wrong labels 
        epochs=1,
        # validation_data=([images_val, np.ones((len(images_val), 1))], label_val), # this was a line from chat, replaced w ours instead below
        validation_data=([images_val, label_val.reshape(-1,1)], label_val),
        callbacks=callbacks,
        shuffle = True,
        verbose=0
    )

    # step 2: predict probabilities
    preds = model.predict([images_train, confidence_init]).flatten()

    # step 3: identify least confident eagle predictions
    low_confidence_indices = np.where((label_train == 0) & (preds < 0.5) & (preds != 0))[0] # grabbing indices where label_train is 0 (noneagle), focusing in on the misclassified
    filtered_indices = low_confidence_indices[confidence_init.flatten()[low_confidence_indices]!= 0]
    sorted_indices = filtered_indices[np.argsort(preds[filtered_indices])] # sorts the preds low to high
    to_adjust = sorted_indices[:int(len(sorted_indices) * (n_percentage / 100))] # only grabbing 5% rn of the bottom
    

    # step 4: update confidence for least confident predictions
    if len(to_adjust) > 0:
        confidence_init[to_adjust] = 0  # reduce confidence to 0 for the indices we picked by %
        avg_confidence = np.mean(preds[to_adjust])
        print(f"Adjusted {len(to_adjust)} indices, avg confidence: {avg_confidence:.4f}")
        adjusted_indices_list.append(len(to_adjust)) # updating the lists
        avg_confidences_list.append(round(avg_confidence,4)) # updating the lists
        
    else:
        print("No indices to adjust in this iteration.")
        
    print(f"Changed Indices: {sorted(to_adjust)}")
    print(f"Predictions for Changed Indices: {preds[to_adjust]}")
    wrongly_switched = [x for x in changed_indices if x in to_adjust]
    print("FOR CHECKING -- THE EAGLES WHICH THE MODEL GOT EVEN MORE CONFIDENT WASN'T EAGLES:", wrongly_switched)
    more_confident_list.append(len(wrongly_switched)) # updating the lists
    num_incorrectly_modified += len(wrongly_switched)
    num_modified += len(to_adjust)
    print("Total Number of Eagles the model believes are not eagles", num_incorrectly_modified)
    totalnumeagles_modelbelievesnoteagles.append(num_incorrectly_modified) # updating the lists
    print("Total Number of images the model believes are not eagles", num_modified)
    totalnumimages_modelbelievesnoteagles.append(num_modified) # updating the lists
 

Iteration 1...
[1m  5/163[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m6s[0m 41ms/step 

2024-12-04 19:42:17.564870: E tensorflow/core/framework/node_def_util.cc:676] NodeDef mentions attribute use_unbounded_threadpool which is not in the op definition: Op<name=MapDataset; signature=input_dataset:variant, other_arguments: -> handle:variant; attr=f:func; attr=Targuments:list(type),min=0; attr=output_types:list(type),min=1; attr=output_shapes:list(shape),min=1; attr=use_inter_op_parallelism:bool,default=true; attr=preserve_cardinality:bool,default=false; attr=force_synchronous:bool,default=false; attr=metadata:string,default=""> This may be expected if your graph generating binary is newer  than this binary. Unknown attributes will be ignored. NodeDef: {{node ParallelMapDatasetV2/_15}}


[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 42ms/step
Adjusted 208 indices, avg confidence: 0.0000
Changed Indices: [np.int64(578), np.int64(731), np.int64(1344), np.int64(1349), np.int64(1355), np.int64(1371), np.int64(1372), np.int64(1383), np.int64(1412), np.int64(1429), np.int64(1440), np.int64(1446), np.int64(1454), np.int64(1463), np.int64(1468), np.int64(1488), np.int64(1493), np.int64(1526), np.int64(1528), np.int64(1534), np.int64(1552), np.int64(1565), np.int64(1640), np.int64(1668), np.int64(1676), np.int64(1681), np.int64(1682), np.int64(1723), np.int64(1785), np.int64(1797), np.int64(1800), np.int64(1802), np.int64(1812), np.int64(1821), np.int64(1825), np.int64(1895), np.int64(1922), np.int64(1953), np.int64(1977), np.int64(1982), np.int64(1998), np.int64(2014), np.int64(2108), np.int64(2114), np.int64(2119), np.int64(2122), np.int64(2128), np.int64(2148), np.int64(2151), np.int64(2154), np.int64(2156), np.int64(2173), np.int64(2185), np.int64(2198),

2024-12-04 19:42:49.173021: E tensorflow/core/framework/node_def_util.cc:676] NodeDef mentions attribute use_unbounded_threadpool which is not in the op definition: Op<name=MapDataset; signature=input_dataset:variant, other_arguments: -> handle:variant; attr=f:func; attr=Targuments:list(type),min=0; attr=output_types:list(type),min=1; attr=output_shapes:list(shape),min=1; attr=use_inter_op_parallelism:bool,default=true; attr=preserve_cardinality:bool,default=false; attr=force_synchronous:bool,default=false; attr=metadata:string,default=""> This may be expected if your graph generating binary is newer  than this binary. Unknown attributes will be ignored. NodeDef: {{node ParallelMapDatasetV2/_15}}


[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 42ms/step
Adjusted 197 indices, avg confidence: 0.0001
Changed Indices: [np.int64(530), np.int64(684), np.int64(1302), np.int64(1314), np.int64(1322), np.int64(1325), np.int64(1329), np.int64(1334), np.int64(1376), np.int64(1400), np.int64(1425), np.int64(1476), np.int64(1478), np.int64(1484), np.int64(1507), np.int64(1545), np.int64(1569), np.int64(1570), np.int64(1575), np.int64(1596), np.int64(1622), np.int64(1660), np.int64(1666), np.int64(1688), np.int64(1719), np.int64(1778), np.int64(1837), np.int64(1842), np.int64(1858), np.int64(1859), np.int64(1862), np.int64(1873), np.int64(1928), np.int64(1932), np.int64(1981), np.int64(1984), np.int64(1999), np.int64(2036), np.int64(2061), np.int64(2066), np.int64(2067), np.int64(2073), np.int64(2079), np.int64(2085), np.int64(2095), np.int64(2103), np.int64(2107), np.int64(2144), np.int64(2157), np.int64(2179), np.int64(2209), np.int64(2306), np.int64(2357), np.int64(2369),

2024-12-04 19:43:20.822306: E tensorflow/core/framework/node_def_util.cc:676] NodeDef mentions attribute use_unbounded_threadpool which is not in the op definition: Op<name=MapDataset; signature=input_dataset:variant, other_arguments: -> handle:variant; attr=f:func; attr=Targuments:list(type),min=0; attr=output_types:list(type),min=1; attr=output_shapes:list(shape),min=1; attr=use_inter_op_parallelism:bool,default=true; attr=preserve_cardinality:bool,default=false; attr=force_synchronous:bool,default=false; attr=metadata:string,default=""> This may be expected if your graph generating binary is newer  than this binary. Unknown attributes will be ignored. NodeDef: {{node ParallelMapDatasetV2/_15}}


[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 42ms/step
Adjusted 187 indices, avg confidence: 0.0002
Changed Indices: [np.int64(533), np.int64(564), np.int64(1265), np.int64(1301), np.int64(1307), np.int64(1332), np.int64(1381), np.int64(1388), np.int64(1414), np.int64(1441), np.int64(1455), np.int64(1590), np.int64(1599), np.int64(1606), np.int64(1645), np.int64(1665), np.int64(1689), np.int64(1695), np.int64(1712), np.int64(1739), np.int64(1747), np.int64(1765), np.int64(1776), np.int64(1795), np.int64(1889), np.int64(1921), np.int64(1931), np.int64(1950), np.int64(1987), np.int64(2037), np.int64(2045), np.int64(2072), np.int64(2083), np.int64(2135), np.int64(2150), np.int64(2160), np.int64(2172), np.int64(2201), np.int64(2279), np.int64(2343), np.int64(2355), np.int64(2360), np.int64(2446), np.int64(2464), np.int64(2486), np.int64(2495), np.int64(2496), np.int64(2497), np.int64(2499), np.int64(2555), np.int64(2559), np.int64(2563), np.int64(2571), np.int64(2598),

2024-12-04 19:43:52.504271: E tensorflow/core/framework/node_def_util.cc:676] NodeDef mentions attribute use_unbounded_threadpool which is not in the op definition: Op<name=MapDataset; signature=input_dataset:variant, other_arguments: -> handle:variant; attr=f:func; attr=Targuments:list(type),min=0; attr=output_types:list(type),min=1; attr=output_shapes:list(shape),min=1; attr=use_inter_op_parallelism:bool,default=true; attr=preserve_cardinality:bool,default=false; attr=force_synchronous:bool,default=false; attr=metadata:string,default=""> This may be expected if your graph generating binary is newer  than this binary. Unknown attributes will be ignored. NodeDef: {{node ParallelMapDatasetV2/_15}}


[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 42ms/step
Adjusted 178 indices, avg confidence: 0.0002
Changed Indices: [np.int64(1306), np.int64(1308), np.int64(1311), np.int64(1378), np.int64(1404), np.int64(1413), np.int64(1432), np.int64(1445), np.int64(1449), np.int64(1472), np.int64(1487), np.int64(1502), np.int64(1513), np.int64(1537), np.int64(1594), np.int64(1601), np.int64(1700), np.int64(1722), np.int64(1730), np.int64(1799), np.int64(1930), np.int64(1939), np.int64(1942), np.int64(1969), np.int64(1971), np.int64(1974), np.int64(1985), np.int64(2132), np.int64(2133), np.int64(2141), np.int64(2145), np.int64(2161), np.int64(2206), np.int64(2268), np.int64(2290), np.int64(2300), np.int64(2307), np.int64(2309), np.int64(2319), np.int64(2333), np.int64(2368), np.int64(2372), np.int64(2378), np.int64(2449), np.int64(2471), np.int64(2472), np.int64(2542), np.int64(2558), np.int64(2569), np.int64(2586), np.int64(2608), np.int64(2623), np.int64(2670), np.int64(2679

2024-12-04 19:44:24.209195: E tensorflow/core/framework/node_def_util.cc:676] NodeDef mentions attribute use_unbounded_threadpool which is not in the op definition: Op<name=MapDataset; signature=input_dataset:variant, other_arguments: -> handle:variant; attr=f:func; attr=Targuments:list(type),min=0; attr=output_types:list(type),min=1; attr=output_shapes:list(shape),min=1; attr=use_inter_op_parallelism:bool,default=true; attr=preserve_cardinality:bool,default=false; attr=force_synchronous:bool,default=false; attr=metadata:string,default=""> This may be expected if your graph generating binary is newer  than this binary. Unknown attributes will be ignored. NodeDef: {{node ParallelMapDatasetV2/_15}}


[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 42ms/step
Adjusted 169 indices, avg confidence: 0.0002
Changed Indices: [np.int64(106), np.int64(913), np.int64(1380), np.int64(1423), np.int64(1433), np.int64(1442), np.int64(1453), np.int64(1506), np.int64(1518), np.int64(1548), np.int64(1550), np.int64(1567), np.int64(1623), np.int64(1642), np.int64(1662), np.int64(1678), np.int64(1693), np.int64(1705), np.int64(1709), np.int64(1729), np.int64(1744), np.int64(1768), np.int64(1780), np.int64(1786), np.int64(1826), np.int64(1892), np.int64(1910), np.int64(1911), np.int64(1913), np.int64(1927), np.int64(1937), np.int64(1943), np.int64(1957), np.int64(1962), np.int64(1964), np.int64(2006), np.int64(2008), np.int64(2062), np.int64(2075), np.int64(2093), np.int64(2094), np.int64(2109), np.int64(2121), np.int64(2169), np.int64(2183), np.int64(2202), np.int64(2203), np.int64(2210), np.int64(2218), np.int64(2229), np.int64(2272), np.int64(2278), np.int64(2295), np.int64(2338),

2024-12-04 19:44:55.949902: E tensorflow/core/framework/node_def_util.cc:676] NodeDef mentions attribute use_unbounded_threadpool which is not in the op definition: Op<name=MapDataset; signature=input_dataset:variant, other_arguments: -> handle:variant; attr=f:func; attr=Targuments:list(type),min=0; attr=output_types:list(type),min=1; attr=output_shapes:list(shape),min=1; attr=use_inter_op_parallelism:bool,default=true; attr=preserve_cardinality:bool,default=false; attr=force_synchronous:bool,default=false; attr=metadata:string,default=""> This may be expected if your graph generating binary is newer  than this binary. Unknown attributes will be ignored. NodeDef: {{node ParallelMapDatasetV2/_15}}


[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 42ms/step
Adjusted 161 indices, avg confidence: 0.0002
Changed Indices: [np.int64(329), np.int64(370), np.int64(1330), np.int64(1356), np.int64(1369), np.int64(1397), np.int64(1430), np.int64(1456), np.int64(1470), np.int64(1497), np.int64(1512), np.int64(1541), np.int64(1553), np.int64(1562), np.int64(1568), np.int64(1578), np.int64(1587), np.int64(1611), np.int64(1619), np.int64(1637), np.int64(1684), np.int64(1704), np.int64(1716), np.int64(1754), np.int64(1769), np.int64(1806), np.int64(1822), np.int64(1823), np.int64(1849), np.int64(1870), np.int64(1880), np.int64(1897), np.int64(1899), np.int64(1912), np.int64(2100), np.int64(2120), np.int64(2168), np.int64(2170), np.int64(2180), np.int64(2204), np.int64(2217), np.int64(2224), np.int64(2225), np.int64(2241), np.int64(2264), np.int64(2269), np.int64(2327), np.int64(2387), np.int64(2434), np.int64(2453), np.int64(2463), np.int64(2465), np.int64(2500), np.int64(2522),

2024-12-04 19:45:27.618013: E tensorflow/core/framework/node_def_util.cc:676] NodeDef mentions attribute use_unbounded_threadpool which is not in the op definition: Op<name=MapDataset; signature=input_dataset:variant, other_arguments: -> handle:variant; attr=f:func; attr=Targuments:list(type),min=0; attr=output_types:list(type),min=1; attr=output_shapes:list(shape),min=1; attr=use_inter_op_parallelism:bool,default=true; attr=preserve_cardinality:bool,default=false; attr=force_synchronous:bool,default=false; attr=metadata:string,default=""> This may be expected if your graph generating binary is newer  than this binary. Unknown attributes will be ignored. NodeDef: {{node ParallelMapDatasetV2/_15}}


[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 42ms/step
Adjusted 153 indices, avg confidence: 0.0001
Changed Indices: [np.int64(91), np.int64(255), np.int64(315), np.int64(432), np.int64(865), np.int64(918), np.int64(1361), np.int64(1363), np.int64(1370), np.int64(1389), np.int64(1415), np.int64(1418), np.int64(1477), np.int64(1625), np.int64(1629), np.int64(1701), np.int64(1706), np.int64(1711), np.int64(1796), np.int64(1828), np.int64(1833), np.int64(1926), np.int64(1946), np.int64(1955), np.int64(1976), np.int64(1994), np.int64(2025), np.int64(2031), np.int64(2104), np.int64(2112), np.int64(2131), np.int64(2189), np.int64(2190), np.int64(2284), np.int64(2291), np.int64(2315), np.int64(2317), np.int64(2332), np.int64(2361), np.int64(2375), np.int64(2389), np.int64(2390), np.int64(2393), np.int64(2406), np.int64(2435), np.int64(2438), np.int64(2503), np.int64(2508), np.int64(2512), np.int64(2523), np.int64(2565), np.int64(2573), np.int64(2595), np.int64(2612), np.i

2024-12-04 19:45:59.310209: E tensorflow/core/framework/node_def_util.cc:676] NodeDef mentions attribute use_unbounded_threadpool which is not in the op definition: Op<name=MapDataset; signature=input_dataset:variant, other_arguments: -> handle:variant; attr=f:func; attr=Targuments:list(type),min=0; attr=output_types:list(type),min=1; attr=output_shapes:list(shape),min=1; attr=use_inter_op_parallelism:bool,default=true; attr=preserve_cardinality:bool,default=false; attr=force_synchronous:bool,default=false; attr=metadata:string,default=""> This may be expected if your graph generating binary is newer  than this binary. Unknown attributes will be ignored. NodeDef: {{node ParallelMapDatasetV2/_15}}


[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 42ms/step
Adjusted 145 indices, avg confidence: 0.0002
Changed Indices: [np.int64(351), np.int64(677), np.int64(973), np.int64(1083), np.int64(1105), np.int64(1109), np.int64(1490), np.int64(1515), np.int64(1558), np.int64(1576), np.int64(1610), np.int64(1624), np.int64(1669), np.int64(1679), np.int64(1696), np.int64(1731), np.int64(1782), np.int64(1787), np.int64(1832), np.int64(1835), np.int64(1901), np.int64(1902), np.int64(1907), np.int64(1954), np.int64(1960), np.int64(1973), np.int64(2004), np.int64(2015), np.int64(2023), np.int64(2028), np.int64(2081), np.int64(2126), np.int64(2165), np.int64(2208), np.int64(2231), np.int64(2266), np.int64(2296), np.int64(2313), np.int64(2328), np.int64(2334), np.int64(2335), np.int64(2351), np.int64(2353), np.int64(2384), np.int64(2388), np.int64(2399), np.int64(2481), np.int64(2492), np.int64(2506), np.int64(2583), np.int64(2676), np.int64(2691), np.int64(2704), np.int64(2849), 

2024-12-04 19:46:31.022480: E tensorflow/core/framework/node_def_util.cc:676] NodeDef mentions attribute use_unbounded_threadpool which is not in the op definition: Op<name=MapDataset; signature=input_dataset:variant, other_arguments: -> handle:variant; attr=f:func; attr=Targuments:list(type),min=0; attr=output_types:list(type),min=1; attr=output_shapes:list(shape),min=1; attr=use_inter_op_parallelism:bool,default=true; attr=preserve_cardinality:bool,default=false; attr=force_synchronous:bool,default=false; attr=metadata:string,default=""> This may be expected if your graph generating binary is newer  than this binary. Unknown attributes will be ignored. NodeDef: {{node ParallelMapDatasetV2/_15}}


[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 42ms/step
Adjusted 138 indices, avg confidence: 0.0002
Changed Indices: [np.int64(1324), np.int64(1327), np.int64(1364), np.int64(1373), np.int64(1417), np.int64(1435), np.int64(1437), np.int64(1450), np.int64(1535), np.int64(1555), np.int64(1586), np.int64(1613), np.int64(1627), np.int64(1650), np.int64(1766), np.int64(1838), np.int64(1844), np.int64(1857), np.int64(1876), np.int64(1888), np.int64(1940), np.int64(1949), np.int64(1979), np.int64(1988), np.int64(2000), np.int64(2001), np.int64(2002), np.int64(2046), np.int64(2048), np.int64(2124), np.int64(2163), np.int64(2213), np.int64(2216), np.int64(2220), np.int64(2221), np.int64(2260), np.int64(2262), np.int64(2273), np.int64(2347), np.int64(2456), np.int64(2461), np.int64(2474), np.int64(2477), np.int64(2516), np.int64(2532), np.int64(2575), np.int64(2605), np.int64(2619), np.int64(2640), np.int64(2642), np.int64(2661), np.int64(2669), np.int64(2721), np.int64(2780

2024-12-04 19:47:02.888429: E tensorflow/core/framework/node_def_util.cc:676] NodeDef mentions attribute use_unbounded_threadpool which is not in the op definition: Op<name=MapDataset; signature=input_dataset:variant, other_arguments: -> handle:variant; attr=f:func; attr=Targuments:list(type),min=0; attr=output_types:list(type),min=1; attr=output_shapes:list(shape),min=1; attr=use_inter_op_parallelism:bool,default=true; attr=preserve_cardinality:bool,default=false; attr=force_synchronous:bool,default=false; attr=metadata:string,default=""> This may be expected if your graph generating binary is newer  than this binary. Unknown attributes will be ignored. NodeDef: {{node ParallelMapDatasetV2/_15}}


[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 42ms/step
Adjusted 131 indices, avg confidence: 0.0003
Changed Indices: [np.int64(803), np.int64(829), np.int64(1305), np.int64(1316), np.int64(1339), np.int64(1342), np.int64(1351), np.int64(1443), np.int64(1448), np.int64(1467), np.int64(1525), np.int64(1533), np.int64(1592), np.int64(1670), np.int64(1675), np.int64(1718), np.int64(1732), np.int64(1740), np.int64(1758), np.int64(1773), np.int64(1789), np.int64(1853), np.int64(1878), np.int64(1882), np.int64(1935), np.int64(2009), np.int64(2038), np.int64(2039), np.int64(2059), np.int64(2077), np.int64(2087), np.int64(2105), np.int64(2106), np.int64(2110), np.int64(2166), np.int64(2178), np.int64(2205), np.int64(2207), np.int64(2219), np.int64(2222), np.int64(2288), np.int64(2304), np.int64(2342), np.int64(2401), np.int64(2454), np.int64(2550), np.int64(2620), np.int64(2625), np.int64(2653), np.int64(2658), np.int64(2698), np.int64(2716), np.int64(2737), np.int64(2834),

In [13]:
def top_n_percent_indices(predictions, n_percent):

    target_indices = np.where(label_train == 0)[0]
    filtered_preds = predictions[target_indices]
    
    # calculate the number of top elements to select
    num_top_elements = int(np.ceil(len(filtered_preds) * n_percent / 100))
    
    # get the indices of the sorted values (descending order)
    sorted_indices = np.argsort(filtered_preds)[::-1]
    
    # select the top n_percent indices
    top_indices = sorted_indices[:num_top_elements]

    top_original_indices = target_indices[top_indices]
    
    return top_original_indices
n_percent = 5
high_confidence_indices = top_n_percent_indices(preds,n_percent)
high_confidence_indices
actually_eagles = [x for x in high_confidence_indices if x in changed_indices]
# print("The least confident not-eagles that are actually eagles:", actually_eagles)
print(f"How many actual eagles in top {n_percent}% is: {len(actually_eagles)}")
print(f"How many total are in the top {n_percent}% is: {len(high_confidence_indices)}")

How many actual eagles in top 5% is: 70
How many total are in the top 5% is: 208


### This is now simulating manually looking through the top n% of "non-eagles" to change their label. In the real application you would have someone actually looking at each of these images and checking if they were eagles, but since in this training we just know the which indices are actually eagles, we're just going to change it automatically. The only assumption is that the human is able to actually correctly find all the eagles when actually looking at them. 

In [14]:
label_train[actually_eagles] = 1

data_gen_args = {
    'rescale': 1.0 / 255.0,  # normalize pixel values
}

train_datagen = ImageDataGenerator(**data_gen_args)
validation_datagen = ImageDataGenerator(**data_gen_args)


validation_generator = validation_datagen.flow(
    x = images_val,
    y = label_val,
    batch_size=32,
)

In [15]:
augmented_data_gen_args = {
    'rescale': 1.0 / 255.0,
    'horizontal_flip': True,  # randomly flip images horizontally
    'rotation_range': 15,     # rotate images up to 15 degrees
    'zoom_range': 0.1         # zoom in or out by up to 10%
}

train_augmented_datagen = ImageDataGenerator(**augmented_data_gen_args)

train_augmented_generator = train_augmented_datagen.flow(
    x=images_train,
    y=label_train,
    batch_size=32,
)
# ight try moderate instead - riya

In [16]:
final_model = VGG16(weights='imagenet', include_top=False, input_shape=(224, 224, 3))

for layer in final_model.layers:
    # layer.trainable = True tomas, riya change below
    layer.trainable = False

# add new fully connected layers for binary classification
x = final_model.output
x = Flatten()(x)
x = Dense(256, activation='relu')(x)
x = Dense(1, activation='sigmoid')(x)  # sigmoid for binary 

model = Model(inputs=final_model.input, outputs=x)

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'], jit_compile=False)

In [19]:
from sklearn.metrics import confusion_matrix, classification_report

callbacks = [
    tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
]

print("Simple Augmentation")
print()
history_simple = model.fit(
    train_augmented_generator,
    epochs=epochs,
    validation_data=validation_generator,
    callbacks = callbacks
)

# confusion matrix and classification report
val_preds = (model.predict(images_val) > 0.5).astype(int).flatten()
conf_matrix = confusion_matrix(label_val, val_preds)
class_report = classification_report(label_val, val_preds, target_names=["Not Eagle", "Eagle"])

print("\nConfusion Matrix:\n", conf_matrix)
print("\nClassification Report:\n", class_report)

Simple Augmentation

Epoch 1/15
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 286ms/step - accuracy: 0.7935 - loss: 0.4648 - val_accuracy: 0.7500 - val_loss: 0.4966
Epoch 2/15
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 284ms/step - accuracy: 0.7752 - loss: 0.8281 - val_accuracy: 0.7500 - val_loss: 0.5634
Epoch 3/15
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 284ms/step - accuracy: 0.7886 - loss: 0.5165 - val_accuracy: 0.7500 - val_loss: 0.5668
Epoch 4/15
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 284ms/step - accuracy: 0.7806 - loss: 0.5265 - val_accuracy: 0.7500 - val_loss: 0.5675
Epoch 5/15
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 284ms/step - accuracy: 0.7880 - loss: 0.5167 - val_accuracy: 0.7500 - val_loss: 0.5662
Epoch 6/15
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 284ms/step - accuracy: 0.7889 - loss: 0.5155 - val_accuracy: 0.7500 - val_lo

2024-12-04 20:06:46.306493: E tensorflow/core/framework/node_def_util.cc:676] NodeDef mentions attribute use_unbounded_threadpool which is not in the op definition: Op<name=MapDataset; signature=input_dataset:variant, other_arguments: -> handle:variant; attr=f:func; attr=Targuments:list(type),min=0; attr=output_types:list(type),min=1; attr=output_shapes:list(shape),min=1; attr=use_inter_op_parallelism:bool,default=true; attr=preserve_cardinality:bool,default=false; attr=force_synchronous:bool,default=false; attr=metadata:string,default=""> This may be expected if your graph generating binary is newer  than this binary. Unknown attributes will be ignored. NodeDef: {{node ParallelMapDatasetV2/_14}}


[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step

Confusion Matrix:
 [[150   0]
 [ 50   0]]

Classification Report:
               precision    recall  f1-score   support

   Not Eagle       0.75      1.00      0.86       150
       Eagle       0.00      0.00      0.00        50

    accuracy                           0.75       200
   macro avg       0.38      0.50      0.43       200
weighted avg       0.56      0.75      0.64       200



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
