In [27]:
# Importing Keras 
from keras.models import Sequential                          # Neural network model as a sequence of layers.
from keras.layers import Conv2D                              # Convolutional layer
from keras.layers import MaxPooling2D                        # Max pooling layer 
from keras.layers import Flatten                             # Layer used to flatten 2D arrays for fully-connected layers.
from keras.layers import Dense                               # This layer adds fully-connected layers to the neural network.
from keras.layers import Dropout                             # This serves to prevent overfitting by dropping out a random set of activations.
from keras.layers import BatchNormalization                  # This is used to normalize the activations of the neurons.
from keras.layers import Activation                          # Layer for activation functions
from keras.callbacks import EarlyStopping, ModelCheckpoint   # Classes used to save weights and stop training when improvements reach a limit
from keras.models import load_model                          # This helps us to load trained models
# Preprocessing layers
from keras.layers import Rescaling                           # This layer rescales pixel values

# Importing TensorFlow
import tensorflow as tf

import pandas as pd
import skimage
import os
from skimage import io
import matplotlib.pyplot as plt
from skimage.color import rgb2gray
from skimage.color import rgb2hsv
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
import shutil

In [None]:
# This notebook follows the Kaggle example "Convolutional Neural Network from Scratch":
# https://www.kaggle.com/code/lusfernandotorres/convolutional-neural-network-from-scratch


In [6]:
def balance_dataset(X, y, random_state=None):
    """Undersample a binary-labelled dataset so both classes have equal size.

    The larger class is randomly down-sampled (without replacement) to the
    size of the smaller class.

    Parameters
    ----------
    X : array-like
        Samples, indexed along the first axis (lists are accepted and
        converted to a NumPy array).
    y : array-like of int
        Labels aligned with ``X``. Only entries equal to 0 or 1 are sampled;
        any other value is ignored.
    random_state : int or None, optional
        Seed for a dedicated random generator, making the draw reproducible.
        When ``None`` (default) NumPy's global random state is used, which is
        the original behaviour and still honours ``np.random.seed``.

    Returns
    -------
    tuple of (np.ndarray, np.ndarray)
        ``(X_balanced, y_balanced)``: the sampled class-0 entries followed by
        the sampled class-1 entries. The result is *not* shuffled.

    Raises
    ------
    ValueError
        If ``X`` and ``y`` do not have the same length.
    """
    # Coerce to arrays: with a plain list, `y == 0` is a scalar False and
    # `X[indices]` raises, so list inputs used to fail.
    X = np.asarray(X)
    y = np.asarray(y)
    if len(X) != len(y):
        raise ValueError(
            f"X and y must have the same length, got {len(X)} and {len(y)}"
        )

    # Positions of each class in the input
    class_0_indices = np.where(y == 0)[0]
    class_1_indices = np.where(y == 1)[0]

    # The minority class fixes how many samples are kept per class
    num_samples_per_class = min(len(class_0_indices), len(class_1_indices))

    # A seeded generator gives a reproducible draw; otherwise fall back to
    # the global state exactly as before.
    sampler = np.random if random_state is None else np.random.default_rng(random_state)
    sampled_indices = np.concatenate([
        sampler.choice(class_0_indices, num_samples_per_class, replace=False),
        sampler.choice(class_1_indices, num_samples_per_class, replace=False),
    ])

    X_balanced = X[sampled_indices]
    y_balanced = y[sampled_indices]

    # Report the per-class counts so the balance can be verified at a glance
    print(np.bincount(y_balanced))

    return X_balanced, y_balanced

In [9]:
# Folder that holds every source photo; file names from the CSV are appended to it.
# NOTE(review): absolute, user-specific path — consider making it configurable.
image_dir = '/home/smmrrr/Fog_Imaging_Project/sta_221/all_surfline_photos/'

# Per-photo metadata (label, timestamp, site, ...). The "Unnamed: 0" column is
# discarded — presumably an index that was saved along with the CSV.
image_summary = pd.read_csv('surfline_photo_data_description.csv').drop(columns=["Unnamed: 0"])

In [10]:
# Preview the photo metadata table (pandas elides the middle rows in the output).
image_summary

Unnamed: 0,photo,Url,Label,LabelConfidence,year,month,day,hour,site,ext,t,tt,time,time_pst
0,2023_05-18.0730_agatebeachor.jpg,AmlDatastore://workspaceblobstore/UI/2023-07-1...,not_foggy,1,2023,5,18,7,agatebeachor,jpg,,,2023-05-18 07:00:00,2023-05-18 05:00:00-07:00
1,2023_05-03.1930_agatebeachor.jpg,AmlDatastore://workspaceblobstore/UI/2023-07-1...,not_foggy,1,2023,5,3,19,agatebeachor,jpg,,,2023-05-03 19:00:00,2023-05-03 17:00:00-07:00
2,2023_05-18.1630_agatebeachor.jpg,AmlDatastore://workspaceblobstore/UI/2023-07-1...,not_foggy,1,2023,5,18,16,agatebeachor,jpg,,,2023-05-18 16:00:00,2023-05-18 14:00:00-07:00
3,2023_05-18.2030_agatebeachor.jpg,AmlDatastore://workspaceblobstore/UI/2023-07-1...,not_foggy,1,2023,5,18,20,agatebeachor,jpg,,,2023-05-18 20:00:00,2023-05-18 18:00:00-07:00
4,2023_05-14.2330_agatebeachor.jpg,AmlDatastore://workspaceblobstore/UI/2023-07-1...,uncertain,1,2023,5,14,23,agatebeachor,jpg,,,2023-05-14 23:00:00,2023-05-14 21:00:00-07:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8932,2022_09-28.2100_otterrockor.jpg,AmlDatastore://workspaceblobstore/UI/2023-10-2...,foggy,1,2022,9,28,21,otterrockor,jpg,,,2022-09-28 21:00:00,2022-09-28 19:00:00-07:00
8933,2023_08-20.1400_otterrockor.jpg,AmlDatastore://workspaceblobstore/UI/2023-10-2...,not_foggy,1,2023,8,20,14,otterrockor,jpg,,,2023-08-20 14:00:00,2023-08-20 12:00:00-07:00
8934,2022_08-18.1900_agatebeachor.jpg,AmlDatastore://workspaceblobstore/UI/2023-10-2...,foggy,1,2022,8,18,19,agatebeachor,jpg,,,2022-08-18 19:00:00,2022-08-18 17:00:00-07:00
8935,2023_06-14.1130_cannonbeach.jpg,AmlDatastore://workspaceblobstore/UI/2023-10-2...,not_foggy,1,2023,6,14,11,cannonbeach,jpg,,,2023-06-14 11:00:00,2023-06-14 09:00:00-07:00


In [16]:
# Keep only photos with a definite label (rows such as 'uncertain' are dropped)
# and renumber the rows from zero.
image_summary = (
    image_summary[image_summary['Label'].isin(['not_foggy', 'foggy'])]
    .reset_index(drop=True)
)
# Binary training target: 1 for foggy, 0 for not_foggy.
image_summary['Label_train'] = image_summary['Label'].eq('foggy').astype(int)


In [24]:
# Class counts for the binary target (0 = not_foggy, 1 = foggy).
image_summary['Label_train'].value_counts()

0    6853
1    1776
Name: Label_train, dtype: int64

In [19]:
# Train / test / validation split
X = np.array(image_summary.photo)
y = np.array(image_summary.Label_train)

# balance_dataset samples from NumPy's global random state. Seed it so the
# same photos are selected on every run; without this the balanced subset
# (and therefore every split below) changes each time the cell is executed.
np.random.seed(42)
X_balanced , y_balanced = balance_dataset(X,y)

# 80% of the balanced data for training, 20% held out (stratified by label)
X_train, X_test_validate, y_train, y_test_validate = train_test_split(
        X_balanced, y_balanced, test_size=0.20, random_state=42, stratify = y_balanced)

# Split the held-out 20% again: 75% of it becomes the test set and 25% the
# validation set (roughly 15% and 5% of the balanced data respectively).
X_test, X_validate, y_test, y_validate = train_test_split(
        X_test_validate, y_test_validate, test_size=0.25, random_state=42, stratify = y_test_validate)


[1776 1776]


In [25]:
# Sanity check by hand: the balanced set (1776 per class) and the three split
# sizes (train + test + validation) should add up to the same total.
# Only the last expression is displayed as the cell output.
1776+1776
2841+533+178

3552

In [22]:
# Total number of samples kept after balancing the two classes.
len(y_balanced)

3552

In [20]:
# Report the train / test / validation sizes; the X and y rows must match.
print('Xs', ' ', *(len(part) for part in (X_train, X_test, X_validate)))
print('ys', ' ', *(len(part) for part in (y_train, y_test, y_validate)))

Xs   2841 533 178
ys   2841 533 178


In [29]:
def copy_split_images(photo_names, labels, split, source_dir=None, dest_root='cnn_images'):
    """Copy one split's photos into ``<dest_root>/<split>/<class>/``.

    Parameters
    ----------
    photo_names : sequence of str
        File names of the photos, relative to ``source_dir``.
    labels : sequence of int
        Labels aligned with ``photo_names``; 1 means foggy, anything else is
        treated as not_foggy.
    split : str
        Name of the split sub-folder, e.g. ``'train'`` or ``'test'``.
    source_dir : str, optional
        Folder containing the original photos. Defaults to the notebook-level
        ``image_dir``.
    dest_root : str, optional
        Root folder of the per-split / per-class directory tree.

    Notes
    -----
    Files left over from earlier runs are not removed. If the split changes
    between runs, stale copies can leave the same photo in more than one
    split — clear ``dest_root`` first when re-splitting.
    """
    if source_dir is None:
        source_dir = image_dir

    # Create the class folders up front; shutil.copy does not create missing
    # directories and would fail on a fresh checkout.
    for class_name in ('foggy', 'not_foggy'):
        os.makedirs(os.path.join(dest_root, split, class_name), exist_ok=True)

    for photo_name, label in zip(photo_names, labels):
        class_name = 'foggy' if label == 1 else 'not_foggy'
        shutil.copy(
            os.path.join(source_dir, photo_name),
            os.path.join(dest_root, split, class_name, photo_name),
        )


copy_split_images(X_train, y_train, 'train')
copy_split_images(X_test, y_test, 'test')


In [31]:

# Copy the validation photos into cnn_images/validation/<class>/.
validation_root = os.path.join('cnn_images', 'validation')

# Create the class folders first; shutil.copy does not create missing
# directories and would fail on a fresh checkout.
for class_name in ('foggy', 'not_foggy'):
    os.makedirs(os.path.join(validation_root, class_name), exist_ok=True)

# Label 1 means foggy; everything else goes to not_foggy.
for photo_name, label in zip(X_validate, y_validate):
    class_name = 'foggy' if label == 1 else 'not_foggy'
    shutil.copy(
        os.path.join(image_dir, photo_name),
        os.path.join(validation_root, class_name, photo_name),
    )
