<a href="https://colab.research.google.com/github/singaravelan/Loan-Prediction/blob/master/AVHack.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

CHUNK_SIZE = 40960
DATA_SOURCE_MAPPING = 'avdataset:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F4395885%2F7547843%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240208%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240208T153028Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D7ffaa50cd021959072afce4b71f12e71558a338382ae0f14f09b686066170797ca642e2875abf35c316b33c5d7507dd713de692e0e0527da8fa30096e9ac785c6869144d1947408b215a2ae7c30589d0e22f822f8e3458c67da91880f3aae7ac99d8f4812c26e8f12584a55d9a2062217163af5fea4417a6154ff797ff07a8fc54101ad477c9ba6301d0126b85e06423c8709924b99b30a639d7a3269b2ac915444dc1cf390e19bea7ac9872c9246f5d2ad655f230221a66120dbf02784b1d67ec444b30eeaead5f4dc826e1e96b00faf39416381f54a3fbd1c8217eae34131e56793ad14fb21b7603aa8f82de5d987d0b7ce379a586314899cce3f5d7c64eea'

KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download every configured data source and unpack it under KAGGLE_INPUT_PATH.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Each entry is "<directory>:<url-encoded download URL>". Split on the first
    # colon only so the entry still parses if the URL part ever contains one.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            # Content-Length may be absent; in that case the progress bar
            # cannot be scaled and only the running byte count is meaningful.
            content_length = fileres.headers['content-length']
            total_length = int(content_length) if content_length else 0
            print(f'Downloading {directory}, {content_length} bytes compressed')
            dl = 0
            while True:
                data = fileres.read(CHUNK_SIZE)
                if not data:
                    break
                dl += len(data)
                tfile.write(data)
                done = min(50, int(50 * dl / total_length)) if total_length else 0
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()

            # Make sure buffered bytes are on disk and rewind before the
            # archive readers consume the temporary file.
            tfile.flush()
            tfile.seek(0)
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Bind the archive to its own name: reusing `tarfile` here
                # would replace the module and break the next iteration.
                with tarfile.open(fileobj=tfile) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        # HTTPError is an OSError subclass, so it must be handled first.
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')

print('Data source import complete.')


In [None]:
import tensorflow as tf
import pandas as pd
from sklearn.model_selection import train_test_split
import keras
import numpy as np

In [None]:
class CFG:
    """Namespace holding the notebook's configuration values as class attributes."""

    # General
    verbose = 1  # Verbosity
    seed = 42  # Random seed

    # Model / input
    preset = "efficientnetv2_b2_imagenet"  # Name of pretrained classifier
    image_size = [256, 256]  # Input image size

    # Training loop
    epochs = 20  # Training epochs
    batch_size = 32  # Batch size
    lr_mode = "cos"  # LR scheduler mode from one of "cos", "step", "exp"
    drop_remainder = True  # Drop incomplete batches

    # Data
    num_classes = 2  # Number of classes in the dataset
    fold = 0  # Which fold to set as validation data
    class_names = [0, 1]

    # Lookup tables between label index and class name (and back).
    label2name = {index: name for index, name in enumerate(class_names)}
    name2label = dict(zip(label2name.values(), label2name.keys()))

In [None]:
import os
import pandas as pd

def remove_nonexistent_files(df, image_folder_column='filename', image_root_folder='.'):
    """
    Return a copy of the DataFrame without the rows whose file does not exist.

    Parameters:
    - df: DataFrame containing the filenames to be checked. It is not modified.
    - image_folder_column: Name of the column containing filenames.
    - image_root_folder: Root folder the filenames are joined onto.

    Returns:
    - A new DataFrame (independent copy, original index and columns kept)
      holding only the rows whose file exists. Rows whose filename is not a
      string or path object (e.g. missing values) are treated as nonexistent.
    """
    def file_exists(filename):
        # A missing value cannot name a file; os.path.join would raise on it.
        if not isinstance(filename, (str, os.PathLike)):
            return False
        return os.path.exists(os.path.join(image_root_folder, filename))

    # Force a boolean dtype: on an empty column the mapped result is not
    # boolean and would otherwise not be treated as a row mask.
    mask = df[image_folder_column].map(file_exists).astype(bool)

    # Copy so callers can add or overwrite columns without touching `df`.
    return df.loc[mask].copy()


In [None]:
import os.path
from typing import Dict, Tuple

import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split

def get_train_val_datasets(
    batch_size: int = 32,
    image_size: Tuple[int, int] = (256, 256),
    validation_split: float = 0.2,
    data_dir: str = "/kaggle/input/avdataset/train",
    seed: int = 42,
) -> Tuple[tf.data.Dataset, tf.data.Dataset]:
    """Build the training and validation datasets from ``train.csv``.

    Parameters:
    - batch_size: Number of samples per batch in both datasets.
    - image_size: Size every image is resized to.
    - validation_split: Fraction of the rows held out for validation.
    - data_dir: Folder containing ``train.csv`` and the ``images`` sub-folder.
    - seed: Random state used for the train/validation split.

    Returns:
    - ``(train_dataset, val_dataset)``; only the training dataset is augmented.

    Raises:
    - ValueError: if no row of the CSV refers to an existing image file.

    NOTE(review): existence is checked on the ``filename`` column while the
    image paths are built from ``image_id`` -- confirm both columns exist in
    the CSV and describe the same files.
    """
    csv_path = os.path.join(data_dir, "train.csv")
    images_path = os.path.join(data_dir, "images")

    # Read the CSV file and drop the rows whose image is missing on disk
    data = remove_nonexistent_files(pd.read_csv(csv_path), 'filename', images_path)
    if data.empty:
        raise ValueError(
            f"None of the images listed in {csv_path} were found under {images_path}"
        )

    # Build the full image paths without modifying the filtered frame
    image_paths = (
        data["image_id"]
        .map(lambda image_id: os.path.join(images_path, f"{image_id}.jpg"))
        .to_numpy()
    )

    # Map class labels to integers. Sorting keeps the mapping identical from
    # run to run (a plain set has no guaranteed order), so integer labels
    # 0 and 1 always map to themselves.
    class_name_to_label: Dict[str, int] = {
        label: index for index, label in enumerate(sorted(data["label"].unique()))
    }
    labels_np = data["label"].map(class_name_to_label).to_numpy(dtype="uint8")

    # Split the dataset into training and validation sets
    train_data, val_data, train_labels, val_labels = train_test_split(
        image_paths, labels_np, test_size=validation_split, random_state=seed
    )

    # Create TensorFlow datasets for training and validation
    train_dataset = create_dataset(train_data, train_labels, image_size, batch_size, augment=True)
    val_dataset = create_dataset(val_data, val_labels, image_size, batch_size)

    return train_dataset, val_dataset

def create_dataset(filenames, labels, image_size, batch_size, augment=False):
    """Create a batched ``tf.data.Dataset`` of ``(image, label)`` pairs.

    Parameters:
    - filenames: Paths of the JPEG files to load.
    - labels: One label per filename, in the same order.
    - image_size: Size every decoded image is resized to.
    - batch_size: Number of samples per batch.
    - augment: When True, apply random horizontal and vertical flips.

    Returns:
    - Dataset yielding batches of resized 3-channel images with their labels.
    """
    # Create a TensorFlow dataset from filenames and labels
    dataset = tf.data.Dataset.from_tensor_slices((filenames, labels))

    # Define a parsing function to read and preprocess images
    def _parse_function(filename, label):
        # Force 3 channels so grayscale files still produce images of the
        # same shape as the rest of the batch.
        jpg_image = tf.io.decode_jpeg(tf.io.read_file(filename), channels=3)

        # Image augmentation (if specified)
        if augment:
            jpg_image = tf.image.random_flip_left_right(jpg_image)
            jpg_image = tf.image.random_flip_up_down(jpg_image)
            # Add more augmentations as needed

        return tf.image.resize(jpg_image, size=image_size), label

    # Decode in parallel; element order is preserved by default.
    dataset = dataset.map(_parse_function, num_parallel_calls=tf.data.AUTOTUNE)

    # Batch the dataset and overlap input preparation with consumption
    return dataset.batch(batch_size).prefetch(tf.data.AUTOTUNE)

# Build the training and validation pipelines from the notebook configuration,
# holding out 10% of the rows for validation.

validation_split = 0.1

train_dataset, val_dataset = get_train_val_datasets(
    validation_split=validation_split,
    image_size=CFG.image_size,
    batch_size=CFG.batch_size,
)


In [None]:
from collections import Counter

# Class balance of the FIRST validation batch only (a quick sanity check, not
# the distribution of the whole validation set).
iterator = iter(val_dataset)
# Each batch is an (images, labels) pair
images, labels = next(iterator)

# Convert the label tensor to plain Python numbers before counting; calling
# str() on a tensor element would produce its full "tf.Tensor(...)" repr.
label_counts = Counter(str(label) for label in labels.numpy().tolist())

total_samples = len(labels)
label_percentages = {label: count / total_samples * 100 for label, count in label_counts.items()}

print("Label Percentages:")
for label, percentage in label_percentages.items():
    print(f"Label {label}: {percentage:.2f}%")


In [None]:
import matplotlib.pyplot as plt

def plot_images(dataset, num_images=9):
    """Show up to ``num_images`` images of the first batch with their labels.

    Parameters:
    - dataset: Dataset yielding ``(images, labels)`` batches.
    - num_images: Maximum number of images to draw; capped at the batch size.
    """
    # Get the first batch of images and labels
    images, labels = next(iter(dataset))
    count = min(num_images, len(images))

    # Use a 3x3 grid by default and grow it to the smallest square that holds
    # `count` images, so asking for more than 9 images no longer fails.
    grid = 3
    while grid * grid < count:
        grid += 1

    plt.figure(figsize=(5, 5))
    for i in range(count):
        plt.subplot(grid, grid, i + 1)
        plt.imshow(images[i].numpy().astype("uint8"))
        plt.title(f"Label: {labels[i].numpy()}")
        plt.axis("off")

    plt.show()

# Plot images from the training dataset
plot_images(train_dataset)


In [None]:
import os
import tensorflow as tf
import pandas as pd

def create_test_dataset(test_data, image_folder, image_size, batch_size):
    """Create a batched, unlabeled ``tf.data.Dataset`` of test images.

    Parameters:
    - test_data: DataFrame with an ``image_id`` column. It is not modified.
    - image_folder: Folder holding the ``<image_id>.jpg`` files.
    - image_size: Size every decoded image is resized to.
    - batch_size: Number of images per batch.

    Returns:
    - Dataset yielding batches of resized 3-channel images, in the row order
      of ``test_data``.
    """
    # Work on a string copy of the ids instead of overwriting the caller's column
    image_ids = test_data["image_id"].astype(str).tolist()

    filenames: tf.Tensor = tf.constant(image_ids, dtype=tf.string)
    file_paths = tf.strings.join([image_folder, "/", filenames, ".jpg"])

    dataset = tf.data.Dataset.from_tensor_slices(file_paths)

    def _parse_function(filename):
        # Force 3 channels so grayscale files still produce images of the
        # same shape as the rest of the batch.
        jpg_image: tf.Tensor = tf.io.decode_jpeg(tf.io.read_file(filename), channels=3)
        return tf.image.resize(jpg_image, size=image_size)

    # Decode in parallel; element order is preserved by default, which keeps
    # predictions aligned with the rows of `test_data`.
    dataset = dataset.map(_parse_function, num_parallel_calls=tf.data.AUTOTUNE)
    return dataset.batch(batch_size).prefetch(tf.data.AUTOTUNE)

# Build the test pipeline.
test_data = pd.read_csv("/kaggle/input/avdataset/test/test.csv")  # Assuming your test CSV has a column 'image_id'
image_folder = "/kaggle/input/avdataset/test/images"  # Adjust this based on your actual test image folder

# Take the input size and batch size from CFG so the test images always match
# the configuration used elsewhere in the notebook instead of duplicating the
# literals here.
test_image_size = tuple(CFG.image_size)
test_batch_size = CFG.batch_size

test_dataset = create_test_dataset(test_data, image_folder, test_image_size, test_batch_size)


In [None]:
def plot_images_1(dataset, num_images=9):
    """Show up to ``num_images`` images of the first batch of an unlabeled dataset.

    Parameters:
    - dataset: Dataset yielding batches of images only.
    - num_images: Maximum number of images to draw; capped at the batch size.
    """
    # Get the first batch of images
    images = next(iter(dataset))
    count = min(num_images, len(images))

    # Use a 3x3 grid by default and grow it to the smallest square that holds
    # `count` images, so asking for more than 9 images no longer fails.
    grid = 3
    while grid * grid < count:
        grid += 1

    plt.figure(figsize=(5, 5))
    for i in range(count):
        plt.subplot(grid, grid, i + 1)
        plt.imshow(images[i].numpy().astype("uint8"))
        plt.title("Label: ")
        plt.axis("off")

    plt.show()


plot_images_1(test_dataset)

In [None]:
import tensorflow as tf
from tensorflow.keras.applications import EfficientNetB2
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import GlobalAveragePooling2D, Dense, Dropout

# ImageNet-pretrained EfficientNetB2 without its classification head, sized
# for the configured input images.
base_model = EfficientNetB2(
    include_top=False,
    weights='imagenet',
    input_shape=(CFG.image_size[0], CFG.image_size[1], 3),
)

# Freeze every backbone layer so only the layers added below are trained.
for layer in base_model.layers:
    layer.trainable = False

# Backbone -> global average pooling -> dropout (rate 0.2) -> single sigmoid
# unit for binary classification.
model = Sequential([
    base_model,
    GlobalAveragePooling2D(),
    Dropout(0.2),
    Dense(1, activation='sigmoid'),
])

# Display the model summary
model.summary()


In [None]:
from tensorflow.keras.optimizers import Adam
# Configure training: Adam with its default settings, binary cross-entropy
# loss, and accuracy as the reported metric.
model.compile(
    loss='binary_crossentropy',
    optimizer=Adam(),
    metrics=['accuracy'],
)

In [None]:
import math

def get_lr_callback(batch_size=8, mode='cos', epochs=CFG.epochs, plot=False):
    """Build a Keras ``LearningRateScheduler`` with linear warm-up and decay.

    The rate ramps linearly from ``5e-5`` to ``6e-6 * batch_size`` over the
    first 3 epochs and then decays according to ``mode``.

    Parameters:
    - batch_size: Scales the peak learning rate.
    - mode: Decay shape after warm-up: "cos", "step" or "exp".
    - epochs: Number of epochs; sets the cosine decay length and the plot range.
    - plot: When True, plot the schedule for ``epochs`` epochs.

    Returns:
    - ``keras.callbacks.LearningRateScheduler`` wrapping the schedule.

    Raises:
    - ValueError: if ``mode`` is not one of the supported values.
    """
    # Fail early: an unknown mode would otherwise only surface after warm-up,
    # as an unbound local inside the schedule function.
    valid_modes = ('cos', 'step', 'exp')
    if mode not in valid_modes:
        raise ValueError(f"mode must be one of {valid_modes}, got {mode!r}")

    lr_start, lr_max, lr_min = 5e-5, 6e-6 * batch_size, 1e-5
    lr_ramp_ep, lr_sus_ep, lr_decay = 3, 0, 0.75

    def lrfn(epoch):  # Learning rate update function
        if epoch < lr_ramp_ep:  # linear warm-up
            return (lr_max - lr_start) / lr_ramp_ep * epoch + lr_start
        if epoch < lr_ramp_ep + lr_sus_ep:  # hold at the peak
            return lr_max

        decay_epoch_index = epoch - lr_ramp_ep - lr_sus_ep
        if mode == 'exp':
            return (lr_max - lr_min) * lr_decay**decay_epoch_index + lr_min
        if mode == 'step':
            return lr_max * lr_decay**(decay_epoch_index // 2)

        # mode == 'cos': half cosine from lr_max towards lr_min
        decay_total_epochs = epochs - lr_ramp_ep - lr_sus_ep + 3
        phase = math.pi * decay_epoch_index / decay_total_epochs
        return (lr_max - lr_min) * 0.5 * (1 + math.cos(phase)) + lr_min

    if plot:  # Plot lr curve if plot is True
        plt.figure(figsize=(10, 5))
        plt.plot(np.arange(epochs), [lrfn(epoch) for epoch in np.arange(epochs)], marker='o')
        plt.xlabel('epoch')
        plt.ylabel('lr')
        plt.title('LR Scheduler')
        plt.show()

    return keras.callbacks.LearningRateScheduler(lrfn, verbose=False)

In [None]:
# Learning-rate callback for the configured batch size and decay mode; the
# schedule is plotted as a side effect.
lr_cb = get_lr_callback(batch_size=CFG.batch_size, mode=CFG.lr_mode, plot=True)

In [None]:
# Checkpoint callback: save the full model to best_model.keras whenever the
# validation loss reaches a new minimum.
ckpt_cb = keras.callbacks.ModelCheckpoint(
    filepath="best_model.keras",
    mode='min',
    monitor='val_loss',
    save_weights_only=False,
    save_best_only=True,
)

In [None]:
# Count the examples of each class in the training CSV. Unpacking the bincount
# into exactly two names requires the label column to hold only 0 and 1.
df = pd.read_csv("/kaggle/input/avdataset/train/train.csv")
neg, pos = np.bincount(df['label'])
total = neg + pos

print(
    'Examples:\n    Total: {}\n    Positive: {} ({:.2f}% of total)\n'.format(
        total,
        pos,
        100 * pos / total,
    )
)

In [None]:
# Weight each class inversely to its frequency. Scaling by total/2 helps keep
# the loss to a similar magnitude: the sum of the weights of all examples
# stays the same.
class_weight = {
    0: (1 / neg) * (total / 2.0),
    1: (1 / pos) * (total / 2.0),
}
weight_for_0 = class_weight[0]
weight_for_1 = class_weight[1]

print('Weight for class 0: {:.2f}'.format(weight_for_0))
print('Weight for class 1: {:.2f}'.format(weight_for_1))

In [None]:
import math

# Train the model with class-weighted loss. Note that this may take some time.
# NOTE(review): lr_cb and ckpt_cb are not passed to fit(), so neither the
# learning-rate schedule nor checkpointing is active during training; add
# callbacks=[lr_cb, ckpt_cb] to enable them.
history = model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=CFG.epochs,
    class_weight=class_weight,
    verbose=1,
)

In [None]:
#-----------------------------------------------------------
# Retrieve the per-epoch metrics recorded for the training
# and validation data
#-----------------------------------------------------------
acc = history.history['accuracy']
val_acc = history.history['val_accuracy']
loss = history.history['loss']
val_loss = history.history['val_loss']

epochs = range(len(acc))  # One x position per completed epoch

#------------------------------------------------
# Plot training and validation accuracy per epoch
#------------------------------------------------
# The legend text must be passed as `label=`; a fourth positional argument is
# parsed by plot() as additional data, not as a label.
plt.plot(epochs, acc, 'r', label="Training Accuracy")
plt.plot(epochs, val_acc, 'b', label="Validation Accuracy")
plt.title('Training and validation accuracy')
plt.legend()
plt.show()
print("")

#------------------------------------------------
# Plot training and validation loss per epoch
#------------------------------------------------
plt.plot(epochs, loss, 'r', label="Training Loss")
plt.plot(epochs, val_loss, 'b', label="Validation Loss")
plt.title('Training and validation loss')
plt.legend()
plt.show()

In [None]:
import pandas as pd
import tensorflow as tf

# Uses the trained `model` and the `test_dataset` built by create_test_dataset.

# Predicted probabilities for every test image
predictions_probabilities = model.predict(test_dataset)

# Probabilities above the threshold become 1, the rest 0; the per-sample
# values are flattened into a plain list of ints.
threshold = 0.5
binary_predictions = (
    (predictions_probabilities > threshold).astype(int).ravel().tolist()
)

# Pair each image_id from the test CSV with its predicted label
test_predictions_df = pd.DataFrame(
    {
        'image_id': test_data['image_id'],
        'label': binary_predictions,
    }
)


In [None]:
# Write the predictions to output.csv without the DataFrame index column.
test_predictions_df.to_csv(path_or_buf="output.csv", index=False)