# In [None]:
import numpy as np
import pandas as pd
import cv2
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
from tensorflow.keras.applications import InceptionV3
from tensorflow.keras.applications.inception_v3 import preprocess_input
from tensorflow.keras.preprocessing import image
from xgboost import XGBClassifier
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Load InceptionV3 without its classification head (include_top=False).
# pooling='avg' applies global average pooling to the last convolutional
# output, so the model returns one flat feature vector per input image.
# The weights are read from a local .h5 file path instead of being fetched
# by name.
weights = '/kaggle/input/inceptionv3/inception_v3_weights_tf_dim_ordering_tf_kernels_notop.h5'
base_model = InceptionV3(weights=weights, include_top=False, pooling='avg')

# CutMix Augmentation
def cutmix(image, image2, alpha=1.0):
    """Paste a random rectangular patch of ``image2`` onto a copy of ``image``.

    A mixing ratio is sampled from Beta(alpha, alpha); the patch side lengths
    are ``sqrt(1 - ratio)`` times the image width/height, its centre is chosen
    uniformly at random, and the box is clipped to the image boundaries.

    Args:
        image: Base image as a NumPy array of shape (H, W) or (H, W, C).
        image2: Image supplying the patch. It is resized to (H, W) when its
            height or width differs from ``image``; it must have the same
            channel layout as ``image``.
        alpha: Parameter of the symmetric Beta distribution.

    Returns:
        A tuple ``(mixed_image, lam)``. ``lam`` is the fraction of pixels in
        ``mixed_image`` that still come from ``image``. When the clipped box is
        empty, ``image`` itself is returned unchanged and ``lam`` is 1.0.
    """
    h, w = image.shape[:2]

    # Bring the donor image to the same spatial size so the box indexes both.
    if image2.shape[:2] != (h, w):
        image2 = cv2.resize(image2, (w, h))

    sampled_lam = np.random.beta(alpha, alpha)

    cut_rat = np.sqrt(1.0 - sampled_lam)
    cut_w = int(w * cut_rat)
    cut_h = int(h * cut_rat)

    # Centre of the bounding box
    cx = np.random.randint(w)
    cy = np.random.randint(h)

    # Clip to the image and convert to plain ints for slicing/arithmetic.
    bbx1 = int(np.clip(cx - cut_w // 2, 0, w))
    bby1 = int(np.clip(cy - cut_h // 2, 0, h))
    bbx2 = int(np.clip(cx + cut_w // 2, 0, w))
    bby2 = int(np.clip(cy + cut_h // 2, 0, h))

    # Empty box: nothing is pasted, so the image is 100% original. Returning
    # the sampled ratio here would misreport the mix.
    if bbx1 >= bbx2 or bby1 >= bby2:
        return image, 1.0

    new_image = image.copy()
    new_image[bby1:bby2, bbx1:bbx2] = image2[bby1:bby2, bbx1:bbx2]

    # Recompute the ratio from the clipped box, which may be smaller than the
    # sampled one.
    lam = 1.0 - ((bbx2 - bbx1) * (bby2 - bby1)) / (h * w)

    return new_image, lam

# Function to extract features using InceptionV3
def extract_features(img_path, model, img_size=(299, 299)):
    """Run one image file through ``model`` and return its output flattened.

    The image is loaded at ``img_size``, converted to an array, given a
    leading batch axis, passed through ``preprocess_input`` and then through
    ``model.predict``.

    Args:
        img_path: Path of the image file to load.
        model: Object exposing ``predict`` (the feature extractor).
        img_size: Target size handed to the image loader.

    Returns:
        The prediction for the single-image batch as a 1-D array.
    """
    loaded = image.load_img(img_path, target_size=img_size)
    # Prepend the batch dimension expected by predict().
    batch = image.img_to_array(loaded)[np.newaxis, ...]
    batch = preprocess_input(batch)
    return model.predict(batch).flatten()

# Function to load and extract features
def load_and_extract_features(csv_path, images_folder, model, apply_cutmix=False):
    """Extract one feature vector per row of a labelled CSV.

    The CSV must provide an ``id_code`` column (image file stem) and a
    ``diagnosis`` column (label). Images are read from
    ``{images_folder}/{id_code}.png``.

    Args:
        csv_path: Path of the CSV listing images and labels.
        images_folder: Folder containing the ``.png`` images.
        model: Feature extractor exposing ``predict``.
        apply_cutmix: When True, each image gets a patch from a randomly
            chosen row of the same CSV (possibly itself) pasted in before
            feature extraction.

    Returns:
        ``(features, labels)`` as NumPy arrays, one entry per CSV row.

    Raises:
        FileNotFoundError: If an image needed for CutMix cannot be read.

    NOTE(review): with CutMix the label stays that of the base image and the
    mixing ratio is discarded, so a heavily mixed image keeps its original
    label. Confirm this is intended.
    """
    data = pd.read_csv(csv_path)
    features = []
    labels = []

    for _, row in data.iterrows():
        image_name = row['id_code']
        label = row['diagnosis']
        image_path = f"{images_folder}/{image_name}.png"

        if apply_cutmix:
            # cv2.imread returns None instead of raising on a missing or
            # unreadable file; fail here with a clear message.
            img = cv2.imread(image_path)
            if img is None:
                raise FileNotFoundError(f"Could not read image: {image_path}")

            rand_index = np.random.choice(data.index)
            random_image_name = data.loc[rand_index, 'id_code']
            random_image_path = f"{images_folder}/{random_image_name}.png"
            img2 = cv2.imread(random_image_path)
            if img2 is None:
                raise FileNotFoundError(f"Could not read image: {random_image_path}")

            cutmix_img, _ = cutmix(img, img2)
            # cv2 loads images as BGR, whereas extract_features() loads RGB.
            # Convert so both paths feed the network the same channel order.
            cutmix_img = cv2.cvtColor(cutmix_img, cv2.COLOR_BGR2RGB)
            cutmix_img = cv2.resize(cutmix_img, (299, 299))
            img_array = np.expand_dims(cutmix_img, axis=0)
            img_array = preprocess_input(img_array)
            feature = model.predict(img_array).flatten()
        else:
            feature = extract_features(image_path, model)

        features.append(feature)
        labels.append(label)

    return np.array(features), np.array(labels)

# Function for K-fold validation
def cross_validate_model(X, y, model, n_splits=3):
    """Score ``model`` with shuffled K-fold cross-validation.

    For every fold the model is fitted on the training rows and its accuracy
    on the held-out rows is printed. The mean accuracy is printed at the end.

    Args:
        X: Feature array, indexable by integer index arrays.
        y: Label array aligned with ``X``.
        model: Estimator exposing ``fit`` and ``predict``.
        n_splits: Number of folds.

    Returns:
        The mean accuracy across folds.
    """
    splitter = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    accuracy_scores = []

    for fit_rows, held_out_rows in splitter.split(X):
        model.fit(X[fit_rows], y[fit_rows])
        fold_score = accuracy_score(y[held_out_rows], model.predict(X[held_out_rows]))
        accuracy_scores.append(fold_score)
        print(f"Fold Accuracy: {fold_score * 100:.2f}%")

    mean_score = np.mean(accuracy_scores)
    print(f"Average Cross-Validation Accuracy: {mean_score * 100:.2f}%")
    return mean_score

# Function to load test data and extract features using the base model
def load_test_data_and_extract_features(csv_path, images_folder, model):
    """Extract one feature vector per row of an unlabelled CSV.

    Each row's ``id_code`` names the image ``{images_folder}/{id_code}.png``,
    which is passed to ``extract_features``.

    Args:
        csv_path: Path of the CSV listing the images.
        images_folder: Folder containing the ``.png`` images.
        model: Feature extractor forwarded to ``extract_features``.

    Returns:
        ``(features, image_names)``: a NumPy array of feature vectors and the
        list of ``id_code`` values in CSV order.
    """
    table = pd.read_csv(csv_path)
    image_names = []
    feature_rows = []

    for _, record in table.iterrows():
        name = record['id_code']  # Adjust this if your CSV uses a different column name
        image_names.append(name)
        feature_rows.append(extract_features(f"{images_folder}/{name}.png", model))

    return np.array(feature_rows), image_names

# Function to generate predictions and save them to CSV
def predict_and_generate_csv(model, test_features, image_names, output_csv_path):
    """Predict labels for ``test_features`` and write them to a CSV file.

    The CSV has two columns, ``id_code`` and ``diagnosis``, and no index.

    Args:
        model: Fitted estimator exposing ``predict``.
        test_features: Feature array passed to ``model.predict``.
        image_names: Identifiers aligned with ``test_features``.
        output_csv_path: Destination of the CSV. If it cannot be written
            (for example the directory is missing or read-only), the file is
            written to ``submission.csv`` in the current directory instead so
            the predictions are not lost.
    """
    predictions = model.predict(test_features)
    output_df = pd.DataFrame({
        'id_code': image_names,
        'diagnosis': predictions
    })

    fallback_path = 'submission.csv'
    try:
        output_df.to_csv(output_csv_path, index=False)
    except OSError as err:
        print(f"Could not write {output_csv_path} ({err}); falling back to {fallback_path}")
        output_df.to_csv(fallback_path, index=False)
        written_path = fallback_path
    else:
        written_path = output_csv_path

    # Report the path that was actually written, not merely the requested one.
    print(f"Predictions saved to {written_path}")

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score

# Main Program
if __name__ == "__main__":
    # Paths
    train_csv_path = '/kaggle/input/aptos2019-blindness-detection/train.csv'  # CSV with image names and labels
    train_images_folder = '/kaggle/input/aptos2019-blindness-detection/train_images'  # Folder with training images
    test_csv_path = '/kaggle/input/aptos2019-blindness-detection/test.csv'  # CSV with test image names
    test_images_folder = '/kaggle/input/aptos2019-blindness-detection/test_images'  # Folder with test images
    # Write the predictions to the working directory: the /kaggle/input tree
    # holds the read-only competition data and is not a valid output location.
    output_csv_path = 'submission.csv'  # Output CSV file for predictions

    # 1. Load and extract features from the training data with CutMix augmentation
    print("Loading and extracting features from training data with CutMix...")
    X, y = load_and_extract_features(train_csv_path, train_images_folder, base_model, apply_cutmix=True)

    # 2. Encode labels to numeric values
    from sklearn.preprocessing import LabelEncoder
    label_encoder = LabelEncoder()
    y_encoded = label_encoder.fit_transform(y)

    # 3. Perform stratified K-fold cross-validation (n_folds folds).
    # NOTE(review): the validation rows are CutMix-augmented too, so these
    # scores describe augmented images rather than clean ones.
    n_folds = 5
    skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)

    # One set of hyper-parameters shared by the per-fold and final models.
    xgb_params = dict(n_estimators=100, random_state=42, use_label_encoder=False, eval_metric='mlogloss')

    fold_accuracies = []

    for fold, (train_idx, val_idx) in enumerate(skf.split(X, y_encoded)):
        print(f"Training fold {fold + 1}/{n_folds}...")

        X_train, X_val = X[train_idx], X[val_idx]
        y_train, y_val = y_encoded[train_idx], y_encoded[val_idx]

        ### XGBoost Classifier ###
        print("Training XGBoost model...")
        fold_model = XGBClassifier(**xgb_params)
        fold_model.fit(X_train, y_train)

        # Validate the model on this fold
        y_val_pred = fold_model.predict(X_val)
        accuracy_xgb = accuracy_score(y_val, y_val_pred)
        fold_accuracies.append(accuracy_xgb)

        print(f"Fold {fold + 1} Accuracy: {accuracy_xgb * 100:.2f}%")

    # Print overall average accuracy across folds
    avg_accuracy = np.mean(fold_accuracies)
    print(f"Average Cross-Validation Accuracy: {avg_accuracy * 100:.2f}%")

    # 4. Fit the final model on the full training set. The fold models each
    # saw only (n_folds - 1) / n_folds of the data and exist for scoring only.
    print("Training final XGBoost model on the full training set...")
    xgb_model = XGBClassifier(**xgb_params)
    xgb_model.fit(X, y_encoded)

    # 5. Load and extract features from the test data
    print("Loading and extracting features from test data...")
    test_features, image_names = load_test_data_and_extract_features(test_csv_path, test_images_folder, base_model)

    # 6. Predict test labels and generate CSV output
    # NOTE(review): predictions are LabelEncoder class indices; they match the
    # original diagnosis values only if those are already 0..K-1 integers.
    # Otherwise apply label_encoder.inverse_transform. Confirm against the data.
    print("Generating predictions and saving to CSV...")
    predict_and_generate_csv(xgb_model, test_features, image_names, output_csv_path)