# Extract data


In [None]:
import numpy as np
import pandas as pd
import os
import shutil
import torch
from PIL import Image
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Flat working directories, one per class, that receive the copied .png files.
os.makedirs('./benign', exist_ok=True)
os.makedirs('./malignant', exist_ok=True)
def getListOfFiles(dirName):
    """Recursively collect the paths of every non-directory entry under ``dirName``.

    Entries are visited in ``os.listdir`` order; the contents of a
    sub-directory are spliced in at the point where it was encountered.

    Args:
        dirName: Directory to scan.

    Returns:
        list: Paths produced by joining ``dirName`` with each entry name,
        descending into sub-directories.
    """
    collected = []
    for name in os.listdir(dirName):
        candidate = os.path.join(dirName, name)
        if not os.path.isdir(candidate):
            collected.append(candidate)
            continue
        # Directory: descend and splice its files in at this position.
        collected.extend(getListOfFiles(candidate))
    return collected
# Enumerate every file below the two BreaKHis class folders.
files_benign = getListOfFiles('../input/breakhis/BreaKHis_v1/BreaKHis_v1/histology_slides/breast/benign')
files_malignant = getListOfFiles('../input/breakhis/BreaKHis_v1/BreaKHis_v1/histology_slides/breast/malignant')

# Flatten each class into its working directory, keeping only .png files.
for source_files, destination in ((files_benign, './benign'), (files_malignant, './malignant')):
    for f in source_files:
        if f.endswith('.png'):
            shutil.copy(f, destination)

# Sorted listings of the flattened class folders.
benign_images = sorted(getListOfFiles('./benign'))
malignant_images = sorted(getListOfFiles('./malignant'))

# One row per image: benign paths first (target 0), then malignant (target 1).
image_paths = [*benign_images, *malignant_images]
labels = [0] * len(benign_images) + [1] * len(malignant_images)
data = pd.DataFrame({'image': image_paths, 'target': labels})

# Quick sanity check of the assembled table.
print(f"Total data samples: {data.shape[0]}")
print(data.head())


# Importing Libraries and Data Preparation

In [None]:
import os
import numpy as np
import pandas as pd
from PIL import Image
import torch
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from torchvision import transforms

from transformers import (
    AutoImageProcessor,
    BeitModel,
    Swinv2Model,
    ConvNextModel,
    ViTModel,
    CLIPProcessor,
    CLIPModel,
    AutoModel
)

# Device used for the pretrained backbones: GPU when available, otherwise CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def extract_magnification(file_path):
    """Return the fourth hyphen-separated field of the file's base name.

    Args:
        file_path: Path to an image file; only its base name is inspected.

    Returns:
        The fourth ``-``-delimited token of the base name as a string, or
        ``None`` when the base name has fewer than four such tokens.
    """
    tokens = os.path.basename(file_path).split('-')
    return tokens[3] if len(tokens) >= 4 else None

# Derive a magnification label for every image from its file name.
data['magnification'] = data['image'].apply(extract_magnification)

# Rows for which extract_magnification returned None are discarded.
if data['magnification'].isnull().any():
    print("Some magnification values were not extracted. These rows will be removed.")
    data = data.dropna(subset=['magnification'])

# First split: 70% train+validation / 30% test, stratified on the class label only.
# NOTE(review): the split is done per image; if several images originate from the
# same patient/slide they can end up on both sides of the split — confirm that
# this is acceptable for the reported metrics.
train_val_data, test_data = train_test_split(
    data,
    test_size=0.3,
    random_state=62,
    stratify=data['target']
)
train_val_data = train_val_data.reset_index(drop=True)
test_data = test_data.reset_index(drop=True)

# Second split: 10% of the train+validation portion becomes the validation set.
train_data, val_data = train_test_split(
    train_val_data,
    test_size=0.1,
    random_state=62,
    stratify=train_val_data['target']
)
train_data = train_data.reset_index(drop=True)
val_data = val_data.reset_index(drop=True)


# Data Augmentation and Model Loading

In [None]:
# Random flip, small rotation and slight brightness/contrast jitter.
light_augmentation = transforms.Compose([
    transforms.RandomHorizontalFlip(p=0.5),
    transforms.RandomRotation(degrees=10),
    transforms.ColorJitter(brightness=0.1, contrast=0.1),
])

augmented_dir = './augmented_images'
os.makedirs(augmented_dir, exist_ok=True)

print("Applying data augmentation and saving augmented images...")
augmented_images = []
for idx, row in tqdm(train_data.iterrows(), total=train_data.shape[0]):
    image_path = row['image']
    target = row['target']
    magnification = row['magnification']
    try:
        # One augmented copy per training image, written next to a row-index prefix.
        augmented_image = light_augmentation(Image.open(image_path).convert('RGB'))
        new_path = os.path.join(augmented_dir, f"aug_{idx}_{os.path.basename(image_path)}")
        augmented_image.save(new_path)
    except Exception as e:
        # Best effort: report the failure and move on to the next image.
        print(f"Error processing image {image_path}: {e}")
        continue

    # The copy inherits label and magnification from its source row.
    augmented_images.append({
        'image': new_path,
        'target': target,
        'magnification': magnification
    })

augmented_data = pd.DataFrame(augmented_images)

# Append the augmented rows after the original training rows.
train_data = pd.concat([train_data, augmented_data], ignore_index=True)
print(f"Number of training samples after augmentation: {train_data.shape[0]}")

# Each backbone is loaded together with its own image processor and moved to `device`.
# The processor/model pairs are consumed by extract_features_batch.

# BEiT
beit_processor = AutoImageProcessor.from_pretrained('microsoft/beit-large-patch16-224-pt22k-ft22k')
beit_model = BeitModel.from_pretrained('microsoft/beit-large-patch16-224-pt22k-ft22k').to(device)

# Swin Transformer V2
swin_processor = AutoImageProcessor.from_pretrained('microsoft/swinv2-large-patch4-window12-192-22k')
swin_model = Swinv2Model.from_pretrained('microsoft/swinv2-large-patch4-window12-192-22k').to(device)

# ConvNeXt
convnext_processor = AutoImageProcessor.from_pretrained('facebook/convnext-large-224-22k-1k')
convnext_model = ConvNextModel.from_pretrained('facebook/convnext-large-224-22k-1k').to(device)

# ViT
vit_processor = AutoImageProcessor.from_pretrained('google/vit-large-patch16-224-in21k')
vit_model = ViTModel.from_pretrained('google/vit-large-patch16-224-in21k').to(device)

# CLIP (only its image branch is used later, via get_image_features)
clip_processor = CLIPProcessor.from_pretrained('openai/clip-vit-large-patch14')
clip_model = CLIPModel.from_pretrained('openai/clip-vit-large-patch14').to(device)

# Phikon
phikon_processor = AutoImageProcessor.from_pretrained("owkin/phikon")
phikon_model = AutoModel.from_pretrained("owkin/phikon").to(device)


# Extract features and filter data based on magnification

In [None]:
def extract_features_batch(image_paths):
    """Embed a batch of images with all six backbones and concatenate the results.

    Images that fail to load are reported and skipped. Each remaining image is
    pre-processed separately for every backbone, run through the model under
    ``torch.no_grad()``, and reduced to one vector per image.

    Args:
        image_paths: Sequence of image file paths.

    Returns:
        tuple: ``(features, valid_indices)`` where ``features`` is a NumPy
        array with one row per successfully loaded image (columns ordered
        BEiT, ConvNeXt, Swin, ViT, CLIP, Phikon) and ``valid_indices`` holds
        the positions within ``image_paths`` of those images. When nothing
        could be loaded the result is ``(np.array([]), [])``.
    """
    loaded = []
    valid_indices = []
    for position, path in enumerate(image_paths):
        try:
            picture = Image.open(path).convert('RGB')
        except Exception as e:
            print(f"Error loading image {path}: {e}")
            continue
        loaded.append(picture)
        valid_indices.append(position)
    if len(loaded) == 0:
        return np.array([]), []

    def prepare(processor):
        # Every backbone gets its own pre-processing of the same PIL images.
        return processor(images=loaded, return_tensors="pt", padding=True).to(device)

    beit_inputs = prepare(beit_processor)
    swin_inputs = prepare(swin_processor)
    convnext_inputs = prepare(convnext_processor)
    vit_inputs = prepare(vit_processor)
    clip_inputs = prepare(clip_processor)
    phikon_inputs = prepare(phikon_processor)

    with torch.no_grad():
        beit_hidden = beit_model(**beit_inputs).last_hidden_state
        swin_hidden = swin_model(**swin_inputs).last_hidden_state
        convnext_hidden = convnext_model(**convnext_inputs).last_hidden_state
        vit_hidden = vit_model(**vit_inputs).last_hidden_state
        clip_embedding = clip_model.get_image_features(**clip_inputs)
        phikon_hidden = phikon_model(**phikon_inputs).last_hidden_state

    def pooled(hidden, dims):
        # Average over the given axes and hand the result back as a NumPy array.
        return hidden.mean(dim=dims).cpu().numpy()

    # dim=1 is presumably the token axis of the transformer outputs — TODO confirm.
    beit_features = pooled(beit_hidden, 1)
    swin_features = pooled(swin_hidden, 1)
    # dims 2 and 3 are presumably the spatial axes of the ConvNeXt feature map — TODO confirm.
    convnext_features = pooled(convnext_hidden, [2, 3])
    vit_features = pooled(vit_hidden, 1)
    clip_features = clip_embedding.cpu().numpy()
    phikon_features = pooled(phikon_hidden, 1)

    # Column order matters: downstream scalers/models are fitted on this layout.
    features = np.concatenate(
        [beit_features, convnext_features, swin_features, vit_features, clip_features, phikon_features],
        axis=1
    )
    return features, valid_indices

def process_in_batches(image_paths, batch_size=32):
    """Run ``extract_features_batch`` over ``image_paths`` in consecutive chunks.

    Args:
        image_paths: Sequence of image file paths.
        batch_size: Number of paths handed to ``extract_features_batch`` at once.

    Returns:
        tuple: ``(features, indices)`` where ``features`` stacks the per-batch
        feature arrays along axis 0 (or is ``np.array([])`` when no batch
        produced features) and ``indices`` lists the positions in
        ``image_paths`` of the images that were embedded.
    """
    # Ceiling division, so a trailing partial batch is still processed.
    num_batches = -(-len(image_paths) // batch_size)

    feature_chunks = []
    all_indices = []
    for batch_no in tqdm(range(num_batches), desc="Processing batches"):
        start = batch_no * batch_size
        chunk_features, chunk_valid = extract_features_batch(image_paths[start:start + batch_size])
        if chunk_features.size > 0:
            feature_chunks.append(chunk_features)
            # Translate batch-local positions into positions within image_paths.
            all_indices.extend(start + offset for offset in chunk_valid)

    stacked = np.concatenate(feature_chunks, axis=0) if feature_chunks else np.array([])
    return stacked, all_indices

# Embed every split with the full backbone ensemble, 120 images per batch.
print("Extracting features for training set...")
X_train, train_indices = process_in_batches(train_data['image'].tolist(), batch_size=120)
print("Extracting features for validation set...")
X_val, val_indices = process_in_batches(val_data['image'].tolist(), batch_size=120)
print("Extracting features for test set...")
X_test, test_indices = process_in_batches(test_data['image'].tolist(), batch_size=120)

# Keep only the rows whose image was embedded, so metadata and features stay aligned.
train_data = train_data.iloc[train_indices].reset_index(drop=True)
val_data = val_data.iloc[val_indices].reset_index(drop=True)
test_data = test_data.iloc[test_indices].reset_index(drop=True)

# Label vectors in the same row order as the feature matrices.
y_train, y_val, y_test = (
    frame['target'].values for frame in (train_data, val_data, test_data)
)

# Guard against any feature/label misalignment before going further.
assert X_train.shape[0] == y_train.shape[0], "Mismatch between training features and labels"
assert X_val.shape[0] == y_val.shape[0], "Mismatch between validation features and labels"
assert X_test.shape[0] == y_test.shape[0], "Mismatch between test features and labels"

def filter_all_magnifications(train_data, val_data, test_data, X_train, y_train, X_val, y_val, X_test, y_test, magnifications):
    """Slice features and labels of every split by magnification.

    Args:
        train_data, val_data, test_data: DataFrames with a ``magnification``
            column, row-aligned with the corresponding ``X_*`` / ``y_*`` arrays.
        X_train, X_val, X_test: Feature arrays indexable by a boolean mask.
        y_train, y_val, y_test: Label arrays indexable by a boolean mask.
        magnifications: Iterable of magnification values to select.

    Returns:
        dict: ``{mag: {'X_train', 'y_train', 'X_val', 'y_val', 'X_test',
        'y_test'}}`` holding the rows whose magnification equals ``mag``.
    """
    splits = (
        ('train', train_data, X_train, y_train),
        ('val', val_data, X_val, y_val),
        ('test', test_data, X_test, y_test),
    )
    filtered_data = {}
    for mag in magnifications:
        subset = {}
        for split_name, frame, features, labels in splits:
            # Boolean row mask for this magnification, as a plain NumPy array.
            keep = (frame['magnification'] == mag).values
            subset[f'X_{split_name}'] = features[keep]
            subset[f'y_{split_name}'] = labels[keep]
        filtered_data[mag] = subset
    return filtered_data

magnifications = ['40', '100', '200', '400']

filtered_data = filter_all_magnifications(
    train_data, val_data, test_data, X_train, y_train, X_val, y_val, X_test, y_test, magnifications
)

# Keys of each per-magnification dictionary, in reporting order.
_SPLIT_KEYS = ('X_train', 'y_train', 'X_val', 'y_val', 'X_test', 'y_test')

# Expose every subset under its own module-level name for the training cell.
X_train_40, y_train_40, X_val_40, y_val_40, X_test_40, y_test_40 = (
    filtered_data['40'][key] for key in _SPLIT_KEYS
)
X_train_100, y_train_100, X_val_100, y_val_100, X_test_100, y_test_100 = (
    filtered_data['100'][key] for key in _SPLIT_KEYS
)
X_train_200, y_train_200, X_val_200, y_val_200, X_test_200, y_test_200 = (
    filtered_data['200'][key] for key in _SPLIT_KEYS
)
X_train_400, y_train_400, X_val_400, y_val_400, X_test_400, y_test_400 = (
    filtered_data['400'][key] for key in _SPLIT_KEYS
)

# Report the size of every subset; groups are separated by a blank line.
for mag in magnifications:
    for key in _SPLIT_KEYS:
        gap = "\n" if key == _SPLIT_KEYS[-1] and mag != magnifications[-1] else ""
        print(f"Number of {key}_{mag}: {len(filtered_data[mag][key])}{gap}")


# Training Models for Different Magnifications

In [None]:
# Mini-batch size passed to model.fit in train_model; it is also embedded in the
# file names of the saved models.
batch = 16
# Paths of the models written by train_model, collected so they can be zipped later.
saves = []
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.callbacks import ModelCheckpoint, ReduceLROnPlateau
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
import numpy as np
import os
import zipfile


def train_model(X_train, y_train, X_val, y_val, X_test, y_test, mag, save_prefix):
    """Train a dense binary classifier on pre-extracted features for one magnification.

    Features are standardised with a ``StandardScaler`` fitted on the training
    split only. The network is trained for 50 epochs while a checkpoint keeps
    the model with the lowest validation loss. That best checkpoint is then
    restored, evaluated on the test split and exported, so the reported
    accuracy and the saved file describe the same selected model rather than
    the last-epoch weights.

    Args:
        X_train, X_val, X_test: 2-D feature arrays for the three splits.
        y_train, y_val, y_test: Binary labels for the three splits.
        mag: Magnification identifier, used in messages and file names.
        save_prefix: Prefix of the exported model file name.

    Returns:
        float: Accuracy on the test split at a 0.5 decision threshold.

    Side effects:
        Writes ``best_model_{mag}.keras`` and
        ``{save_prefix}_{batch}_{mag}_accuracy_{accuracy}.keras`` to the working
        directory and appends the latter path to the module-level ``saves``
        list. Reads the module-level ``batch`` for the mini-batch size.
    """
    # The scaler only ever sees training data; validation/test are transformed with it.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)
    X_test_scaled = scaler.transform(X_test)

    y_train = np.array(y_train).astype('float32')
    y_val = np.array(y_val).astype('float32')
    y_test = np.array(y_test).astype('float32')

    model = Sequential([
        Dense(2048, input_dim=X_train_scaled.shape[1], activation='relu'),
        BatchNormalization(),
        Dropout(0.5),

        Dense(1024, activation='relu'),
        BatchNormalization(),
        Dropout(0.5),

        Dense(512, activation='relu'),
        BatchNormalization(),
        Dropout(0.5),
        Dense(256, activation='relu'),
        BatchNormalization(),
        Dropout(0.5),

        Dense(1, activation='sigmoid')
    ])

    initial_lr = 0.001
    optimizer = tf.keras.optimizers.Adam(learning_rate=initial_lr)
    model.compile(optimizer=optimizer,
                  loss='binary_crossentropy',
                  metrics=['accuracy'])

    checkpoint_path = f'best_model_{mag}.keras'
    # Drop a checkpoint left over from an earlier run so that only a file
    # written during this run can be restored below.
    if os.path.isfile(checkpoint_path):
        os.remove(checkpoint_path)

    checkpoint = ModelCheckpoint(checkpoint_path, monitor='val_loss', save_best_only=True, mode='min', verbose=1)
    lr_scheduler = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=4, min_lr=1e-9, verbose=1)

    model.fit(
        X_train_scaled, y_train,
        epochs=50,
        batch_size=batch,
        validation_data=(X_val_scaled, y_val),
        callbacks=[checkpoint, lr_scheduler],
        verbose=1
    )

    # Evaluate and export the best-validation-loss model instead of whatever the
    # final epoch produced. Fall back to the in-memory model if no checkpoint
    # was ever written (e.g. validation loss was never finite).
    if os.path.isfile(checkpoint_path):
        model = tf.keras.models.load_model(checkpoint_path)

    y_pred = model.predict(X_test_scaled)
    y_pred = np.round(y_pred).astype(int).flatten()
    test_accuracy = accuracy_score(y_test, y_pred)
    print(f'Test Accuracy for magnification {mag}: {test_accuracy}')

    save_path = f'{save_prefix}_{batch}_{mag}_accuracy_{test_accuracy}.keras'
    saves.append(save_path)
    model.save(save_path)
    return test_accuracy

def train_models_for_all_magnifications(
    X_train_40, y_train_40, X_val_40, y_val_40, X_test_40, y_test_40,
    X_train_100, y_train_100, X_val_100, y_val_100, X_test_100, y_test_100,
    X_train_200, y_train_200, X_val_200, y_val_200, X_test_200, y_test_200,
    X_train_400, y_train_400, X_val_400, y_val_400, X_test_400, y_test_400):
    """Train one classifier per magnification and collect the test accuracies.

    Each group of six arguments is the train/validation/test feature and label
    arrays for one magnification, forwarded unchanged to ``train_model``.
    Models are trained in the order 40, 100, 200, 400.

    Returns:
        dict: Maps ``'40'``, ``'100'``, ``'200'`` and ``'400'`` to the value
        returned by ``train_model`` for that magnification.
    """
    per_magnification = (
        ('40', (X_train_40, y_train_40, X_val_40, y_val_40, X_test_40, y_test_40)),
        ('100', (X_train_100, y_train_100, X_val_100, y_val_100, X_test_100, y_test_100)),
        ('200', (X_train_200, y_train_200, X_val_200, y_val_200, X_test_200, y_test_200)),
        ('400', (X_train_400, y_train_400, X_val_400, y_val_400, X_test_400, y_test_400)),
    )
    # The save prefix follows the pattern model_<magnification>.
    return {
        mag: train_model(*arrays, mag, f'model_{mag}')
        for mag, arrays in per_magnification
    }

# Train the four per-magnification classifiers and collect their test accuracies.
accuracies = train_models_for_all_magnifications(
    X_train_40, y_train_40, X_val_40, y_val_40, X_test_40, y_test_40,
    X_train_100, y_train_100, X_val_100, y_val_100, X_test_100, y_test_100,
    X_train_200, y_train_200, X_val_200, y_val_200, X_test_200, y_test_200,
    X_train_400, y_train_400, X_val_400, y_val_400, X_test_400, y_test_400
)

print("Accuracies for each magnification level:")
for mag, acc in accuracies.items():
    print(f"{mag}x magnification: {acc}")

# Test-set size per magnification, used as the weight of that magnification's accuracy.
num_samples = {
    '40': len(X_test_40),
    '100': len(X_test_100),
    '200': len(X_test_200),
    '400': len(X_test_400)
}

total_samples = sum(num_samples.values())

# Sample-weighted mean of the per-magnification accuracies.
# Raises ZeroDivisionError when every test subset is empty.
weighted_accuracy = sum((num_samples[mag] / total_samples) * acc for mag, acc in accuracies.items())

print(f"Weighted accuracy across all magnifications: {weighted_accuracy}")


def zip_saved_models(file_paths, zip_filename):
    """Bundle the existing files among ``file_paths`` into a new zip archive.

    Each file is stored under its base name only. Paths that are not regular
    files are reported and skipped. Any existing archive at ``zip_filename``
    is overwritten.

    Args:
        file_paths: Iterable of paths to add.
        zip_filename: Path of the archive to create.
    """
    with zipfile.ZipFile(zip_filename, 'w') as archive:
        for candidate in file_paths:
            if not os.path.isfile(candidate):
                print(f"File not found: {candidate}")
                continue
            archive.write(candidate, os.path.basename(candidate))

    print(f"Selected files have been zipped into {zip_filename}")

# The archive is named after the weighted test accuracy of this run.
output_zip_filename = f'{weighted_accuracy}.zip'

# Bundle every model file that train_model recorded in `saves`.
zip_saved_models(saves, output_zip_filename)


# Use the Phikon model for 100x magnification and the combined model for 40x, 200x, and 400x magnifications.

In [None]:
import tensorflow as tf
from tensorflow.keras.models import load_model
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import FileLink, display
import joblib
from sklearn.preprocessing import StandardScaler
# Pre-computed 100x test features and labels loaded from a Kaggle input dataset
# (presumably Phikon-only embeddings, judging by the "_ph" suffix — TODO confirm).
X_test_100_ph = np.load('/kaggle/input/ph-data/X_test_100_ph.npy')
y_test_100_ph = np.load('/kaggle/input/ph-data/y_test_100_ph.npy')
def recreate_and_save_scaler(X_train, mag):
    """Fit a fresh ``StandardScaler`` on ``X_train`` and persist it with joblib.

    The scaler is written to ``scaler_{mag}_all.pkl`` in the current working
    directory and a confirmation line is printed.

    Args:
        X_train (numpy.ndarray): Features the scaler is fitted on.
        mag (str): Magnification level identifier used in the file name.
    """
    scaler_filename = f'scaler_{mag}_all.pkl'
    # StandardScaler.fit returns the fitted scaler itself, so it can be dumped directly.
    joblib.dump(StandardScaler().fit(X_train), scaler_filename)
    print(f'Scaler for {mag}x magnification saved as {scaler_filename}')

# Re-fit and save one scaler per magnification (written to scaler_<mag>_all.pkl).
recreate_and_save_scaler(X_train_40, '40')
# NOTE(review): unlike the other three calls, the 100x scaler is fitted on a
# *test* array (X_test_100_ph). Its statistics therefore come from evaluation
# data — confirm whether the matching training features should be used instead.
recreate_and_save_scaler(X_test_100_ph, '100')
recreate_and_save_scaler(X_train_200, '200')
recreate_and_save_scaler(X_train_400, '400')

def load_scaler(scaler_filename):
    """Load a joblib-serialised scaler, or report that the file is missing.

    Args:
        scaler_filename: Path of the file to load.

    Returns:
        The object stored in the file, or ``None`` when the path does not exist.
    """
    if not os.path.exists(scaler_filename):
        print(f'Scaler file {scaler_filename} not found.')
        return None

    scaler = joblib.load(scaler_filename)
    print(f'Scaler loaded from {scaler_filename}')
    return scaler

def load_and_predict(model_path, scaler, X_test, y_test, mag):
    """Load a saved Keras model, score it on a test set and print a report.

    Args:
        model_path: Path of the model file passed to ``load_model``.
        scaler: Fitted scaler applied to ``X_test``, or ``None`` to feed the
            features to the model unscaled.
        X_test: Test features.
        y_test: True test labels.
        mag: Magnification identifier used in the printed messages.

    Returns:
        tuple: ``(accuracy, confusion_matrix)`` computed from the model
        outputs rounded to 0/1.
    """
    model = load_model(model_path)
    print(f'Model loaded from {model_path}')

    if scaler is None:
        print('Scaler is None. Proceeding without scaling.')
        model_inputs = X_test
    else:
        model_inputs = scaler.transform(X_test)

    # Round the model outputs to hard 0/1 predictions.
    probabilities = model.predict(model_inputs)
    y_pred = np.round(probabilities).astype(int).flatten()

    acc = accuracy_score(y_test, y_pred)
    print(f'Test Accuracy for {mag}x Magnification: {acc:.4f}')

    print(f'Classification Report for {mag}x Magnification:')
    print(classification_report(y_test, y_pred))

    cm = confusion_matrix(y_test, y_pred)
    print(f'Confusion Matrix for {mag}x Magnification:')
    print(cm)

    return acc, cm

def plot_confusion_matrices(confusion_matrices, magnifications, figsize=(20, 5), save_path=None):
    """Draw one confusion-matrix heatmap per magnification in a single row.

    Args:
        confusion_matrices: Mapping from magnification to its confusion matrix.
        magnifications: Magnifications to plot, left to right.
        figsize: Figure size forwarded to ``plt.subplots``.
        save_path: When truthy, the figure is also written to this path.
    """
    panel_count = len(magnifications)
    fig, axes = plt.subplots(1, panel_count, figsize=figsize)

    # With a single panel `axes` is wrapped in a list so it can be zipped below.
    axis_list = [axes] if panel_count == 1 else axes
    tick_names = ['Negative (0)', 'Positive (1)']

    for axis, mag in zip(axis_list, magnifications):
        sns.heatmap(confusion_matrices[mag], annot=True, fmt='d', cmap='Blues', cbar=False, ax=axis)
        axis.set_title(f'Confusion Matrix: {mag}x')
        axis.set_xlabel('Predicted Labels')
        axis.set_ylabel('True Labels')
        axis.xaxis.set_ticklabels(tick_names)
        axis.yaxis.set_ticklabels(tick_names)

    plt.tight_layout()

    if save_path:
        plt.savefig(save_path)
        print(f'Confusion matrices saved to {save_path}')

    plt.show()

def create_download_link(filepath, title="Download Image"):
    """Display an IPython ``FileLink`` for ``filepath`` if the file exists.

    Args:
        filepath: Path of the file to link to.
        title: Text placed in front of the link.
    """
    if not os.path.exists(filepath):
        print(f"File {filepath} not found.")
        return
    display(FileLink(filepath, result_html_prefix=f'<p>{title}: '))

# Evaluation plan: for every magnification, the saved model, the scaler file and
# the test arrays it is scored on. Models and scalers are read from Kaggle input
# datasets, not from the files written earlier in this notebook.
model_info = {
    '40': {
        'model_path': "/kaggle/input/model-99.1-not-ph/pytorch/default/1/model_40_16_40_accuracy_0.9919743178170144.keras",
        'scaler_path': '/kaggle/input/scaler-ph-and-other/scaler_40_all.pkl',
        'X_test': X_test_40,
        'y_test': y_test_40
    },
    # 100x is the only entry scored on the separately loaded *_ph arrays, and its
    # scaler file name has no "_all" suffix.
    '100': {
        'model_path':"/kaggle/input/ph-100-93/pytorch/default/1/model_100_16_100_accuracy_0.9932318104906938.keras" ,
        'scaler_path': '/kaggle/input/scaler-ph-and-other/scaler_100.pkl',
        'X_test': X_test_100_ph,
        'y_test': y_test_100_ph
    },
    '200': {
        'model_path': '/kaggle/input/model-99.1-not-ph/pytorch/default/1/model_200_16_200_accuracy_0.9982758620689656.keras',
        'scaler_path': '/kaggle/input/scaler-ph-and-other/scaler_200_all.pkl',
        'X_test': X_test_200,
        'y_test': y_test_200
    },
    '400': {
        'model_path': '/kaggle/input/model-99.1-not-ph/pytorch/default/1/model_400_16_400_accuracy_0.9913644214162349.keras',
        'scaler_path': '/kaggle/input/scaler-ph-and-other/scaler_400_all.pkl',
        'X_test': X_test_400,
        'y_test': y_test_400
    }
}

# Per-magnification results, keyed by magnification string.
test_accuracies = {}
confusion_matrices = {}

# Score every saved model on its own test subset.
for mag, info in model_info.items():
    print(f"\nProcessing {mag}x Magnification:")

    scaler = load_scaler(info['scaler_path'])

    # The arrays are read straight from `info` rather than being bound to
    # module-level X_test / y_test: those names hold the full test set from the
    # feature-extraction cell, and rebinding them here would leave them pointing
    # at the last magnification's subset if an earlier cell is re-run.
    acc, cm = load_and_predict(info['model_path'], scaler, info['X_test'], info['y_test'], mag)

    test_accuracies[mag] = acc
    confusion_matrices[mag] = cm

print("\nTest Accuracies for Each Magnification Level:")
for mag, acc in test_accuracies.items():
    print(f"{mag}x Magnification: {acc:.4f}")

# Plot order of the confusion matrices, left to right.
magnifications = ['40', '100', '200', '400']

plot_confusion_matrices(confusion_matrices, magnifications, save_path='confusion_matrices.png')

create_download_link('confusion_matrices.png', title="Click here to download the Confusion Matrices Image")
