This report was prepared with the assistance of AI

Step 1

In [None]:
from google.colab import files
uploaded = files.upload()

Saving hep_img_large_gt.csv to hep_img_large_gt.csv
Saving hep2img_large.zip to hep2img_large.zip


Steps 2,3,4

In [None]:
import pandas as pd
import zipfile
import os
from skimage.io import imread
import numpy as np
from scipy import stats
from skimage.measure import regionprops
from skimage.feature import graycomatrix, graycoprops

zip_file = '/content/hep2img_large.zip'
extract_folder = '/content/extracted_images2/hep2imglarge/hep2img_large/hep2img_large'

os.makedirs(extract_folder, exist_ok=True)

with zipfile.ZipFile(zip_file, 'r') as zip_ref:
    zip_ref.extractall(extract_folder)

print("Extraction completed!")

csv_path = '/content/hep_img_large_gt.csv'

# Read the CSV file into a DataFrame
df = pd.read_csv('/content/hep_img_large_gt.csv')

print("First 5 rows of the DataFrame:")
print(df.head())

# Extract image IDs and class labels
case_numbers = df['file']
class_labels = df['class']

# Create empty lists
images = []
masks = []

# Load images and masks from the extracted folder
for id in case_numbers:
    id_str = str(id).zfill(3)  # Ensure filenames have leading zeros if needed

    # Define image and mask paths
    image_path = os.path.join(extract_folder, f"{id_str}.png")
    mask_path = os.path.join(extract_folder, f"{id_str}_Mask.png")

    try:
        image = imread(image_path)
        mask = imread(mask_path)

        images.append(image)
        masks.append(mask)
    except FileNotFoundError:
        print(f"Warning: File {image_path} or {mask_path} not found!")

# Confirm the number of loaded images and masks
print("\nTotal images loaded:", len(images))
print("Total masks loaded:", len(masks))

# First-order feature extraction function
def get_first_order_stats(img):
    entropy = stats.entropy(img, axis=None)
    desc_stats = stats.describe(img, axis=None)
    first_order_stats = np.array([
        desc_stats.mean,
        desc_stats.minmax[0],  # Min
        desc_stats.minmax[1],  # Max
        desc_stats.variance,
        desc_stats.skewness,
        desc_stats.kurtosis,
        entropy
    ])
    return first_order_stats

# Shape feature extraction function
def get_shape_region_features(mask):
    props = regionprops(mask)[0]
    shape_region = np.array([
        props.area,
        props.area_bbox,
        props.convex_area,
        props.eccentricity,
        props.extent,
        props.major_axis_length,
        props.minor_axis_length,
        props.orientation,
        props.perimeter,
        props.solidity,
        props.extent

    ])
    shape_region = np.insert(shape_region, [-1], props.moments_central.flatten())  # Moments
    shape_region = np.insert(shape_region, [-1], props.moments_hu.flatten())  # Hu moments
    return shape_region

# Texture feature extraction function
def get_texture_features(img):
    glcm = graycomatrix(img, [1], [0, np.pi/4, np.pi/2, np.pi*3/4], levels=256, normed=True, symmetric=True)
    texture_features = np.array([])
    texture_features = np.insert(texture_features, [0], graycoprops(glcm, "contrast").flatten())
    texture_features = np.insert(texture_features, [-1], graycoprops(glcm, "dissimilarity").flatten())
    texture_features = np.insert(texture_features, [-1], graycoprops(glcm, "homogeneity").flatten())
    texture_features = np.insert(texture_features, [-1], graycoprops(glcm, "energy").flatten())
    texture_features = np.insert(texture_features, [-1], graycoprops(glcm, "correlation").flatten())
    texture_features = np.insert(texture_features, [-1], graycoprops(glcm, "ASM").flatten())
    return texture_features

# Intensity-based feature function
def get_intensity_features(img):
    # Calculate gradient magnitude
    gradient_x = np.gradient(img, axis=0)
    gradient_y = np.gradient(img, axis=1)
    gradient_magnitude = np.sqrt(gradient_x**2 + gradient_y**2)

    # Get intensity features
    intensity_features = np.array([
        np.mean(gradient_magnitude),       # Mean gradient magnitude
        np.std(gradient_magnitude),        # Standard deviation of gradient
        np.percentile(img, 25),            # 25th percentile intensity
        np.percentile(img, 75),            # 75th percentile intensity
        np.max(img) - np.min(img)          # Dynamic range
    ])
    return intensity_features

# Initialize an empty list to store features
all_features = []
some_features=[]

# Loop through images and masks and extract features
for img, mask in zip(images, masks):
    first_order = get_first_order_stats(img)
    shape_features = get_shape_region_features(mask)
    texture_features = get_texture_features(img)
    intensity_features = get_intensity_features(img)

    # Combine all feature sets
    combined_features = np.concatenate((first_order, shape_features, texture_features))
    challenge2_combined_features = np.concatenate((first_order, intensity_features))

    all_features.append(combined_features)
    some_features.append(challenge2_combined_features)

# Convert to a NumPy array for use in classification
all_features = np.array(all_features)
challenge2_features = np.array(some_features)

# Print feature shape to verify extraction success
print("Feature extraction completed!")
print("Feature array shape:", all_features.shape)
print("Challenge2 features array shape:", challenge2_features.shape)

Extraction completed!
First 5 rows of the DataFrame:
   file  class
0    82      1
1    27      1
2    90      1
3    76      1
4    32      1

Total images loaded: 453
Total masks loaded: 453
Feature extraction completed!
Feature array shape: (453, 65)
Challenge2 features array shape: (453, 12)


Step 5

In [None]:
from sklearn.model_selection import train_test_split

# Define feature matrix and target labels
IMAGE_FEATURE_MATRIX = all_features  # The feature matrix from extracted features
challenge2_matrix = challenge2_features
CLASSES = class_labels  # The labels corresponding to each image

# Split the dataset into training and testing sets (80% train, 20% test)
TRAIN_X, TEST_X, TRAIN_Y, TEST_Y = train_test_split(
    IMAGE_FEATURE_MATRIX,  # Features
    CLASSES,  # Target labels
    test_size=0.2,  # 20% of data for testing
    random_state=42  # Ensures reproducibility
)

TRAIN_X2, TEST_X2, TRAIN_Y2, TEST_Y2 = train_test_split(
    challenge2_matrix,  # Features
    CLASSES,  # Target labels
    test_size=0.2,  # 20% of data for testing
    random_state=42  # Ensures reproducibility
)

print("Training set size:", len(TRAIN_X))
print("Testing set size:", len(TEST_X))
print("Training set2 size:", len(TRAIN_X2))
print("Testing set2 size:", len(TEST_X2))


Training set size: 362
Testing set size: 91
Training set2 size: 362
Testing set2 size: 91


Step 6,7,8

In [None]:
# Decision Tree Learner
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report

learner = DecisionTreeClassifier(random_state=42)  # You can specify more hyperparameters here if needed

# Train the learner with the training data
model = learner.fit(TRAIN_X, TRAIN_Y)  # TRAIN_X: features, TRAIN_Y: labels

# Make predictions on the test data
predictions = model.predict(TEST_X)  # TEST_X: test features

# Evaluate the predictions
accuracy = accuracy_score(TEST_Y, predictions)  # TEST_Y: true labels
print("Decision Tree Accuracy:", accuracy)

# Classification report
report = classification_report(TEST_Y, predictions)
print("Classification Report:")
print(report)

Decision Tree Accuracy: 0.7802197802197802
Classification Report:
              precision    recall  f1-score   support

           1       0.77      0.71      0.74        24
           2       0.72      1.00      0.84        13
           3       0.73      0.79      0.76        14
           4       0.83      0.79      0.81        19
           5       0.83      0.71      0.77        21

    accuracy                           0.78        91
   macro avg       0.78      0.80      0.78        91
weighted avg       0.79      0.78      0.78        91



Challenge 1 : Repeating the Prediction using three different algorithms (Random Forest, SVM, Decision Tree)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

learner = RandomForestClassifier(random_state=42, n_estimators=1000)

# Train the learner with the training data
model = learner.fit(TRAIN_X, TRAIN_Y)  # TRAIN_X: features, TRAIN_Y: labels

# Make predictions on the test data
predictions = model.predict(TEST_X)  # TEST_X: test features

# Evaluate the predictions
accuracy = accuracy_score(TEST_Y, predictions)  # TEST_Y: true labels
print("Random Forest Accuracy:", accuracy)

# Classification report
report = classification_report(TEST_Y, predictions)
print("Classification Report:")
print(report)

Random Forest Accuracy: 0.9340659340659341
Classification Report:
              precision    recall  f1-score   support

           1       0.92      0.92      0.92        24
           2       1.00      1.00      1.00        13
           3       1.00      0.79      0.88        14
           4       0.90      0.95      0.92        19
           5       0.91      1.00      0.95        21

    accuracy                           0.93        91
   macro avg       0.95      0.93      0.93        91
weighted avg       0.94      0.93      0.93        91



SVM

In [None]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

learner = SVC(random_state=44, kernel='sigmoid', probability=True)

model = learner.fit(TRAIN_X, TRAIN_Y)

# Make predictions on the test data
predictions = model.predict(TEST_X)  # TEST_X: test features

# Evaluate the predictions
accuracy = accuracy_score(TEST_Y, predictions)  # TEST_Y: true labels
print("SVM Accuracy:", accuracy)

# Classification report
report = classification_report(TEST_Y, predictions)
print("Classification Report:")
print(report)

SVM Accuracy: 0.18681318681318682
Classification Report:
              precision    recall  f1-score   support

           1       0.00      0.00      0.00        24
           2       0.00      0.00      0.00        13
           3       0.15      0.43      0.22        14
           4       0.20      0.53      0.29        19
           5       1.00      0.05      0.09        21

    accuracy                           0.19        91
   macro avg       0.27      0.20      0.12        91
weighted avg       0.30      0.19      0.12        91



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


After evaluating three different algorithms, Random Forest gives the highest accuracy with 0.93. Second being Decision tree with 0.78 accuracy and the algorithm with the least accuracy is SVM with 0.18. Hence Random Forest is the best performing algorithm

Challenge 2

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

learner = RandomForestClassifier(random_state=4, n_estimators=1000)

model = learner.fit(TRAIN_X2, TRAIN_Y2)  # TRAIN_X2: features, TRAIN_Y2: labels

predictions = model.predict(TEST_X2)  # TEST_X2: test features

# Evaluate the predictions
accuracy = accuracy_score(TEST_Y2, predictions)  # TEST_Y2: true labels
print("Random Forest Accuracy:", accuracy)

# Classification report
report = classification_report(TEST_Y2, predictions)
print("Classification Report:")
print(report)

Random Forest Accuracy: 0.945054945054945
Classification Report:
              precision    recall  f1-score   support

           1       0.96      0.96      0.96        24
           2       1.00      1.00      1.00        13
           3       1.00      0.93      0.96        14
           4       0.86      0.95      0.90        19
           5       0.95      0.90      0.93        21

    accuracy                           0.95        91
   macro avg       0.95      0.95      0.95        91
weighted avg       0.95      0.95      0.95        91



I repeated the prediction using a different feature set (Intensity Features) on the best performing algorithm (Random Forest). It has improved the prediction from 0.93 to 0.94 compared to using all features.

Challenge 3: Cross Validation

In [None]:
from sklearn.model_selection import cross_validate

scoring = ['accuracy']

cv_results = cross_validate(
    learner, TRAIN_X, TRAIN_Y,
    cv=5,  # Number of cross-validation folds
    scoring= scoring,
    return_train_score=True  # Get training scores as well
)

# Step 4: Display cross-validation results
print("Cross-Validation Results:")
print("Train Accuracy (mean):", cv_results['train_accuracy'].mean())
print("Test Accuracy (mean):", cv_results['test_accuracy'].mean())
print("Cross-Validation Details:")
print(cv_results)

Cross-Validation Results:
Train Accuracy (mean): 1.0
Test Accuracy (mean): 0.903538812785388
Cross-Validation Details:
{'fit_time': array([3.54212499, 2.62730765, 2.68502879, 2.66187644, 3.4333477 ]), 'score_time': array([0.07509661, 0.06975007, 0.0680151 , 0.07541633, 0.11781335]), 'test_accuracy': array([0.84931507, 0.87671233, 0.93055556, 0.93055556, 0.93055556]), 'train_accuracy': array([1., 1., 1., 1., 1.])}


Challenge 4: Grid search cross validation

In [None]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# Assuming you already have X_train, X_test, y_train, y_test

# Define cross-validation strategy
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Define parameter grids for different classifiers


param_grid_rf = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}


# Create classifiers
classifiers = {

    'Random Forest': (RandomForestClassifier(), param_grid_rf),

}

# Dictionary to store the best models
best_models = {}

# Perform grid search for each classifier
for name, (classifier, param_grid) in classifiers.items():
    print(f"\nPerforming grid search for {name}...")

    grid_search = GridSearchCV(
        estimator=classifier,
        param_grid=param_grid,
        cv=cv,
        scoring='accuracy',
        n_jobs=-1,
        verbose=1
    )

    # Fit grid search
    grid_search.fit(TRAIN_X, TRAIN_Y)

    # Get best model
    best_models[name] = grid_search.best_estimator_

    # Print results
    print(f"Best parameters for {name}: {grid_search.best_params_}")
    print(f"Best cross-validation score: {grid_search.best_score_:.4f}")

    # Evaluate on test set
    y_pred = grid_search.predict(TEST_X)
    test_accuracy = accuracy_score(TEST_Y, y_pred)
    print(f"Test accuracy with best {name}: {test_accuracy:.4f}")
    print("\nClassification Report:")
    print(classification_report(TEST_Y, y_pred))

# Find the overall best model
best_model_name = max(best_models, key=lambda name: accuracy_score(TEST_Y, best_models[name].predict(TEST_X)))
best_model = best_models[best_model_name]

print(f"\n\nThe best overall model is {best_model_name} with test accuracy: "
      f"{accuracy_score(TEST_Y, best_model.predict(TEST_X)):.4f}")



Performing grid search for Random Forest...
Fitting 5 folds for each of 108 candidates, totalling 540 fits
Best parameters for Random Forest: {'max_depth': 30, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 200}
Best cross-validation score: 0.9199
Test accuracy with best Random Forest: 0.9341

Classification Report:
              precision    recall  f1-score   support

           1       0.92      0.92      0.92        24
           2       1.00      1.00      1.00        13
           3       1.00      0.79      0.88        14
           4       0.90      0.95      0.92        19
           5       0.91      1.00      0.95        21

    accuracy                           0.93        91
   macro avg       0.95      0.93      0.93        91
weighted avg       0.94      0.93      0.93        91



The best overall model is Random Forest with test accuracy: 0.9341
