In [1]:
import pandas as pd
import glob
import random
import os
import cv2
from sklearn.utils import shuffle
from numpy.random import seed
import numpy as np
from tensorflow.random import set_seed
import warnings
import matplotlib.pyplot as plt
%matplotlib inline

# Notebook-wide configuration: display options, RNG seeding and shared constants.
pd.set_option("display.max_colwidth", 100)

# Seed every random number generator used in this notebook.
# NumPy is seeded through `np.random` because the bare name `seed` is rebound
# to the integer constant at the bottom of this cell.
np.random.seed(42)
random.seed(42)
# NOTE: hash randomisation is fixed when the interpreter starts, so this only
# affects processes launched after this point.
os.environ['PYTHONHASHSEED'] = str(42)
set_seed(42)

# Blanket filter: hides every warning raised from here on.
warnings.filterwarnings('ignore')

# Shared constants used by the cells below.
img_size = 256    # images are resized to img_size x img_size pixels
batch_size = 64   # mini-batch size passed to model.fit
seed = 42

In [23]:
# Locate the chest X-ray image files for every split (train / test / val)
# and class (NORMAL / PNEUMONIA).
main_path = "../chest_xray/"

train_path = os.path.join(main_path, "train")
test_path = os.path.join(main_path, "test")
validation_path = os.path.join(main_path, "val")


def _list_jpegs(split_dir, class_dir):
    """Return the sorted paths of all ``*.jpeg`` files in ``split_dir/class_dir``.

    ``glob.glob`` gives no ordering guarantee (it depends on the file system),
    so the result is sorted to keep the file order, and everything derived
    from it, stable between runs and machines.
    """
    return sorted(glob.glob(os.path.join(split_dir, class_dir, "*.jpeg")))


train_normal = _list_jpegs(train_path, "NORMAL")
train_pneumonia = _list_jpegs(train_path, "PNEUMONIA")

val_normal = _list_jpegs(validation_path, "NORMAL")
val_pneumonia = _list_jpegs(validation_path, "PNEUMONIA")

test_normal = _list_jpegs(test_path, "NORMAL")
test_pneumonia = _list_jpegs(test_path, "PNEUMONIA")

# Per-split list of paths: all NORMAL files first, then all PNEUMONIA files.
train_list = train_normal + train_pneumonia
test_list = test_normal + test_pneumonia
val_list = val_normal + val_pneumonia

In [None]:
# Pre-process every split once and cache it as a pickled DataFrame with two
# columns: 'image' (img_size x img_size float array in [0, 1]) and 'label'
# (Normal : 0 and Pneumonia : 1).
dic_data = {"train":[train_normal, train_pneumonia], "test":[test_normal, test_pneumonia] , "val":[val_normal, val_pneumonia]}
for key, (normal_paths, pneumonia_paths) in dic_data.items():
    # Pair every path with its label up front, so that skipping an unreadable
    # file can never shift the labels of the files that come after it.
    labelled_paths = [(path, 0) for path in normal_paths]
    labelled_paths.extend((path, 1) for path in pneumonia_paths)

    data = []
    skipped = 0
    for img_path, label in labelled_paths:
        img = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)

        # cv2.imread returns None instead of raising when a file cannot be read.
        if img is None:
            skipped += 1
            continue
        # resize image shape
        img = cv2.resize(img, (img_size, img_size))
        # normalize to [0, 1]; float32 halves the memory of the default float64
        img = img.astype(np.float32) / 255.0

        data.append([img, label])
    df = pd.DataFrame(data, columns=['image', 'label'])
    print(f"{key} shape : {df.shape}")
    if skipped:
        print(f"{key} : skipped {skipped} unreadable image(s)")
    # Save next to the dataset: that is where the loading cell reads the
    # pickles from (os.path.join(main_path, "<split>.pkl")).
    df.to_pickle(os.path.join(main_path, f"{key}.pkl"))

In [4]:
# Load the cached, pre-processed splits from the dataset directory.
# NOTE: pickle files can execute arbitrary code when loaded; only read
# pickles that were produced by this notebook.
df_train, df_test, df_val = (
    pd.read_pickle(os.path.join(main_path, pickle_name))
    for pickle_name in ("train.pkl", "test.pkl", "val.pkl")
)

### Reshape the loaded images

In [5]:
def _images_and_labels(frame):
    """Turn one split DataFrame into shuffled ``(X, y)`` arrays.

    Every entry of ``frame['image']`` is reshaped to
    ``(img_size, img_size, 1)`` (a trailing channel axis is added) and the
    images are stacked into one array; ``frame['label']`` becomes the label
    array. Both are shuffled together with a fixed ``random_state`` of 42.
    """
    images = np.array([picture.reshape(img_size, img_size, 1) for picture in frame['image']])
    labels = np.array(frame['label'])
    return shuffle(images, labels, random_state=42)


X_train, y_train = _images_and_labels(df_train)
X_val, y_val = _images_and_labels(df_val)
X_test, y_test = _images_and_labels(df_test)

#### Class distribution per dataset split (bar chart)

In [None]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Define class names; position i is the display name of label i
class_names = ['Normal', 'Pneumonia']

# Set style
sns.set_style("whitegrid")  # Clean grid background
# Matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8'
# and 3.8 removed the old names, so use whichever one this version provides.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')

# Create subplots: one panel per dataset split
fig, axes = plt.subplots(1, 3, figsize=(18, 6))
labels = ['Train', 'Validation', 'Test']
list_df = [df_train, df_val, df_test]

# Define a colorblind-friendly color scheme (one colour per class)
colors = ['#1f77b4', '#ff7f0e']  # Blue and orange


def distribution_percentages(list_df, labels):
    """Print the percentage of samples per class for each DataFrame.

    Args:
        list_df: DataFrames that each have a 'label' column.
        labels: Display name of each DataFrame, in the same order as list_df.
    """
    for i, df in enumerate(list_df):
        class_counts = df['label'].value_counts(normalize=True) * 100
        print(f"{labels[i]} Class Distribution:")
        for label, percentage in zip(class_counts.index, class_counts.values):
            print(f"Class '{label}': {percentage:.2f}%")


# Plot bar plots for each dataset
for i, ax in enumerate(axes):
    # Count the samples of every known class. Reindexing guarantees one entry
    # per class (0 when a class is absent), so the bars always line up with
    # `class_names` and `colors`.
    class_counts = (
        list_df[i]['label']
        .value_counts()
        .reindex(range(len(class_names)), fill_value=0)
    )

    # Create a bar plot
    bars = ax.bar(class_counts.index, class_counts.values, color=colors, edgecolor='black', linewidth=1.2)

    # Add labels and title
    ax.set_title(f'{labels[i]} (Total: {len(list_df[i])})', fontsize=16, fontweight='bold', pad=20)
    ax.set_xlabel('Class', fontsize=14, fontweight='bold')
    ax.set_ylabel('Number of Samples', fontsize=14, fontweight='bold')

    # Set x-ticks and labels
    ax.set_xticks(list(class_counts.index))
    ax.set_xticklabels(class_names, fontsize=14)

    # Add value labels on top of each bar
    for bar in bars:
        height = bar.get_height()
        ax.text(bar.get_x() + bar.get_width() / 2, height + 0.5, str(int(height)),
                ha='center', va='bottom', fontsize=14, fontweight='bold')

    # Add gridlines
    ax.grid(axis='y', linestyle='--', alpha=0.7)

    # Leave 10% headroom above the tallest bar so its value label fits
    # inside the axes whatever the magnitude of the counts.
    ax.set_ylim(0, max(class_counts.max(), 1) * 1.1)

# Add a main title for the entire figure
fig.suptitle('Class Distribution Across Datasets', fontsize=20, fontweight='bold', y=1.05)

# Add a legend. `facecolor` is used rather than `color`, because `color` sets
# the face and the edge together and would override the black edge.
handles = [plt.Rectangle((0, 0), 1, 1, facecolor=colors[i], edgecolor='black') for i in range(len(class_names))]
fig.legend(handles, class_names, title='Classes', loc='upper right', bbox_to_anchor=(1.0, 1.0), fontsize=12, title_fontsize=14)

# Adjust layout
plt.tight_layout()

# Save the plot as a high-resolution image
plt.savefig('class_distribution_professional.png', dpi=300, bbox_inches='tight')

# Show the plot
plt.show()



### Build the CNN model with Keras

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense
from tensorflow.keras.metrics import Precision, Recall
from tensorflow.keras.callbacks import ModelCheckpoint
import numpy as np
import visualkeras

# Define the model: three Conv2D + MaxPooling2D stages followed by a small
# dense head. The input shape is derived from `img_size` so the network always
# matches the size the images were resized to.
model = Sequential([
    Conv2D(32, (3, 3), activation='relu', input_shape=(img_size, img_size, 1)),
    MaxPooling2D((2, 2)),
    Conv2D(64, (3, 3), activation='relu'),
    MaxPooling2D((2, 2)),
    Conv2D(128, (3, 3), activation='relu'),
    MaxPooling2D((2, 2)),
    Flatten(),
    Dense(64, activation='relu'),
    Dense(1, activation='sigmoid')  # Binary classification
])

# Compile the model with precision and recall metrics
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=[Precision(name='precision'), Recall(name='recall')])


# Visualize the model using visualkeras
visualkeras.layered_view(
    model,
    legend=True,  # Add a legend
    scale_xy=15,  # Scale the diagram
    scale_z=1,    # Control the height of 3D-like layers
    spacing=50,   # Add spacing between layers
    to_file='model_diagram.png'  # Save as a high-quality image
).show()

# Define the ModelCheckpoint callback (keeps the weights with the best
# validation recall).
# NOTE: the callback is currently NOT passed to model.fit below, so no
# checkpoint file is written; add `callbacks=[checkpoint]` to enable it.
checkpoint = ModelCheckpoint('best_model.h5',
                             monitor='val_recall',
                             save_best_only=True,
                             mode='max',
                             verbose=1)

# Compute class weights from the training labels
from sklearn.utils.class_weight import compute_class_weight
train_classes = np.unique(y_train)
class_weights = compute_class_weight(class_weight='balanced', classes=train_classes, y=y_train)
# Key each weight by the class label it belongs to (weights are returned in
# the order of `train_classes`), not by its position in the array.
class_weights_dict = {int(label): float(weight) for label, weight in zip(train_classes, class_weights)}
print("Class Weights:", class_weights_dict)

# Train the model
history = model.fit(X_train,
                    y_train,
                    batch_size=batch_size,
                    epochs=12,
                    validation_data=(X_val, y_val),
                    class_weight=class_weights_dict)

In [None]:
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, auc, precision_recall_curve
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.utils import class_weight

# Predict probabilities
y_pred = model.predict(X_test)
# The sigmoid output has one column per sample; flatten it to a 1-D score
# vector so it lines up with the 1-D label array.
y_scores = np.ravel(y_pred)

# If y_test is one-hot encoded, convert it to 1D array
if y_test.ndim > 1:
    y_test = np.argmax(y_test, axis=1)

# Adjust threshold using ROC curve analysis
# NOTE(review): the threshold is selected on the test set itself, which makes
# the metrics below optimistic; consider selecting it on validation data.
fpr, tpr, thresholds = roc_curve(y_test, y_scores)
optimal_idx = np.argmax(tpr - fpr)  # Find the threshold that maximizes TPR - FPR
optimal_threshold = thresholds[optimal_idx]
print(f"Optimal Threshold: {optimal_threshold}")

# Convert probabilities to binary predictions using the optimal threshold.
# roc_curve computes fpr[i] / tpr[i] for predictions with
# score >= thresholds[i], so the comparison must be inclusive to land on the
# selected operating point.
y_pred_classes = (y_scores >= optimal_threshold).astype(int)

# Compute confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred_classes)
print("Confusion Matrix:\n", conf_matrix)

# Generate classification report
print(classification_report(y_test, y_pred_classes, target_names=['Normal', 'Pneumonia']))

# Plot heatmap for confusion matrix
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', 
            xticklabels=['Normal', 'Pneumonia'], 
            yticklabels=['Normal', 'Pneumonia'])
plt.title('Confusion Matrix')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.show()

# Compute and plot ROC curve
roc_auc = auc(fpr, tpr)
plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc="lower right")
plt.show()

# Compute and plot Precision-Recall curve
precision, recall, _ = precision_recall_curve(y_test, y_scores)
pr_auc = auc(recall, precision)
plt.figure()
plt.plot(recall, precision, color='blue', lw=2, label=f'PR curve (area = {pr_auc:.2f})')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve')
plt.legend(loc="lower left")
plt.show()

# Address class imbalance using class weighting.
# The weights are derived from the TRAINING labels: they are meant for
# retraining, and test labels must not influence training.
weight_classes = np.unique(y_train)
class_weights = class_weight.compute_class_weight('balanced', classes=weight_classes, y=y_train)
class_weights = {int(label): float(weight) for label, weight in zip(weight_classes, class_weights)}
print(f"Class Weights: {class_weights}")

# Retrain the model with class weights (if you have access to the training process)
# model.fit(X_train, y_train, class_weight=class_weights)


In [None]:
import matplotlib.pyplot as plt

# Plot the training history side by side: loss on the left, precision on the
# right, each with its training and validation curve.
history_fig, history_axes = plt.subplots(1, 2, figsize=(12, 6))

# (history key, curve label) pairs per panel, plus the panel title and y-label.
history_panels = [
    ([('loss', 'Train Loss'), ('val_loss', 'Validation Loss')],
     'Model Loss', 'Loss'),
    ([('precision', 'Train Precision'), ('val_precision', 'Validation Precision')],
     'Model Precision', 'Precision'),
]

for panel_ax, (curves, panel_title, y_label) in zip(history_axes, history_panels):
    for history_key, curve_label in curves:
        panel_ax.plot(history.history[history_key], label=curve_label)
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel('Epoch')
    panel_ax.set_ylabel(y_label)
    panel_ax.legend()

plt.tight_layout()
plt.show()