In [27]:
import os
import numpy as np
import pandas as pd
from keras.preprocessing import image
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LassoCV
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix, classification_report
import lightgbm as lgb
from sklearn.model_selection import GridSearchCV
import seaborn as sns
import matplotlib.pyplot as plt

# Image Data Loading Function
def get_image_path_from_subfolders(image_name, root_folder):
    """Find a file called ``image_name`` somewhere beneath ``root_folder``.

    The directory tree is traversed with ``os.walk`` and the first directory
    whose file list contains ``image_name`` wins.

    Args:
        image_name: Exact file name (not a path) to look for.
        root_folder: Directory at which the recursive search starts.

    Returns:
        The joined path of the first match, or ``None`` when no directory
        under ``root_folder`` contains a file with that name.
    """
    # Lazy generator: the walk stops as soon as the first hit is consumed.
    candidate_paths = (
        os.path.join(current_dir, image_name)
        for current_dir, _subdirs, file_names in os.walk(root_folder)
        if image_name in file_names
    )
    # The default of None covers the "image not found" case.
    return next(candidate_paths, None)

def prepare_data(image_names_non_cancer, image_paths_cancer, root_folder_non_cancer, root_folder_cancer):
    """Load non-cancer and cancer images and build the matching label vector.

    Non-cancer images are given by file name and are searched for recursively
    under ``root_folder_non_cancer``; names that cannot be found are skipped
    without an error, so the number of returned rows can be smaller than the
    number of requested names. Cancer images are given by full path and are
    loaded directly (a missing file raises from the image loader).

    Every image is loaded with ``target_size=(224, 224)`` and its pixel array
    is divided by 255.0.

    Args:
        image_names_non_cancer: Iterable of file names for label-0 images.
        image_paths_cancer: Iterable of file paths for label-1 images.
        root_folder_non_cancer: Directory tree searched for the label-0 names.
        root_folder_cancer: Unused; kept so existing callers keep working.

    Returns:
        Tuple ``(images, labels)`` of NumPy arrays. Non-cancer rows come
        first, followed by cancer rows; ``labels`` holds 0 for non-cancer and
        1 for cancer. Presumably ``images`` is ``(N, 224, 224, 3)`` with the
        loader's default colour mode - TODO confirm.
    """
    images = []
    labels = []

    def _load_scaled(path):
        # Resize on load, then scale pixel values by 1/255.
        img = image.load_img(path, target_size=(224, 224))
        return image.img_to_array(img) / 255.0

    # Process non-cancer images (label 0)
    image_names_non_cancer = list(image_names_non_cancer)
    if image_names_non_cancer:
        # Walk the tree once and index file name -> first path seen, instead of
        # re-walking the whole tree for every image name. setdefault keeps the
        # first directory encountered, matching a per-name os.walk search.
        path_by_name = {}
        for folder, _subdirs, file_names in os.walk(root_folder_non_cancer):
            for file_name in file_names:
                path_by_name.setdefault(file_name, os.path.join(folder, file_name))

        for image_name in image_names_non_cancer:
            image_path = path_by_name.get(image_name)
            if image_path:
                images.append(_load_scaled(image_path))
                labels.append(0)  # Non-cancer label

    # Process cancer images (label 1)
    for image_path in image_paths_cancer:
        images.append(_load_scaled(image_path))
        labels.append(1)  # Cancer label

    return np.array(images), np.array(labels)

# Load the per-image metadata tables for the non-cancer and cancer groups.
# Both CSV paths are relative to the notebook's working directory.
matched_df = pd.read_csv("matched_noncancer.csv")  # Replace with the correct path
merged_cancer = pd.read_csv("matched_cancer.csv")  # Replace with the correct path
# Root directories passed to prepare_data.
# NOTE(review): these are absolute, machine-specific paths; the notebook will not
# run elsewhere without editing them.
root_folder_non_cancer = r'C:\Users\kenza.chenni\Desktop\acıbademsana\non cancer'
root_folder_cancer = r'C:\Users\kenza.chenni\Desktop\acıbademsana\cancer'

# Non-cancer images are identified by file name only, cancer images by the path
# stored in the 'AbsolutePath' column. Values are cast to str before use.
image_names_non_cancer = matched_df['InputFileName'].astype(str).tolist()  # From matched_df
image_paths_cancer = merged_cancer['AbsolutePath'].astype(str).tolist()  # From merged_cancer

# Prepare image data and labels.
# NOTE(review): prepare_data skips non-cancer names it cannot find on disk, so
# len(images) may be smaller than len(matched_df) + len(merged_cancer) - confirm
# before pairing rows with any per-patient table.
images, labels = prepare_data(image_names_non_cancer, image_paths_cancer, root_folder_non_cancer, root_folder_cancer)

# Flatten every image to a single row: result is 2-D with one row per image.
images = images.reshape(images.shape[0], -1)  # Flatten the images to 1D arrays (if required)

# Build the tabular feature frame that is later scaled and appended to the
# flattened image pixels.
# NOTE(review): `non_cancer_data` and `cancer_data` are not defined in any cell
# shown here; this only runs if they are left over in the kernel. Presumably
# they correspond to `matched_df` / `merged_cancer` - confirm and define them
# explicitly so the notebook survives Restart & Run All.
clinical_features = non_cancer_data[['Ca_LVI', 'Ca_Nekroz', 'Ca_HG', 'Ca_NG', 'Ca_CerSinir', 'Ca_Mfokal', 
                                     'Ca_CERB2', 'Ca_ER', 'Ca_PR', 'Ca_Ki67']]  # Your clinical data columns

# NOTE(review): 'Ca_PR' and 'Ca_Ki67' are selected here as well as above, so the
# combined frame will carry duplicate column labels. Columns such as
# 'AbsolutePath', 'RelativePath', 'ViewName' and 'Note' look non-numeric -
# confirm they are meant to reach the scaler.
ultrasound_features = cancer_data[['Ca_PR', 'Ca_Ki67', 'AbsolutePath', 'RelativePath', 'InputFileName_y', 
                                   'PatientBirthDate_y', 'PatientSex_y', 'PatientAge_y', 'ViewName', 'Note']]  # Adjust as necessary

# Place the two frames side by side (axis=1), aligned on their index labels.
# NOTE(review): this pairs a non-cancer row with a cancer row that shares the
# same index label, and the row count is not tied to `images` - confirm intent.
combined_features = pd.concat([clinical_features, ultrasound_features], axis=1)



  matched_df = pd.read_csv("matched_noncancer.csv")  # Replace with the correct path


KeyboardInterrupt: 

In [None]:
# Split row indices first so the scaler below can be fitted on training rows
# only; fitting it on every row lets test-set statistics leak into training.
# With the same sample count, test_size and random_state, splitting the indices
# selects the same rows that splitting `features` directly would.
train_idx, test_idx = train_test_split(np.arange(len(labels)), test_size=0.2, random_state=42)

# Standardize features (Z-score normalization) using training-row statistics
scaler = StandardScaler()
scaler.fit(combined_features.iloc[train_idx])
combined_features_scaled = scaler.transform(combined_features)

# Merge image features with clinical and ultrasound features
features = np.concatenate([images, combined_features_scaled], axis=1)

# Split data into train and test sets
X_train, X_test = features[train_idx], features[test_idx]
y_train, y_test = labels[train_idx], labels[test_idx]

# Apply LASSO for feature selection
lasso = LassoCV(cv=5, random_state=42)
lasso.fit(X_train, y_train)

# Get the indices of non-zero coefficients
lasso_selected_features = np.flatnonzero(lasso.coef_)
if lasso_selected_features.size == 0:
    # LASSO can shrink every coefficient to zero; a zero-column matrix cannot be
    # used to train the classifier, so fall back to keeping all features.
    lasso_selected_features = np.arange(X_train.shape[1])

# Reduced feature set after LASSO
X_train_lasso = X_train[:, lasso_selected_features]
X_test_lasso = X_test[:, lasso_selected_features]

# LightGBM Model (binary objective, evaluated with binary classification error)
lgbm_model = lgb.LGBMClassifier(boosting_type='gbdt', objective='binary', metric='binary_error')



In [None]:
# Exhaustive search over a small LightGBM grid with 5-fold cross-validation
param_grid = {'num_leaves': [31, 50], 'learning_rate': [0.05, 0.1], 'n_estimators': [100, 200]}
grid_search = GridSearchCV(lgbm_model, param_grid, cv=5)
grid_search.fit(X_train_lasso, y_train)

# Estimator refitted with the best parameter combination found by the search
best_model = grid_search.best_estimator_

# Hard predictions plus the predicted probability of class 1 on the test rows
y_pred = best_model.predict(X_test_lasso)
_test_probabilities = best_model.predict_proba(X_test_lasso)
y_pred_proba = _test_probabilities[:, 1]

# Test-set metrics: accuracy/confusion/report use hard labels, AUC uses probabilities
accuracy = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_proba)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

# Print each metric under its heading, in a fixed order
for _heading, _metric in (
    ("Accuracy:", accuracy),
    ("ROC AUC:", roc_auc),
    ("Confusion Matrix:\n", conf_matrix),
    ("Classification Report:\n", class_report),
):
    print(_heading, _metric)
