<a href="https://colab.research.google.com/github/shreya-kamra/Breast-Cancer-Classification-based-on-Hybrid-Learning-using-Texture-Features-of-Ultrasound-Images/blob/main/texture_features.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
import cv2
import skimage.feature
import pandas as pd
from google.colab import drive
import warnings
warnings.filterwarnings("ignore")

# Mount Google Drive
drive.mount('/content/drive')

# Define path to the dataset folder
dataset_folder = '/content/drive/My Drive/Dataset_BUSI_with_GT/'

# Function to extract texture features from an image
def extract_texture_features(image_path):
    # Read in the image and convert it to grayscale
    image = cv2.imread(image_path)
    image_gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    # Calculate co-occurrence matrix
    co_matrix = skimage.feature.greycomatrix(image_gray, [5], [0], levels=256, symmetric=True, normed=True)

    # Calculate texture features from the co-occurrence matrix
    contrast = skimage.feature.greycoprops(co_matrix, 'contrast')[0, 0]
    correlation = skimage.feature.greycoprops(co_matrix, 'correlation')[0, 0]
    energy = skimage.feature.greycoprops(co_matrix, 'energy')[0, 0]
    homogeneity = skimage.feature.greycoprops(co_matrix, 'homogeneity')[0, 0]

    return contrast, correlation, energy, homogeneity

# Create a DataFrame to store texture features and target labels
#texture_features_df = pd.DataFrame(columns=['Image', 'Contrast', 'Correlation', 'Energy', 'Homogeneity', 'Label'])

#Create a list to store texture features and target labels
texture_features_list = []

# Loop through subfolders
subfolders = ['malignant', 'normal', 'benign']
for label, subfolder in enumerate(subfolders):
    subfolder_path = os.path.join(dataset_folder, subfolder)

    # Loop through images in the subfolder
    for filename in os.listdir(subfolder_path):
        # Skip files with '_mask' in their names
         if filename.endswith(".png") and '_mask' not in filename:
            image_path = os.path.join(subfolder_path, filename)
            contrast, correlation, energy, homogeneity = extract_texture_features(image_path)
            texture_features_list.append((filename, contrast, correlation, energy, homogeneity, label))


texture_features_df = pd.DataFrame(texture_features_list, columns=['Image', 'Contrast', 'Correlation', 'Energy', 'Homogeneity', 'Label'])

# Save the DataFrame to CSV
csv_path = '/content/drive/My Drive/texture_features_with_labels.csv'
texture_features_df.to_csv(csv_path, index=False)

print("Texture features with labels extracted and saved to:", csv_path)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
/usr/local/lib/python3.10/dist-packages/skimage/feature/__init__.py:35: skimage_deprecation: Function ``greycomatrix`` is deprecated and will be removed in version 1.0. Use ``skimage.feature.graycomatrix`` instead.
  removed_version='1.0')
/usr/local/lib/python3.10/dist-packages/skimage/feature/__init__.py:42: skimage_deprecation: Function ``greycoprops`` is deprecated and will be removed in version 1.0. Use ``skimage.feature.graycoprops`` instead.
  removed_version='1.0')
/usr/local/lib/python3.10/dist-packages/skimage/feature/__init__.py:42: skimage_deprecation: Function ``greycoprops`` is deprecated and will be removed in version 1.0. Use ``skimage.feature.graycoprops`` instead.
  removed_version='1.0')
/usr/local/lib/python3.10/dist-packages/skimage/feature/__init__.py:42: skimage_deprecation: Function ``greycoprops`` is deprecated and will be removed in version 1.0. Use ``skimage.feature.graycoprops`` instead.
  remo

Texture features with labels extracted and saved to: /content/drive/My Drive/texture_features_with_labels.csv


In [None]:
import cv2
from skimage import morphology, measure
from skimage.measure import label, regionprops
import pandas as pd
import os

# Define the function to extract calcification features
def extract_calcification_features(mask):
    # Thresholding to identify calcifications
    calcification_mask = mask > 0

    # Morphological operations to enhance and clean the features
    calcification_mask = morphology.binary_closing(calcification_mask, morphology.disk(5))
    calcification_mask = morphology.binary_opening(calcification_mask, morphology.disk(3))

    label_image = label(calcification_mask)

    calcification_regions = regionprops(label_image)

    calcification_features = []
    if len(calcification_regions) > 0:
        for region in calcification_regions:
            area = region.area
            centroid = region.centroid
            perimeter_value = measure.perimeter(calcification_mask)
            calcification_features.append({
                'Area': area,
                'Centroid_x': centroid[0],  # Separate centroid x and y coordinates
                'Centroid_y': centroid[1],
                'Perimeter': perimeter_value,
            })
    else:
        # If no calcifications detected, return NaN values for the features
        calcification_features = [{
            'Area': float('NaN'),
            'Centroid_x': float('NaN'),
            'Centroid_y': float('NaN'),
            'Perimeter': float('NaN'),
        }]
    return calcification_features

existing_csv_path = '/content/drive/MyDrive/texture_features_with_labels.csv'
existing_df = pd.read_csv(existing_csv_path)

dataset_folders = ['/content/drive/MyDrive/Dataset_BUSI_with_GT/malignant', '/content/drive/MyDrive/Dataset_BUSI_with_GT/normal', '/content/drive/MyDrive/Dataset_BUSI_with_GT/benign']
for dataset_folder in dataset_folders:
    for filename in os.listdir(dataset_folder):
        if filename.endswith("_mask.png"):
            mask_path = os.path.join(dataset_folder, filename)
            mask = cv2.imread(mask_path, cv2.IMREAD_GRAYSCALE)

            features = extract_calcification_features(mask)

            # Append features to the existing DataFrame
            image_name = filename.replace("_mask.png", ".png")  # Get the corresponding image name
            existing_df_row = existing_df[existing_df['Image'] == image_name]  # Find the row corresponding to the image
            existing_df_row_index = existing_df_row.index[0]
            if len(features) > 0:
                # If features are found, append them to the DataFrame
                for i, feature in enumerate(features):
                    existing_df.at[existing_df_row_index + i, 'Calcification_Area'] = feature['Area']
                    existing_df.at[existing_df_row_index + i, 'Calcification_Centroid_x'] = feature['Centroid_x']
                    existing_df.at[existing_df_row_index + i, 'Calcification_Centroid_y'] = feature['Centroid_y']
                    existing_df.at[existing_df_row_index + i, 'Calcification_Perimeter'] = feature['Perimeter']
            else:
                # If no features found, fill NaN values
                existing_df.at[existing_df_row_index, 'Calcification_Area'] = float('NaN')
                existing_df.at[existing_df_row_index, 'Calcification_Centroid_x'] = float('NaN')
                existing_df.at[existing_df_row_index, 'Calcification_Centroid_y'] = float('NaN')
                existing_df.at[existing_df_row_index, 'Calcification_Perimeter'] = float('NaN')

# Save the updated DataFrame to CSV
existing_df.to_csv(existing_csv_path, index=False)

print("Calcification features have been appended to the existing CSV file:", existing_csv_path)



Calcification features have been appended to the existing CSV file: /content/drive/MyDrive/texture_features_with_labels.csv


In [None]:
existing_df.head(600)

Unnamed: 0,Image,Contrast,Correlation,Energy,Homogeneity,Label,Calcification_Area,Calcification_Centroid_x,Calcification_Centroid_y,Calcification_Perimeter
0,malignant (1).png,168.528654,0.977749,0.011414,0.109667,0,91017.0,275.014305,223.585539,1387.259018
1,malignant (106).png,259.942381,0.956594,0.032470,0.147085,0,21888.0,228.334750,187.371208,840.097546
2,malignant (103).png,181.953234,0.983374,0.012843,0.117609,0,53435.0,226.765229,239.500066,1139.778787
3,malignant (100).png,187.100132,0.973782,0.013606,0.127457,0,15940.0,130.126913,277.603513,646.600072
4,malignant (105).png,247.273520,0.956070,0.012725,0.111650,0,7351.0,105.212080,526.364848,390.492424
...,...,...,...,...,...,...,...,...,...,...
595,benign (326).png,126.162526,0.983248,0.315737,0.422720,2,47091.0,197.012847,267.388461,843.193001
596,benign (330).png,130.490677,0.970565,0.054548,0.202786,2,51972.0,206.534057,332.419630,921.595021
597,benign (327).png,194.793971,0.970938,0.037299,0.153289,2,34285.0,163.785096,137.835817,703.428499
598,benign (33).png,244.623024,0.961214,0.037430,0.145697,2,15492.0,114.848632,133.765879,494.717821


In [None]:
import cv2
from skimage import util, color, morphology
from skimage.feature import greycomatrix, greycoprops
import pandas as pd
import os
import numpy as np


# Define the function to extract echo pattern features
def extract_echo_pattern_features(image, mask):
    gray_image = util.img_as_ubyte(color.rgb2gray(image))

    masked_gray_image = cv2.bitwise_and(gray_image, gray_image, mask=mask)

    distances = [1, 2, 3]
    angles = [0, np.pi/4, np.pi/2, 3*np.pi/4]
    co_matrix = greycomatrix(masked_gray_image, distances=distances, angles=angles, symmetric=True, normed=True)
    energy = greycoprops(co_matrix, prop='energy').ravel()[0]
    homogeneity = greycoprops(co_matrix, prop='homogeneity').ravel()[0]

    return {
        'Echo_Pattern_Energy': energy,
        'Echo_Pattern_Homogeneity': homogeneity,
    }

existing_csv_path = '/content/drive/MyDrive/texture_features_with_labels.csv'
existing_df = pd.read_csv(existing_csv_path)

dataset_folders = ['/content/drive/MyDrive/Dataset_BUSI_with_GT/malignant', '/content/drive/MyDrive/Dataset_BUSI_with_GT/normal', '/content/drive/MyDrive/Dataset_BUSI_with_GT/benign']
for dataset_folder in dataset_folders:
    for filename in os.listdir(dataset_folder):
        if filename.endswith(".png"):
            image_path = os.path.join(dataset_folder, filename)
            image = cv2.imread(image_path)
            image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

            mask_filename = filename.replace(".png", "_mask.png")
            mask_path = os.path.join(dataset_folder, mask_filename)
            mask = cv2.imread(mask_path, cv2.IMREAD_GRAYSCALE)

            features = extract_echo_pattern_features(image_rgb, mask)

            existing_df_row = existing_df[existing_df['Image'] == filename]
            if not existing_df_row.empty:
                existing_df_row_index = existing_df_row.index[0]
                existing_df.at[existing_df_row_index, 'Echo_Pattern_Energy'] = features['Echo_Pattern_Energy']
                existing_df.at[existing_df_row_index, 'Echo_Pattern_Homogeneity'] = features['Echo_Pattern_Homogeneity']

existing_df.to_csv(existing_csv_path, index=False)

print("Echo pattern features have been appended to the existing CSV file:", existing_csv_path)


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
/usr/local/lib/python3.10/dist-packages/skimage/feature/__init__.py:42: skimage_deprecation: Function ``greycoprops`` is deprecated and will be removed in version 1.0. Use ``skimage.feature.graycoprops`` instead.
  removed_version='1.0')
/usr/local/lib/python3.10/dist-packages/skimage/feature/__init__.py:35: skimage_deprecation: Function ``greycomatrix`` is deprecated and will be removed in version 1.0. Use ``skimage.feature.graycomatrix`` instead.
  removed_version='1.0')
/usr/local/lib/python3.10/dist-packages/skimage/feature/__init__.py:42: skimage_deprecation: Function ``greycoprops`` is deprecated and will be removed in version 1.0. Use ``skimage.feature.graycoprops`` instead.
  removed_version='1.0')
/usr/local/lib/python3.10/dist-packages/skimage/feature/__init__.py:42: skimage_deprecation: Function ``greycoprops`` is deprecated and will be removed in version 1.0. Use ``skimage.feature.graycoprops`` instead.
  remo

Echo pattern features have been appended to the existing CSV file: /content/drive/MyDrive/texture_features_with_labels.csv


In [None]:
existing_df.head(600)

Unnamed: 0,Image,Contrast,Correlation,Energy,Homogeneity,Label,Calcification_Area,Calcification_Centroid_x,Calcification_Centroid_y,Calcification_Perimeter,Echo_Pattern_Energy,Echo_Pattern_Homogeneity
0,malignant (1).png,168.528654,0.977749,0.011414,0.109667,0,91017.0,275.014305,223.585539,1387.259018,0.659144,0.768960
1,malignant (106).png,259.942381,0.956594,0.032470,0.147085,0,21888.0,228.334750,187.371208,840.097546,0.927736,0.946935
2,malignant (103).png,181.953234,0.983374,0.012843,0.117609,0,53435.0,226.765229,239.500066,1139.778787,0.802370,0.866158
3,malignant (100).png,187.100132,0.973782,0.013606,0.127457,0,15940.0,130.126913,277.603513,646.600072,0.939390,0.956880
4,malignant (105).png,247.273520,0.956070,0.012725,0.111650,0,7351.0,105.212080,526.364848,390.492424,0.971694,0.978992
...,...,...,...,...,...,...,...,...,...,...,...,...
595,benign (326).png,126.162526,0.983248,0.315737,0.422720,2,47091.0,197.012847,267.388461,843.193001,0.902887,0.981167
596,benign (330).png,130.490677,0.970565,0.054548,0.202786,2,51972.0,206.534057,332.419630,921.595021,0.912734,0.956862
597,benign (327).png,194.793971,0.970938,0.037299,0.153289,2,34285.0,163.785096,137.835817,703.428499,0.916345,0.970904
598,benign (33).png,244.623024,0.961214,0.037430,0.145697,2,15492.0,114.848632,133.765879,494.717821,0.941429,0.978119


In [None]:
existing_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 780 entries, 0 to 779
Data columns (total 12 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Image                     780 non-null    object 
 1   Contrast                  780 non-null    float64
 2   Correlation               780 non-null    float64
 3   Energy                    780 non-null    float64
 4   Homogeneity               780 non-null    float64
 5   Label                     780 non-null    int64  
 6   Calcification_Area        647 non-null    float64
 7   Calcification_Centroid_x  647 non-null    float64
 8   Calcification_Centroid_y  647 non-null    float64
 9   Calcification_Perimeter   647 non-null    float64
 10  Echo_Pattern_Energy       780 non-null    float64
 11  Echo_Pattern_Homogeneity  780 non-null    float64
dtypes: float64(10), int64(1), object(1)
memory usage: 73.2+ KB


In [None]:
import cv2
from skimage import util, color
import numpy as np
import pandas as pd
import os

# Define the function to extract shape features
def extract_shape_features(image, mask):
    gray_image = util.img_as_ubyte(color.rgb2gray(image))
    masked_gray_image = cv2.bitwise_and(gray_image, gray_image, mask=mask)

    # Thresholding to identify contours
    _, binary_image = cv2.threshold(masked_gray_image, 0, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)

    # Find contours
    contours, _ = cv2.findContours(binary_image, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    shape_features = []
    for contour in contours:
        perimeter_value = cv2.arcLength(contour, True)
        area = cv2.contourArea(contour)
        if area > 0:
            circularity = (4 * np.pi * area) / (perimeter_value ** 2)
            shape_features.append({
                'Perimeter': perimeter_value,
                'Area': area,
                'Circularity': circularity,
            })

    return shape_features

existing_csv_path = '/content/drive/MyDrive/texture_features_with_labels.csv'
existing_df = pd.read_csv(existing_csv_path)

dataset_folders = ['/content/drive/MyDrive/Dataset_BUSI_with_GT/malignant', '/content/drive/MyDrive/Dataset_BUSI_with_GT/normal', '/content/drive/MyDrive/Dataset_BUSI_with_GT/benign']
for dataset_folder in dataset_folders:
    for filename in os.listdir(dataset_folder):
        if filename.endswith(".png"):
            image_path = os.path.join(dataset_folder, filename)
            image = cv2.imread(image_path)
            image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

            mask_filename = filename.replace(".png", "_mask.png")
            mask_path = os.path.join(dataset_folder, mask_filename)
            mask = cv2.imread(mask_path, cv2.IMREAD_GRAYSCALE)

            features = extract_shape_features(image_rgb, mask)

            existing_df_row = existing_df[existing_df['Image'] == filename]
            if not existing_df_row.empty:
                existing_df_row_index = existing_df_row.index[0]
                if len(features) > 0:
                    for i, feature in enumerate(features):
                        existing_df.at[existing_df_row_index + i, 'Shape_Perimeter'] = feature['Perimeter']
                        existing_df.at[existing_df_row_index + i, 'Shape_Area'] = feature['Area']
                        existing_df.at[existing_df_row_index + i, 'Shape_Circularity'] = feature['Circularity']
                else:
                    existing_df.at[existing_df_row_index, 'Shape_Perimeter'] = float('NaN')
                    existing_df.at[existing_df_row_index, 'Shape_Area'] = float('NaN')
                    existing_df.at[existing_df_row_index, 'Shape_Circularity'] = float('NaN')

existing_df.to_csv(existing_csv_path, index=False)

print("Shape features have been appended to the existing CSV file:", existing_csv_path)


Shape features have been appended to the existing CSV file: /content/drive/MyDrive/texture_features_with_labels.csv


In [None]:
existing_df.head(831)

Unnamed: 0,Image,Contrast,Correlation,Energy,Homogeneity,Label,Calcification_Area,Calcification_Centroid_x,Calcification_Centroid_y,Calcification_Perimeter,Echo_Pattern_Energy,Echo_Pattern_Homogeneity,Shape_Perimeter,Shape_Area,Shape_Circularity
0,malignant (1).png,168.528654,0.977749,0.011414,0.109667,0.0,91017.0,275.014305,223.585539,1387.259018,0.659144,0.768960,1416.288445,90327.0,0.565880
1,malignant (106).png,259.942381,0.956594,0.032470,0.147085,0.0,21888.0,228.334750,187.371208,840.097546,0.927736,0.946935,29.899495,29.5,0.414671
2,malignant (103).png,181.953234,0.983374,0.012843,0.117609,0.0,53435.0,226.765229,239.500066,1139.778787,0.802370,0.866158,8.828427,4.0,0.644916
3,malignant (100).png,187.100132,0.973782,0.013606,0.127457,0.0,15940.0,130.126913,277.603513,646.600072,0.939390,0.956880,6.828427,1.0,0.269506
4,malignant (105).png,247.273520,0.956070,0.012725,0.111650,0.0,7351.0,105.212080,526.364848,390.492424,0.971694,0.978992,456.534051,6967.0,0.420058
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
826,,,,,,,,,,,,,81.112698,40.0,0.076400
827,,,,,,,,,,,,,5.414214,1.5,0.643029
828,,,,,,,,,,,,,3.414214,0.5,0.539012
829,,,,,,,,,,,,,10.242641,4.5,0.539012


In [None]:
existing_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 831 entries, 0 to 830
Data columns (total 15 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Image                     780 non-null    object 
 1   Contrast                  780 non-null    float64
 2   Correlation               780 non-null    float64
 3   Energy                    780 non-null    float64
 4   Homogeneity               780 non-null    float64
 5   Label                     780 non-null    float64
 6   Calcification_Area        647 non-null    float64
 7   Calcification_Centroid_x  647 non-null    float64
 8   Calcification_Centroid_y  647 non-null    float64
 9   Calcification_Perimeter   647 non-null    float64
 10  Echo_Pattern_Energy       780 non-null    float64
 11  Echo_Pattern_Homogeneity  780 non-null    float64
 12  Shape_Perimeter           698 non-null    float64
 13  Shape_Area                698 non-null    float64
 14  Shape_Circ

In [None]:
new_df = existing_df.iloc[:780]

print(new_df)

                   Image    Contrast  Correlation    Energy  Homogeneity  \
0      malignant (1).png  168.528654     0.977749  0.011414     0.109667   
1    malignant (106).png  259.942381     0.956594  0.032470     0.147085   
2    malignant (103).png  181.953234     0.983374  0.012843     0.117609   
3    malignant (100).png  187.100132     0.973782  0.013606     0.127457   
4    malignant (105).png  247.273520     0.956070  0.012725     0.111650   
..                   ...         ...          ...       ...          ...   
775      benign (89).png  148.750534     0.978584  0.013503     0.126708   
776      benign (97).png  327.499456     0.944948  0.013604     0.113392   
777      benign (98).png  207.235856     0.975809  0.024137     0.137413   
778      benign (96).png  131.263517     0.983104  0.014620     0.135780   
779      benign (99).png  184.088464     0.962374  0.018835     0.136416   

     Label  Calcification_Area  Calcification_Centroid_x  \
0      0.0             9101

In [None]:
new_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 780 entries, 0 to 779
Data columns (total 15 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Image                     780 non-null    object 
 1   Contrast                  780 non-null    float64
 2   Correlation               780 non-null    float64
 3   Energy                    780 non-null    float64
 4   Homogeneity               780 non-null    float64
 5   Label                     780 non-null    float64
 6   Calcification_Area        647 non-null    float64
 7   Calcification_Centroid_x  647 non-null    float64
 8   Calcification_Centroid_y  647 non-null    float64
 9   Calcification_Perimeter   647 non-null    float64
 10  Echo_Pattern_Energy       780 non-null    float64
 11  Echo_Pattern_Homogeneity  780 non-null    float64
 12  Shape_Perimeter           647 non-null    float64
 13  Shape_Area                647 non-null    float64
 14  Shape_Circ

In [None]:
# Replace NaN values with 1
filled_df = new_df.fillna(1)

print(filled_df)


                   Image    Contrast  Correlation    Energy  Homogeneity  \
0      malignant (1).png  168.528654     0.977749  0.011414     0.109667   
1    malignant (106).png  259.942381     0.956594  0.032470     0.147085   
2    malignant (103).png  181.953234     0.983374  0.012843     0.117609   
3    malignant (100).png  187.100132     0.973782  0.013606     0.127457   
4    malignant (105).png  247.273520     0.956070  0.012725     0.111650   
..                   ...         ...          ...       ...          ...   
775      benign (89).png  148.750534     0.978584  0.013503     0.126708   
776      benign (97).png  327.499456     0.944948  0.013604     0.113392   
777      benign (98).png  207.235856     0.975809  0.024137     0.137413   
778      benign (96).png  131.263517     0.983104  0.014620     0.135780   
779      benign (99).png  184.088464     0.962374  0.018835     0.136416   

     Label  Calcification_Area  Calcification_Centroid_x  \
0      0.0             9101

In [None]:
filled_df.head(10)

Unnamed: 0,Image,Contrast,Correlation,Energy,Homogeneity,Label,Calcification_Area,Calcification_Centroid_x,Calcification_Centroid_y,Calcification_Perimeter,Echo_Pattern_Energy,Echo_Pattern_Homogeneity,Shape_Perimeter,Shape_Area,Shape_Circularity
0,malignant (1).png,168.528654,0.977749,0.011414,0.109667,0.0,91017.0,275.014305,223.585539,1387.259018,0.659144,0.76896,1416.288445,90327.0,0.56588
1,malignant (106).png,259.942381,0.956594,0.03247,0.147085,0.0,21888.0,228.33475,187.371208,840.097546,0.927736,0.946935,29.899495,29.5,0.414671
2,malignant (103).png,181.953234,0.983374,0.012843,0.117609,0.0,53435.0,226.765229,239.500066,1139.778787,0.80237,0.866158,8.828427,4.0,0.644916
3,malignant (100).png,187.100132,0.973782,0.013606,0.127457,0.0,15940.0,130.126913,277.603513,646.600072,0.93939,0.95688,6.828427,1.0,0.269506
4,malignant (105).png,247.27352,0.95607,0.012725,0.11165,0.0,7351.0,105.21208,526.364848,390.492424,0.971694,0.978992,456.534051,6967.0,0.420058
5,malignant (10).png,445.093925,0.902922,0.011986,0.097324,0.0,5470.0,73.649177,259.101645,316.551299,0.969367,0.974991,335.722869,5316.5,0.592753
6,malignant (102).png,131.19931,0.94364,0.055981,0.20469,0.0,63813.0,292.909611,381.686381,1350.170706,0.873585,0.932959,26.485281,9.0,0.161229
7,malignant (101).png,120.721238,0.952769,0.048082,0.198075,0.0,68632.0,241.896754,433.763361,1599.42554,0.86102,0.927267,9.656854,3.0,0.404259
8,malignant (104).png,204.317548,0.97382,0.011389,0.105749,0.0,47527.0,238.755192,279.635849,1323.217388,0.825661,0.880982,25.313708,19.0,0.372608
9,malignant (107).png,227.810673,0.96029,0.013308,0.120314,0.0,31848.0,188.893745,273.272199,933.092496,0.87763,0.913519,12.485281,7.0,0.564302


In [None]:
df1=filled_df.drop(columns=['Image'])

In [None]:
df1.head(10)

Unnamed: 0,Contrast,Correlation,Energy,Homogeneity,Label,Calcification_Area,Calcification_Centroid_x,Calcification_Centroid_y,Calcification_Perimeter,Echo_Pattern_Energy,Echo_Pattern_Homogeneity,Shape_Perimeter,Shape_Area,Shape_Circularity
0,168.528654,0.977749,0.011414,0.109667,0.0,91017.0,275.014305,223.585539,1387.259018,0.659144,0.76896,1416.288445,90327.0,0.56588
1,259.942381,0.956594,0.03247,0.147085,0.0,21888.0,228.33475,187.371208,840.097546,0.927736,0.946935,29.899495,29.5,0.414671
2,181.953234,0.983374,0.012843,0.117609,0.0,53435.0,226.765229,239.500066,1139.778787,0.80237,0.866158,8.828427,4.0,0.644916
3,187.100132,0.973782,0.013606,0.127457,0.0,15940.0,130.126913,277.603513,646.600072,0.93939,0.95688,6.828427,1.0,0.269506
4,247.27352,0.95607,0.012725,0.11165,0.0,7351.0,105.21208,526.364848,390.492424,0.971694,0.978992,456.534051,6967.0,0.420058
5,445.093925,0.902922,0.011986,0.097324,0.0,5470.0,73.649177,259.101645,316.551299,0.969367,0.974991,335.722869,5316.5,0.592753
6,131.19931,0.94364,0.055981,0.20469,0.0,63813.0,292.909611,381.686381,1350.170706,0.873585,0.932959,26.485281,9.0,0.161229
7,120.721238,0.952769,0.048082,0.198075,0.0,68632.0,241.896754,433.763361,1599.42554,0.86102,0.927267,9.656854,3.0,0.404259
8,204.317548,0.97382,0.011389,0.105749,0.0,47527.0,238.755192,279.635849,1323.217388,0.825661,0.880982,25.313708,19.0,0.372608
9,227.810673,0.96029,0.013308,0.120314,0.0,31848.0,188.893745,273.272199,933.092496,0.87763,0.913519,12.485281,7.0,0.564302


In [None]:
#import pandas as pd
#df = pd.read_csv('dataset.csv')

In [None]:
#df1=df.drop(columns=['Image'])

Random Forest

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, roc_auc_score
import numpy as np
import pandas as pd
import os

X = df1.drop(columns=['Label'])
y = df1['Label']

rng = np.random.default_rng(seed=42)  # Set a seed for reproducibility
permutation = rng.permutation(len(X))
X = X.iloc[permutation]
y = y.iloc[permutation]

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Normalize the input features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)

# Define the hyperparameters grid
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Initialize Random Forest classifier
clf = RandomForestClassifier(random_state=42)

# Perform grid search with 5-fold cross-validation
grid_search = GridSearchCV(clf, param_grid, cv=5, scoring='accuracy', n_jobs=-1,verbose=3)
grid_search.fit(X_train, y_train)

# Get the best model and its parameters
best_clf = grid_search.best_estimator_
best_params = grid_search.best_params_

# Train the best model on the entire training set
best_clf.fit(X_train, y_train)

# Make predictions on the test set
y_pred = best_clf.predict(X_test)

# Evaluate the classifier's performance
accuracy = accuracy_score(y_test, y_pred)
print("Best Parameters:", best_params)
print("Accuracy:", accuracy)

Fitting 5 folds for each of 81 candidates, totalling 405 fits
Best Parameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 150}
Accuracy: 0.8589743589743589


Support Vector Machine

In [None]:
from sklearn import svm
from sklearn.model_selection import GridSearchCV

param_grid = {
    'C': [0.01 ,0.1, 1, 10],
    'kernel': ['linear', 'rbf', 'poly', 'sigmoid'],
    'gamma': ['scale', 'auto', 0.1, 1, 10]
}

# Instantiate the SVM classifier
clf = svm.SVC(probability=True)

grid_search = GridSearchCV(clf, param_grid, cv=5, scoring='accuracy', n_jobs=-1,verbose=3)
grid_search.fit(X_train, y_train)

best_clf = grid_search.best_estimator_
best_params = grid_search.best_params_

best_clf.fit(X_train, y_train)

y_pred = best_clf.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print("Best Parameters:", best_params)
print("Accuracy:", accuracy)

Fitting 5 folds for each of 80 candidates, totalling 400 fits
Best Parameters: {'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}
Accuracy: 0.8589743589743589


XGBoost

In [None]:
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier
# to subpress warning
import warnings
warnings.filterwarnings("ignore")

# Define the XGBoost classifier
clf = XGBClassifier()

param_grid = {
    'n_estimators': [200, 250, 300],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.0001, 0.001, 0.1],
}

grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, cv=5, scoring='accuracy',verbose=3)

grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

best_clf = grid_search.best_estimator_
y_pred = best_clf.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print("Test Accuracy:", accuracy)

Fitting 5 folds for each of 27 candidates, totalling 135 fits
[CV 1/5] END learning_rate=0.0001, max_depth=3, n_estimators=200;, score=0.856 total time=   0.3s
[CV 2/5] END learning_rate=0.0001, max_depth=3, n_estimators=200;, score=0.776 total time=   0.3s
[CV 3/5] END learning_rate=0.0001, max_depth=3, n_estimators=200;, score=0.736 total time=   0.4s
[CV 4/5] END learning_rate=0.0001, max_depth=3, n_estimators=200;, score=0.776 total time=   0.3s
[CV 5/5] END learning_rate=0.0001, max_depth=3, n_estimators=200;, score=0.815 total time=   0.3s
[CV 1/5] END learning_rate=0.0001, max_depth=3, n_estimators=250;, score=0.856 total time=   0.4s
[CV 2/5] END learning_rate=0.0001, max_depth=3, n_estimators=250;, score=0.776 total time=   0.4s
[CV 3/5] END learning_rate=0.0001, max_depth=3, n_estimators=250;, score=0.752 total time=   0.4s
[CV 4/5] END learning_rate=0.0001, max_depth=3, n_estimators=250;, score=0.776 total time=   0.4s
[CV 5/5] END learning_rate=0.0001, max_depth=3, n_estima

In [None]:
from xgboost import XGBClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np

# Assuming X_train, X_test, y_train, and y_test are your training and testing data
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Instantiate XGBoost classifier
clf = XGBClassifier()

# Fit the model
clf.fit(X_train, y_train)

# Get feature importances
importances = clf.feature_importances_

# Sort feature importances in descending order
indices = np.argsort(importances)[::-1]

# Print the feature ranking
print("Feature ranking:")
for f in range(X_train.shape[1]):
    print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))

# Select features based on feature importances
threshold = 0.02  # Define your threshold for selecting features
sfm = SelectFromModel(clf, threshold=threshold)
X_train_selected = sfm.fit_transform(X_train, y_train)
X_test_selected = sfm.transform(X_test)

# Train XGBoost with selected features
clf_selected = XGBClassifier()
clf_selected.fit(X_train_selected, y_train)

# Make predictions
y_pred_selected = clf_selected.predict(X_test_selected)

# Evaluate accuracy
accuracy_selected = accuracy_score(y_test, y_pred_selected)
print("Accuracy with selected features:", accuracy_selected)

Feature ranking:
1. feature 10 (0.402026)
2. feature 7 (0.144457)
3. feature 6 (0.106930)
4. feature 9 (0.080017)
5. feature 4 (0.054959)
6. feature 0 (0.038057)
7. feature 5 (0.033552)
8. feature 12 (0.030447)
9. feature 8 (0.026632)
10. feature 1 (0.025687)
11. feature 3 (0.022236)
12. feature 2 (0.019752)
13. feature 11 (0.015247)
Accuracy with selected features: 0.8653846153846154


In [None]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

# Create a pipeline
pipeline = Pipeline([
    ('feature_selection', SelectFromModel(RandomForestClassifier())),
    ('classifier', XGBClassifier())
])

# Define the parameter grid for both feature selection and hyperparameter tuning
param_grid = {
    'feature_selection__threshold': [0.02],  # Threshold for feature selection
    'classifier__n_estimators': [200],
    'classifier__max_depth': [7],
    'classifier__learning_rate': [0.1]
}

# Perform GridSearchCV to find the best combination of hyperparameters and selected features
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Get the best parameters and model
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

# Evaluate the best model on the testing set
accuracy = best_model.score(X_test, y_test)
print("Best Parameters:", best_params)
print("Accuracy:", accuracy)

Best Parameters: {'classifier__learning_rate': 0.1, 'classifier__max_depth': 7, 'classifier__n_estimators': 200, 'feature_selection__threshold': 0.02}
Accuracy: 0.8717948717948718


LSTM

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM

# Reshape the input data for LSTM
X_train_lstm = X_train.to_numpy().reshape((X_train.shape[0], 1, X_train.shape[1]))
X_test_lstm = X_test.to_numpy().reshape((X_test.shape[0], 1, X_test.shape[1]))

# Define the LSTM model
model = Sequential([
    LSTM(64, input_shape=(X_train_lstm.shape[1], X_train_lstm.shape[2]), return_sequences=True),
    LSTM(32),
    Dense(3, activation='softmax')  # Three classes
])

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train_lstm, y_train, epochs=25, batch_size=32, validation_split=0.2)

# Evaluate the model
loss, accuracy = model.evaluate(X_test_lstm, y_test)
print("Test Accuracy:", accuracy)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25
Test Accuracy: 0.7179487347602844
