In [None]:
import pandas as pd
from PIL import Image
import numpy as np
import shutil
import os

### Get Image information

In [None]:
def image_information(image_path):
    """Summarise the grayscale content of one image with four statistics.

    The file is opened with PIL, converted to 8-bit grayscale (mode 'L') and
    turned into a NumPy array; all statistics are computed on that array.

    Parameters
    ----------
    image_path : str or path-like
        Path of the image file to analyse.

    Returns
    -------
    tuple
        ``(entropy, variance, mean_intensity, contrast_difference)``:

        * entropy -- Shannon entropy (bits) of the gray-level distribution,
          between 0 (single gray level) and 8 (all 256 levels equally likely).
        * variance -- variance of the pixel values.
        * mean_intensity -- mean pixel value.
        * contrast_difference -- 95th percentile minus 5th percentile of the
          pixel values.

    Raises
    ------
    FileNotFoundError
        Propagated from ``Image.open`` when ``image_path`` does not exist.
    """
    # The context manager releases the file handle as soon as the pixels have
    # been copied into the array (matters when looping over many images).
    with Image.open(image_path) as img:
        img_array = np.array(img.convert('L'))

    # One unit-width bin per gray level: with range (0, 256) the integer value
    # k falls into bin k. Counts are normalised by their total so the values
    # are true probabilities summing to 1 (density=True over range (0, 255)
    # would instead return densities that sum to 256/255).
    counts, _ = np.histogram(img_array.ravel(), bins=256, range=(0, 256))
    probabilities = counts[counts > 0] / counts.sum()
    entropy = -np.sum(probabilities * np.log2(probabilities))

    variance = np.var(img_array)
    mean_intensity = np.mean(img_array)

    # Percentile spread is a contrast measure that ignores the extreme 5% tails.
    contrast_difference = np.percentile(img_array, 95) - np.percentile(img_array, 5)

    return entropy, variance, mean_intensity, contrast_difference

### Classify Image information

In [None]:
def classify_image(entropy, variance, mean_intensity, contrast_difference, *,
                   entropy_threshold=5.0,
                   variance_threshold=200.0,
                   intensity_low_threshold=30,
                   intensity_high_threshold=250,
                   contrast_difference_threshold=20):
    """Label an image as 'high' or 'low' information from its statistics.

    An image is 'high' when EITHER
      * entropy and variance both exceed their thresholds and the mean
        intensity lies strictly between the low and high intensity bounds, OR
      * the contrast difference reaches its threshold.
    Otherwise it is 'low'.

    Parameters
    ----------
    entropy, variance, mean_intensity, contrast_difference : float
        The four statistics, in the order returned by ``image_information``.
    entropy_threshold, variance_threshold : float, keyword-only
        Exclusive lower bounds for entropy and variance.
    intensity_low_threshold, intensity_high_threshold : float, keyword-only
        Exclusive bounds for the mean intensity.
    contrast_difference_threshold : float, keyword-only
        Inclusive lower bound for the contrast difference.

    Returns
    -------
    str
        ``'high'`` or ``'low'``.

    NOTE(review): sufficient contrast alone yields 'high' because ``and`` binds
    tighter than ``or``. The grouping below only makes that existing behaviour
    explicit -- confirm it is the intended rule.
    """
    has_rich_content = (
        entropy > entropy_threshold
        and variance > variance_threshold
        and intensity_low_threshold < mean_intensity < intensity_high_threshold
    )
    if has_rich_content or contrast_difference >= contrast_difference_threshold:
        return 'high'
    return 'low'

### Data Loader
Load the data file into the notebook using pandas.

In [None]:
# Dataset locations used by this notebook (relative paths).
#
# siemens_amberg_labeled
# CSV read into `df` by the loader cell, and the folder prefix (note the
# trailing "/") to which "<imageIndex>.png" is appended in the analysis loop.
File_NAME_SIEMENS = "data/siemens_data_original.csv"
IMAGE_PATH_SIEMENS = "data/siemens_amberg_labeled/exactInspImage/"

# sehoaoi_labeled
# Counterpart paths for the second dataset; not referenced by the cells below
# as written -- swap them in by hand to analyse that dataset instead.
File_NAME_SEHOAOI = "data/sehoaoi_data_original.csv"
IMAGE_PATH_SEHOAOI = "data/sehoaoi_labeled/exactInspImage/"

# Prefix prepended to the CSV file name when loading; empty means the paths
# above are used as-is.
DATA_PATH= ""


In [None]:
# Load the Siemens table; DATA_PATH is a plain string prefix in front of the file name.
df = pd.read_csv(f"{DATA_PATH}{File_NAME_SIEMENS}")

# Column names, dtypes and non-null counts as a quick sanity check.
df.info()

### Data Analysis


In [None]:
# Result containers; they are rebuilt from scratch each time this cell runs.
high_info_files, low_info_files = [], []
contrast_difference_list = []  # contrast spread per low-information image, parallel to low_info_files
high_info_index = []           # imageIndex values of the high-information images

for image_index in df['imageIndex']:
    file_name = f"{IMAGE_PATH_SIEMENS}{image_index}.png"  # Change the Image path
    try:
        metrics = image_information(file_name)
        if classify_image(*metrics) == 'high':
            high_info_index.append(image_index)
            high_info_files.append(file_name)
        else:
            low_info_files.append(file_name)
            # The contrast difference is the last of the four statistics.
            contrast_difference_list.append(metrics[-1])
    except FileNotFoundError:
        print(f"File not found: {file_name}")
    except Exception as e:
        # Best effort: report the unreadable image and carry on with the rest.
        print(f"An error occurred while processing {file_name}: {e}")

print("High information files number:", len(high_info_files))
print("Low information files number:", len(low_info_files))

### Copy high- and low-information images into new folders

In [None]:
def copy_files(file_list, target_folder):
    """Copy every existing file in ``file_list`` into ``target_folder``.

    Parameters
    ----------
    file_list : iterable of str
        Paths of the files to copy. Entries that are not existing regular
        files are reported with a printed message and skipped.
    target_folder : str
        Destination directory. It is created (including missing parents) when
        absent, even if ``file_list`` is empty.

    Returns
    -------
    None

    Notes
    -----
    Files keep their base name, so two sources with the same base name
    overwrite each other in the target. ``shutil.copy2`` also copies file
    metadata such as timestamps.
    """
    # exist_ok=True replaces the separate exists() check: there is no window
    # in which the folder can appear between the check and the creation.
    os.makedirs(target_folder, exist_ok=True)

    for file_path in file_list:
        if not os.path.isfile(file_path):
            print(f"The file {file_path} does not exist.")
            continue
        target_file_path = os.path.join(target_folder, os.path.basename(file_path))
        shutil.copy2(file_path, target_file_path)

In [None]:
# Destination folders live inside the source image folder. IMAGE_PATH_SIEMENS
# already ends with "/", so os.path.join is used to avoid the doubled
# separator ("exactInspImage//high") that plain concatenation produced.
HIGH_IMAGE_PATH = os.path.join(IMAGE_PATH_SIEMENS, "high")
LOW_IMAGE_PATH = os.path.join(IMAGE_PATH_SIEMENS, "low")

# Duplicate each classified image into its bucket; the originals stay in place.
copy_files(high_info_files, HIGH_IMAGE_PATH)
copy_files(low_info_files, LOW_IMAGE_PATH)

### TODO: clean up the code below

In [None]:
# One row per low-information image: its path and its 5th-95th percentile contrast spread.
low_info_df = pd.DataFrame({
    'File': low_info_files,
    'Contrast Difference': contrast_difference_list,
})

# Save DataFrame to CSV
# NOTE(review): the file name says "sdehoaoi" while the cells above read the
# Siemens CSV and image folder -- confirm which dataset this export belongs to.
csv_file_path = 'low_info_files_sdehoaoi.csv'
low_info_df.to_csv(csv_file_path, index=False)

print(f"DataFrame saved to {csv_file_path}")

### Test on low

In [None]:
# Spot check: recompute the statistics for one image copied into the "low" folder.
# The image number is hard-coded and must exist in that folder.
file_name_low = f"{LOW_IMAGE_PATH}/4494.png"
entropy, variance, mean_intensity, contrast_difference = image_information(file_name_low)

# One value per line: path, entropy, variance, mean intensity, contrast difference.
print(file_name_low, entropy, variance, mean_intensity, contrast_difference, sep="\n")

### Test on high

In [None]:
# Spot check: recompute the statistics for one image copied into the "high" folder.
# The image number is hard-coded and must exist in that folder.
file_name_high = f"{HIGH_IMAGE_PATH}/54.png"
entropy, variance, mean_intensity, contrast_difference = image_information(file_name_high)

# One value per line: path, entropy, variance, mean intensity, contrast difference.
print(file_name_high, entropy, variance, mean_intensity, contrast_difference, sep="\n")

### Delete the copied high/low images

In [None]:
def delete_files_in_folder(folder_path):
    """Empty ``folder_path`` while keeping the folder itself.

    Regular files and symbolic links directly inside the folder are unlinked;
    sub-directories are removed together with their contents. A missing
    folder is reported with a printed message and nothing else happens.
    A failure on one entry is printed and does not stop the remaining
    deletions.

    Parameters
    ----------
    folder_path : str
        Directory whose contents should be removed.

    Returns
    -------
    None
    """
    if not os.path.exists(folder_path):
        print(f"The folder {folder_path} does not exist.")
    else:
        for entry_name in os.listdir(folder_path):
            entry_path = os.path.join(folder_path, entry_name)
            try:
                # Links are unlinked rather than followed, so a link pointing
                # at a directory never triggers a recursive delete.
                if os.path.islink(entry_path) or os.path.isfile(entry_path):
                    os.unlink(entry_path)
                elif os.path.isdir(entry_path):
                    shutil.rmtree(entry_path)
            except Exception as err:
                print(f"Failed to delete {entry_path}. Reason: {err}")

In [None]:
# CAUTION: empties the high/low folders filled by the copy cell above. With
# "Run All" this executes right after the spot checks -- skip this cell when
# the copied images should be kept.
for copied_folder in (HIGH_IMAGE_PATH, LOW_IMAGE_PATH):
    delete_files_in_folder(copied_folder)

In [None]:
# Keep only the rows whose imageIndex was classified as high information.
high_info_mask = df['imageIndex'].isin(high_info_index)
filtered_df = df.loc[high_info_mask]

# NOTE(review): the output name says "sehoaoi" while `df` was loaded from the
# Siemens CSV -- confirm which dataset this export belongs to.
filtered_df.to_csv('sehoaoi_labeled_original_high.csv', index=False)
