# 2. Data Cleaning

In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from pathlib import Path
import seaborn as sns
import plotly.express as px
import os
%matplotlib inline

# Ignore  the warnings
import warnings
warnings.filterwarnings('always')
warnings.filterwarnings('ignore')

# specifically for manipulating zipped images and getting numpy arrays of pixel values of images.
import cv2     

from PIL import Image

#image reader
from skimage import io

In [7]:
# accessing an image file from the dataset classes
sample_path='Flowers\Flowers\Babi\babi_1.jpg'
image = io.imread(sample_path)  

# plotting the original image
i, (im1) = plt.subplots(1)
i.set_figwidth(15)
im1.imshow(image)

FileNotFoundError: No such file: 'C:\Users\Admin\Desktop\Cancer_Diagnosis\Flowers\Flowers\Babiabi_1.jpg'

#### Skew Correction
While scanning or taking a picture of any document, it is possible that the scanned or captured image might be slightly skewed sometimes. For the better performance of the OCR, it is good to determine the skewness in image and correct it.

In [2]:
def deskew(image):
    co_ords = np.column_stack(np.where(image > 0))
    angle = cv2.minAreaRect(co_ords)[-1]
    if angle < -45:
        angle = -(90 + angle)
    else:
        angle = -angle
    (h, w) = image.shape[:2]
    center = (w // 2, h // 2)
    M = cv2.getRotationMatrix2D(center, angle, 1.0)
    rotated = cv2.warpAffine(image, M, (w, h), flags=cv2.INTER_CUBIC,
    borderMode=cv2.BORDER_REPLICATE)
    return rotated

#### Noise Removal
This step removes the small dots/patches which have high intensity compared to the rest of the image for smoothening of the image. OpenCV’s fast Nl Means Denoising Coloured function can do that easily.

In [3]:
def remove_noise(image):
    return cv2.fastNlMeansDenoisingColored(image, None, 10, 10, 7, 15)

#### Gray Scale image
This process converts an image from other color spaces to shades of Gray. The colour varies between complete black and complete white. OpenCV’s cvtColor() function perform this task very easily.

In [4]:
def get_grayscale(image):
    return cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

#### Thresholding or Binarization
This step converts any colored image into a binary image that contains only two colors black and white. It is done by fixing a threshold (normally half of the pixel range 0-255, i.e., 127). The pixel value having greater than the threshold is converted into a white pixel else into a black pixel. To determine the threshold value according to the image Otsu’s Binarization and Adaptive Binarization can be a better choice. In OpenCV, this can be done as given.

In [5]:
def thresholding(image):
    return cv2.threshold(image, 0, 255, cv2.THRESH_BINARY +
    cv2.THRESH_OTSU) [1]

#### Detect corrupt image

It is crucial to detect corrupted images, as they can negatively impact the performance of a machine learning model or computer vision system. Images that have been corrupted may contain noise, artifacts, or other anomalies that can lead to misclassifications or output errors. Improve the accuracy and dependability of a model or system by detecting and removing these images from the dataset. In addition, corrupted images can cause biases in the model or system, which can lead to erroneous results or unjust decisions.

In [None]:
from pathlib import Path
import imageio.v2 as imageio

corrupted_image = list()
dataset_path = "Flowers/Flowers"
accu = 0

for root, dirs, files in os.walk(dataset_path):
    for name in dirs:
        print(os.path.join(root, name))
        for image_file in Path(os.path.join(root, name)).glob('*.jpg'):
          accu = accu + 1
          try :
              image = Image.open(image_file)
              image.show()
              img = imageio.imread(image_file)
              image.show()
              img = deskew(img)
              image.show()
              img = remove_noise(img)
              image.show()
              img = get_grayscale(img)
              image.show()
              norm_img = np.zeros((img.shape[0], img.shape[1]))
              img = cv2.normalize(img, norm_img, 0, 255, cv2.NORM_MINMAX)
              image.show()
            #   print(f'read {image_file}')
          except :
              print(f'Cannot read image {image_file}')
              corrupted_image.append(image_file)
print("Total number of images : ", accu)

Flowers/Flowers\Babi
Cannot read image Flowers\Flowers\Babi\babi_1.jpg
Cannot read image Flowers\Flowers\Babi\babi_10.jpg
Cannot read image Flowers\Flowers\Babi\babi_100.jpg
Cannot read image Flowers\Flowers\Babi\babi_101.jpg
Cannot read image Flowers\Flowers\Babi\babi_102.jpg
Cannot read image Flowers\Flowers\Babi\babi_103.jpg
Cannot read image Flowers\Flowers\Babi\babi_104.jpg
Cannot read image Flowers\Flowers\Babi\babi_105.jpg
Cannot read image Flowers\Flowers\Babi\babi_106.jpg
Cannot read image Flowers\Flowers\Babi\babi_107.jpg
Cannot read image Flowers\Flowers\Babi\babi_108.jpg
Cannot read image Flowers\Flowers\Babi\babi_109.jpg
Cannot read image Flowers\Flowers\Babi\babi_11.jpg
Cannot read image Flowers\Flowers\Babi\babi_110.jpg
Cannot read image Flowers\Flowers\Babi\babi_111.jpg
Cannot read image Flowers\Flowers\Babi\babi_112.jpg
Cannot read image Flowers\Flowers\Babi\babi_113.jpg
Cannot read image Flowers\Flowers\Babi\babi_114.jpg
Cannot read image Flowers\Flowers\Babi\babi_115

In [None]:
len(corrupted_image)

#### Normalization
This process changes the range of pixel intensity values. The purpose of performing normalization is to bring image to range that is normal to sense.

#### Image duplication detection

Duplicate image detection is essential for multiple reasons:

* <b>Reducing storage</b>: Storing duplicate images wastes storage space, and detecting and removing them can help reduce storage costs.

* <b>Improving efficiency</b>: Processing or analyzing duplicate images is inefficient and time-consuming. Removing duplicates can improve processing and analysis efficiency.

* <b>Enhancing accuracy</b>: Duplicate images can bias the results of image-based analysis, such as object detection or image classification. Removing duplicates can improve the accuracy of these analyses.

* <b>Maintaining data integrity</b>: Duplicates can lead to confusion and inconsistency in data, especially when dealing with large image datasets. Removing duplicates helps to maintain data integrity and consistency.

In [None]:
import imagehash
import glob

# Define a function to compute the hash of an image file
def compute_hash(filepath):
    with Image.open(filepath) as img:
        return str(imagehash.phash(img))

# Define a function to find and remove duplicated images
def remove_duplicates(rootdir):
    hashes = {}
    duplicated = []
    rootdir = glob.glob(rootdir)
    for folder in rootdir:
        print()
        print(folder)
        for image_dir in glob.glob(folder+'/*.jpg'):
            # Compute the hash of the image file
            file_hash = compute_hash(image_dir)
            # Check if this hash has already been seen
            file=os.path.basename(image_dir).split('/')[-1]
            if file_hash in hashes:
                # This file is a duplicate, so remove it
                os.remove(image_dir)
                print(f'Removed duplicate file: {file}')
                duplicated.append(file)
            else:
                # This file is not a duplicate, so remember its hash
                hashes[file_hash] = file
        print(r'Duplicated image in ',folder,' :',len(duplicated))

# Usage: specify the root directory to search for duplicates
remove_duplicates('Flowers/Flowers/*')