In [3]:
import os
import cv2
import numpy as np
import pandas as pd
from skimage import io, img_as_float
from skimage.filters import sobel
from scipy.stats import skew, kurtosis, entropy
import matplotlib.pyplot as plt

In [2]:
def load_gray(img_path,size=(512,512)):
    img=io.imread(img_path, as_gray=True)
    img=img_as_float(img)
    return cv2.resize(img,size, interpolation=cv2.INTER_AREA)

In [20]:
def extract_features(img,file_path,scanner_id="unknown"):
    h,w=img.shape
    aspect_ratio=w/h
    file_size_kb=os.path.getsize(file_path)/1024  # in KB
    pixels=img.flatten()
    mean_intensity=np.mean(pixels)
    std_intensity=np.std(pixels)
    skewness=skew(pixels)
    kurt=kurtosis(pixels)
    ent=entropy(np.histogram(pixels, bins=256, range=(0, 1))[0]+1e-6)
    edges=sobel(img)
    edge_density=np.mean(edges>0.1)
    return {
        "file_name": os.path.basename(file_path),
        "class_label": scanner_id,
        #"width": w,
        #"height": h,
        #"aspect_ratio": aspect_ratio,
        #"file_size_kb": file_size_kb,
        "mean_intensity": mean_intensity,
        "std_intensity": std_intensity,
        "skewness": skewness,
        "kurtosis": kurt,
        "entropy": ent,
        "edge_density": edge_density
    }


In [5]:
columns = [
    "file_name","class_label","width", "height", "aspect_ratio", "file_size_kb",
    "mean_intensity", "std_intensity", "skewness", "kurtosis",
    "entropy", "edge_density"
]

In [6]:
metadata = pd.DataFrame(columns=columns)

In [7]:
def process_image_and_add(df,img_path,scanner_id="unknown"):
    img=load_gray(img_path)
    features=extract_features(img,img_path,scanner_id)
    return pd.concat([df, pd.DataFrame([features])], ignore_index=True)

def process_image_and_append(df, img_path, scanner_id="unknown"):
    img = load_gray(img_path)
    features = extract_features(img, img_path, scanner_id)
    df.loc[len(df)] = features
    return df

In [14]:
dataset_dir = r"D:\jupyter\Datasets\Official"

In [21]:
metadata.info()

<class 'pandas.core.frame.DataFrame'>
Index: 225 entries, 0 to 224
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   file_name       225 non-null    object 
 1   class_label     225 non-null    object 
 2   width           225 non-null    int64  
 3   height          225 non-null    int64  
 4   aspect_ratio    225 non-null    float64
 5   file_size_kb    225 non-null    float64
 6   mean_intensity  225 non-null    float64
 7   std_intensity   225 non-null    float64
 8   skewness        225 non-null    float64
 9   kurtosis        225 non-null    float64
 10  entropy         225 non-null    float64
 11  edge_density    225 non-null    float64
dtypes: float64(8), int64(2), object(2)
memory usage: 22.9+ KB


In [24]:
metadata["class_label"].value_counts()

class_label
HP             53
EpsonV370-2    40
Canon120-1     24
EpsonV550      21
Canon120-2     19
EpsonV39-1     19
Canon9000-2    17
Canon220       15
EpsonV370-1    13
Canon9000-1     3
EpsonV39-2      1
Name: count, dtype: int64

In [22]:
metadata.head()

Unnamed: 0,file_name,class_label,width,height,aspect_ratio,file_size_kb,mean_intensity,std_intensity,skewness,kurtosis,entropy,edge_density
0,s1_13.tif,Canon120-1,512,512,1.0,6366.801758,0.980218,0.077358,-4.790764,25.914771,0.781967,0.109833
1,s1_19.tif,Canon120-1,512,512,1.0,6366.801758,0.971157,0.096124,-4.117721,17.967127,1.149003,0.153709
2,s1_22.tif,Canon120-1,512,512,1.0,6366.801758,0.993663,0.04602,-9.545255,105.365174,0.302918,0.035507
3,s1_23.tif,Canon120-1,512,512,1.0,6366.801758,0.986276,0.061183,-5.732387,36.103704,0.747618,0.068008
4,s1_26.tif,Canon120-1,512,512,1.0,6366.801758,0.984811,0.074763,-6.015534,40.108241,0.576405,0.074955


In [23]:
metadata.describe()

Unnamed: 0,width,height,aspect_ratio,file_size_kb,mean_intensity,std_intensity,skewness,kurtosis,entropy,edge_density
count,225.0,225.0,225.0,225.0,225.0,225.0,225.0,225.0,225.0,225.0
mean,512.0,512.0,1.0,17808.650699,0.964888,0.081251,-5.029062,29.972324,1.550351,0.096996
std,0.0,0.0,0.0,9374.9526,0.022817,0.020768,1.37511,17.076014,0.755877,0.044108
min,512.0,512.0,1.0,6275.959961,0.898533,0.044467,-9.545255,0.736546,0.302918,0.03199
25%,512.0,512.0,1.0,6366.801758,0.947987,0.063414,-5.846635,17.458369,0.871004,0.065777
50%,512.0,512.0,1.0,25352.668945,0.975061,0.080251,-5.031291,27.702308,1.398749,0.086563
75%,512.0,512.0,1.0,25495.071289,0.982437,0.093943,-4.117721,38.236931,2.308847,0.119751
max,512.0,512.0,1.0,25495.073242,0.993663,0.160209,-1.59527,105.365174,3.206845,0.251602


In [19]:
for root, dirs, files in os.walk(dataset_dir):
    for file in files:
        if file.lower().endswith(('.tif', '.jpg', '.png')):
            img_path = os.path.join(root, file)
            relative_path = os.path.relpath(img_path, dataset_dir)
            scanner_id = relative_path.split(os.sep)[0]
            
            print(f"Processing: {file} | Scanner ID: {scanner_id}")
            metadata = process_image_and_append(metadata, img_path, scanner_id)

Processing: s1_13.tif | Scanner ID: Canon120-1
Processing: s1_19.tif | Scanner ID: Canon120-1
Processing: s1_22.tif | Scanner ID: Canon120-1
Processing: s1_23.tif | Scanner ID: Canon120-1
Processing: s1_26.tif | Scanner ID: Canon120-1
Processing: s1_3.tif | Scanner ID: Canon120-1
Processing: s1_44.tif | Scanner ID: Canon120-1
Processing: s1_49.tif | Scanner ID: Canon120-1
Processing: s1_8.tif | Scanner ID: Canon120-1
Processing: s1_16.tif | Scanner ID: Canon120-1
Processing: s1_44.tif | Scanner ID: Canon120-1
Processing: s1_45.tif | Scanner ID: Canon120-1
Processing: s1_46.tif | Scanner ID: Canon120-1
Processing: s1_47.tif | Scanner ID: Canon120-1
Processing: s1_48.tif | Scanner ID: Canon120-1
Processing: s1_49.tif | Scanner ID: Canon120-1
Processing: s1_50.tif | Scanner ID: Canon120-1
Processing: s1_52.tif | Scanner ID: Canon120-1
Processing: s1_53.tif | Scanner ID: Canon120-1
Processing: s1_54.tif | Scanner ID: Canon120-1
Processing: s1_55.tif | Scanner ID: Canon120-1
Processing: s1_