In [11]:
import pandas as pd

file_path = 'final_multimodal_train.tsv'

# Load the data
data = pd.read_csv(file_path, sep='\t')

In [12]:
# balance of classes in this data
print("\nClass Balance:")
print(data['2_way_label'].value_counts())


Class Balance:
2_way_label
0    36432
1    23568
Name: count, dtype: int64


In [13]:
from PIL import Image, ImageChops, ImageEnhance
import os
from tqdm import tqdm

def ela_image(image_path, output_path, quality=75):
    try:
        # Open the original image
        original = Image.open(image_path)
        
        # Convert image to RGB mode if needed
        if original.mode != 'RGB':
            original = original.convert('RGB')
        
        # Save temporary compressed image
        temp_path = 'temp.jpg'
        original.save(temp_path, 'JPEG', quality=quality)
        
        # Open compressed image
        recompressed = Image.open(temp_path)
        
        # Calculate difference
        diff = ImageChops.difference(original, recompressed)
        
        # Scale the difference
        extrema = diff.getextrema()
        max_diff = max([ex[1] for ex in extrema])
        
        if max_diff == 0:
            # Create black image if no difference
            scaled = Image.new('RGB', diff.size, 0)
        else:
            # Scale the difference for visibility
            scale = 255.0 / max_diff
            scaled = ImageEnhance.Brightness(diff).enhance(scale)
        
        # Save ELA result
        scaled.save(output_path, 'JPEG')
        
        # Cleanup
        os.remove(temp_path)
        return True
        
    except Exception as e:
        print(f"Error processing {image_path}: {str(e)}")
        return False

# Create output directory
output_dir = 'ela_images'
os.makedirs(output_dir, exist_ok=True)

In [14]:
# Process all images
successful = 0
failed = 0
skipped = 0

for idx, row in tqdm(data.iterrows(), total=len(data), desc="Processing images"):
    image_id = row['id']
    input_path = f'finalImages/{image_id}.jpg'
    output_path = f'{output_dir}/{image_id}_ela.jpg'
    
    if os.path.exists(output_path):
        skipped += 1
        continue
    
    if os.path.exists(input_path):
        if ela_image(input_path, output_path):
            successful += 1
        else:
            failed += 1
    else:
        failed += 1
        print(f"Image not found: {input_path}")

print(f"\nProcessing complete:")
print(f"Successful: {successful}")
print(f"Failed: {failed}")
print(f"Skipped: {skipped}")

Processing images:   1%|          | 460/60000 [00:00<00:12, 4584.60it/s]

Error processing finalImages/cii71pz.jpg: Truncated File Read
Error processing finalImages/cw74890.jpg: cannot identify image file 'C:\\Users\\ASUS\\Desktop\\ml project stuff\\finalImages\\cw74890.jpg'
Error processing finalImages/dq4x5ki.jpg: cannot identify image file 'C:\\Users\\ASUS\\Desktop\\ml project stuff\\finalImages\\dq4x5ki.jpg'


Processing images:   5%|▍         | 2810/60000 [00:00<00:10, 5452.49it/s]

Error processing finalImages/d1v7bwy.jpg: cannot identify image file 'C:\\Users\\ASUS\\Desktop\\ml project stuff\\finalImages\\d1v7bwy.jpg'
Error processing finalImages/dcp93kl.jpg: cannot identify image file 'C:\\Users\\ASUS\\Desktop\\ml project stuff\\finalImages\\dcp93kl.jpg'


Processing images:   6%|▋         | 3881/60000 [00:00<00:10, 5172.70it/s]

Error processing finalImages/c9l73ty.jpg: cannot identify image file 'C:\\Users\\ASUS\\Desktop\\ml project stuff\\finalImages\\c9l73ty.jpg'


Processing images:  11%|█         | 6515/60000 [00:01<00:10, 5201.83it/s]

Error processing finalImages/dgd7lo5.jpg: cannot identify image file 'C:\\Users\\ASUS\\Desktop\\ml project stuff\\finalImages\\dgd7lo5.jpg'
Error processing finalImages/c89lshc.jpg: cannot identify image file 'C:\\Users\\ASUS\\Desktop\\ml project stuff\\finalImages\\c89lshc.jpg'


Processing images:  19%|█▉        | 11683/60000 [00:02<00:09, 5013.02it/s]

Error processing finalImages/cj6w3xw.jpg: cannot identify image file 'C:\\Users\\ASUS\\Desktop\\ml project stuff\\finalImages\\cj6w3xw.jpg'


Processing images:  25%|██▌       | 15060/60000 [00:02<00:08, 5452.13it/s]

Error processing finalImages/chyyfga.jpg: cannot identify image file 'C:\\Users\\ASUS\\Desktop\\ml project stuff\\finalImages\\chyyfga.jpg'
Error processing finalImages/e18t1mm.jpg: cannot identify image file 'C:\\Users\\ASUS\\Desktop\\ml project stuff\\finalImages\\e18t1mm.jpg'


Processing images:  32%|███▏      | 18985/60000 [00:03<00:07, 5623.98it/s]

Error processing finalImages/cyi7u0c.jpg: cannot identify image file 'C:\\Users\\ASUS\\Desktop\\ml project stuff\\finalImages\\cyi7u0c.jpg'


Processing images:  36%|███▌      | 21363/60000 [00:03<00:07, 5445.51it/s]

Error processing finalImages/cnth65g.jpg: cannot identify image file 'C:\\Users\\ASUS\\Desktop\\ml project stuff\\finalImages\\cnth65g.jpg'
Error processing finalImages/f2zd5yw.jpg: cannot identify image file 'C:\\Users\\ASUS\\Desktop\\ml project stuff\\finalImages\\f2zd5yw.jpg'


Processing images:  37%|███▋      | 22482/60000 [00:04<00:06, 5470.54it/s]

Error processing finalImages/cgx4u3c.jpg: cannot identify image file 'C:\\Users\\ASUS\\Desktop\\ml project stuff\\finalImages\\cgx4u3c.jpg'
Error processing finalImages/cexapep.jpg: cannot identify image file 'C:\\Users\\ASUS\\Desktop\\ml project stuff\\finalImages\\cexapep.jpg'


Processing images:  45%|████▌     | 27116/60000 [00:05<00:06, 5328.08it/s]

Error processing finalImages/c74xt4o.jpg: cannot identify image file 'C:\\Users\\ASUS\\Desktop\\ml project stuff\\finalImages\\c74xt4o.jpg'
Error processing finalImages/cqhowgl.jpg: cannot identify image file 'C:\\Users\\ASUS\\Desktop\\ml project stuff\\finalImages\\cqhowgl.jpg'


Processing images:  49%|████▉     | 29257/60000 [00:05<00:06, 5098.90it/s]

Error processing finalImages/ds6txmd.jpg: cannot identify image file 'C:\\Users\\ASUS\\Desktop\\ml project stuff\\finalImages\\ds6txmd.jpg'
Error processing finalImages/dhiibiw.jpg: cannot identify image file 'C:\\Users\\ASUS\\Desktop\\ml project stuff\\finalImages\\dhiibiw.jpg'


Processing images:  55%|█████▍    | 32801/60000 [00:06<00:04, 5874.11it/s]

Error processing finalImages/c814ydc.jpg: cannot identify image file 'C:\\Users\\ASUS\\Desktop\\ml project stuff\\finalImages\\c814ydc.jpg'
Error processing finalImages/c5nrhpu.jpg: cannot identify image file 'C:\\Users\\ASUS\\Desktop\\ml project stuff\\finalImages\\c5nrhpu.jpg'


Processing images:  57%|█████▋    | 34111/60000 [00:06<00:04, 6099.39it/s]

Error processing finalImages/cfxta61.jpg: image file is truncated (10 bytes not processed)
Error processing finalImages/f3ady8z.jpg: cannot identify image file 'C:\\Users\\ASUS\\Desktop\\ml project stuff\\finalImages\\f3ady8z.jpg'
Error processing finalImages/csjp1ca.jpg: cannot identify image file 'C:\\Users\\ASUS\\Desktop\\ml project stuff\\finalImages\\csjp1ca.jpg'


Processing images:  61%|██████    | 36325/60000 [00:06<00:03, 6944.39it/s]

Error processing finalImages/corzta4.jpg: cannot identify image file 'C:\\Users\\ASUS\\Desktop\\ml project stuff\\finalImages\\corzta4.jpg'


Processing images:  64%|██████▍   | 38601/60000 [00:06<00:02, 7184.88it/s]

Error processing finalImages/dkuuac9.jpg: cannot identify image file 'C:\\Users\\ASUS\\Desktop\\ml project stuff\\finalImages\\dkuuac9.jpg'
Error processing finalImages/c5qt713.jpg: cannot identify image file 'C:\\Users\\ASUS\\Desktop\\ml project stuff\\finalImages\\c5qt713.jpg'
Error processing finalImages/d0gs39y.jpg: cannot identify image file 'C:\\Users\\ASUS\\Desktop\\ml project stuff\\finalImages\\d0gs39y.jpg'
Error processing finalImages/cfgiw4f.jpg: cannot identify image file 'C:\\Users\\ASUS\\Desktop\\ml project stuff\\finalImages\\cfgiw4f.jpg'
Error processing finalImages/dhu48yc.jpg: cannot identify image file 'C:\\Users\\ASUS\\Desktop\\ml project stuff\\finalImages\\dhu48yc.jpg'


Processing images:  67%|██████▋   | 40086/60000 [00:07<00:02, 7204.62it/s]

Error processing finalImages/db8iwnp.jpg: cannot identify image file 'C:\\Users\\ASUS\\Desktop\\ml project stuff\\finalImages\\db8iwnp.jpg'
Error processing finalImages/c88qgu5.jpg: cannot identify image file 'C:\\Users\\ASUS\\Desktop\\ml project stuff\\finalImages\\c88qgu5.jpg'
Error processing finalImages/c9vamyt.jpg: cannot identify image file 'C:\\Users\\ASUS\\Desktop\\ml project stuff\\finalImages\\c9vamyt.jpg'


Processing images:  73%|███████▎  | 43856/60000 [00:07<00:02, 7451.80it/s]

Error processing finalImages/cno4may.jpg: cannot identify image file 'C:\\Users\\ASUS\\Desktop\\ml project stuff\\finalImages\\cno4may.jpg'


Processing images:  80%|███████▉  | 47774/60000 [00:08<00:01, 7603.16it/s]

Error processing finalImages/cct2nka.jpg: cannot identify image file 'C:\\Users\\ASUS\\Desktop\\ml project stuff\\finalImages\\cct2nka.jpg'
Error processing finalImages/f32ocn5.jpg: cannot identify image file 'C:\\Users\\ASUS\\Desktop\\ml project stuff\\finalImages\\f32ocn5.jpg'


Processing images:  85%|████████▍ | 50879/60000 [00:08<00:01, 7537.81it/s]

Error processing finalImages/e3q8b8r.jpg: cannot identify image file 'C:\\Users\\ASUS\\Desktop\\ml project stuff\\finalImages\\e3q8b8r.jpg'
Error processing finalImages/c85nxki.jpg: cannot identify image file 'C:\\Users\\ASUS\\Desktop\\ml project stuff\\finalImages\\c85nxki.jpg'


Processing images:  91%|█████████ | 54578/60000 [00:09<00:00, 6957.98it/s]

Error processing finalImages/cffoewa.jpg: cannot identify image file 'C:\\Users\\ASUS\\Desktop\\ml project stuff\\finalImages\\cffoewa.jpg'


Processing images:  97%|█████████▋| 58243/60000 [00:09<00:00, 7087.05it/s]

Error processing finalImages/cfir0xe.jpg: cannot identify image file 'C:\\Users\\ASUS\\Desktop\\ml project stuff\\finalImages\\cfir0xe.jpg'
Error processing finalImages/f0720jh.jpg: cannot identify image file 'C:\\Users\\ASUS\\Desktop\\ml project stuff\\finalImages\\f0720jh.jpg'
Error processing finalImages/dltekd1.jpg: cannot identify image file 'C:\\Users\\ASUS\\Desktop\\ml project stuff\\finalImages\\dltekd1.jpg'


Processing images: 100%|██████████| 60000/60000 [00:09<00:00, 6113.72it/s]

Error processing finalImages/dij5nzu.jpg: cannot identify image file 'C:\\Users\\ASUS\\Desktop\\ml project stuff\\finalImages\\dij5nzu.jpg'
Error processing finalImages/ckxbzyd.jpg: cannot identify image file 'C:\\Users\\ASUS\\Desktop\\ml project stuff\\finalImages\\ckxbzyd.jpg'

Processing complete:
Successful: 0
Failed: 45
Skipped: 59955





In [15]:
import numpy as np
import cv2
from scipy import stats
from skimage.feature import graycomatrix, graycoprops
from scipy.stats import entropy
from sklearn.preprocessing import StandardScaler
import pandas as pd

In [16]:
def extract_statistical_features(img):
    """Extract basic statistical features from image"""
    # Convert to grayscale if needed
    if len(img.shape) == 3:
        img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    
    mean = np.mean(img)
    std = np.std(img)
    skewness = stats.skew(img.flatten())
    kurtosis = stats.kurtosis(img.flatten())
    
    return {
        'mean': mean,
        'std': std,
        'skewness': skewness,
        'kurtosis': kurtosis
    }

def calculate_entropy(img):
    """Calculate image entropy"""
    if len(img.shape) == 3:
        img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    hist = cv2.calcHist([img], [0], None, [256], [0, 256])
    hist = hist / hist.sum()
    return -np.sum(hist * np.log2(hist + np.finfo(float).eps))

def extract_glcm_features(img):
    """Extract GLCM texture features"""
    if len(img.shape) == 3:
        img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    
    # Normalize image to fewer gray levels to reduce computation
    bins = 8
    img = img // (256 // bins)
    
    glcm = graycomatrix(img, distances=[1], angles=[0], levels=bins, symmetric=True, normed=True)
    
    contrast = graycoprops(glcm, 'contrast')[0][0]
    correlation = graycoprops(glcm, 'correlation')[0][0]
    energy = graycoprops(glcm, 'energy')[0][0]
    homogeneity = graycoprops(glcm, 'homogeneity')[0][0]
    
    return {
        'contrast': contrast,
        'correlation': correlation,
        'energy': energy,
        'homogeneity': homogeneity
    }

def extract_edge_features(img):
    """Extract edge-based features"""
    if len(img.shape) == 3:
        img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    
    # Sobel edges
    sobelx = cv2.Sobel(img, cv2.CV_64F, 1, 0, ksize=3)
    sobely = cv2.Sobel(img, cv2.CV_64F, 0, 1, ksize=3)
    edge_intensity = np.sqrt(sobelx**2 + sobely**2)
    
    # Canny edges
    edges = cv2.Canny(img, 100, 200)
    edge_count = np.count_nonzero(edges)
    
    return {
        'edge_intensity_mean': np.mean(edge_intensity),
        'edge_count': edge_count
    }

def extract_frequency_features(img):
    """Extract frequency domain features"""
    if len(img.shape) == 3:
        img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    
    # Apply FFT
    f_transform = np.fft.fft2(img)
    f_shift = np.fft.fftshift(f_transform)
    magnitude_spectrum = np.abs(f_shift)
    
    # Get high frequency energy
    rows, cols = img.shape
    center_row, center_col = rows//2, cols//2
    radius = min(center_row, center_col) // 2
    
    y, x = np.ogrid[-center_row:rows-center_row, -center_col:cols-center_col]
    mask = x*x + y*y <= radius*radius
    high_freq = magnitude_spectrum.copy()
    high_freq[mask] = 0
    
    return {
        'high_freq_energy': np.sum(high_freq)
    }

def extract_all_features(image_path):
    """Extract all features from an image"""
    try:
        img = cv2.imread(image_path)
        if img is None:
            return None
        
        features = {}
        
        # Extract all feature types
        features.update(extract_statistical_features(img))
        features['entropy'] = calculate_entropy(img)
        features.update(extract_glcm_features(img))
        features.update(extract_edge_features(img))
        features.update(extract_frequency_features(img))
        
        return features
    except Exception as e:
        print(f"Error processing {image_path}: {str(e)}")
        return None

In [17]:
# Process all ELA images and extract features
features_list = []
image_ids = []

for idx, row in tqdm(data.iterrows(), total=len(data), desc="Extracting features"):
    image_id = row['id']
    ela_path = f'ela_images/{image_id}_ela.jpg'
    
    if os.path.exists(ela_path):
        features = extract_all_features(ela_path)
        if features is not None:
            features_list.append(features)
            image_ids.append(image_id)

  skewness = stats.skew(img.flatten())
  kurtosis = stats.kurtosis(img.flatten())
Extracting features: 100%|██████████| 60000/60000 [1:37:30<00:00, 10.26it/s]  


In [18]:
# Create DataFrame with features
feature_df = pd.DataFrame(features_list, index=image_ids)

# Scale features
scaler = StandardScaler()
scaled_features = scaler.fit_transform(feature_df)
scaled_feature_df = pd.DataFrame(scaled_features, columns=feature_df.columns, index=feature_df.index)

# Save features
scaled_feature_df.to_csv('ela_features.csv')
print(f"Extracted features for {len(feature_df)} images")
print("\nFeature names:")
print(feature_df.columns.tolist())

Extracted features for 59955 images

Feature names:
['mean', 'std', 'skewness', 'kurtosis', 'entropy', 'contrast', 'correlation', 'energy', 'homogeneity', 'edge_intensity_mean', 'edge_count', 'high_freq_energy']
