In [None]:
import os
import sys
import joblib
import math
import numpy as np
import pandas as pd
from collections import Counter

In [None]:
# --- Model selection ---
# Prompt for the model to use, then load its saved bundle. Each bundle is read
# by key below: 'model', 'le_target' and (Decision Tree only) 'le_ext'.
# The names `selection`, `m`, `le_ext`, `saved`, `model` and `le_target` stay at
# module level because the prediction cell reads them.
try:
    selection = int(input("Choose a Model\n1. Decision Tree\n2. Random Forest\n"))
except ValueError:
    # Non-numeric input (e.g. "" or "one") used to escape as an unhandled
    # ValueError; route it to the same "Invalid selection." branch instead.
    selection = None
m = ""
le_ext = None  # Default if not used

# NOTE(security): joblib.load unpickles its input, which can execute arbitrary
# code -- only load bundle files that come from a trusted source.
if selection == 1:
    saved = joblib.load("DecisionTree.joblib")
    m = "Decision Tree"
    le_ext = saved['le_ext']
elif selection == 2:
    # No extension encoder is read from this bundle, so le_ext stays None.
    saved = joblib.load("RandomForest.joblib")
    m = "Random Forest"
else:
    print("Invalid selection.")
    sys.exit()

model = saved['model']
le_target = saved['le_target']

In [None]:
# --- Feature extraction ---
def compute_entropy(byte_data):
    """Return the Shannon entropy, in bits per symbol, of ``byte_data``.

    Args:
        byte_data: A sized iterable of hashable symbols (for example ``bytes``).

    Returns:
        ``-sum(p * log2(p))`` taken over the relative frequency ``p`` of each
        distinct symbol. Empty input yields ``0`` because there are no terms
        to sum.
    """
    frequencies = Counter(byte_data)
    n_symbols = len(byte_data)
    # Relative frequency of every distinct symbol, in first-seen order.
    probabilities = [occurrences / n_symbols for occurrences in frequencies.values()]
    weighted_logs = (p * math.log2(p) for p in probabilities)
    return -sum(weighted_logs)

def extract_features_from_file(file_path, le_ext=None):
    """Build a single-row feature frame describing one file.

    Args:
        file_path: Path of the file to read (opened in binary mode).
        le_ext: Optional fitted encoder exposing ``classes_`` and
            ``transform``; used to encode the lower-cased file extension.
            When omitted, the extension feature falls back to ``0``.

    Returns:
        A one-row ``pandas.DataFrame`` with the columns ``file_extension``,
        ``file_size``, ``entropy``, ``avg_line_length``, ``column_count``,
        ``bit_depth`` (always 8) and ``byte_0`` .. ``byte_255`` (relative
        frequency of each byte value).

    Raises:
        ValueError: If ``le_ext`` is given and the file's extension is not in
            ``le_ext.classes_``.
        OSError: If the file cannot be opened or read.
    """
    with open(file_path, "rb") as f:
        byte_data = f.read()

    file_size = os.path.getsize(file_path)
    entropy = compute_entropy(byte_data)

    try:
        # Undecodable bytes are dropped, so binary files still yield "lines".
        text = byte_data.decode("utf-8", errors="ignore")
        lines = text.splitlines()
        avg_line_length = np.mean([len(line) for line in lines]) if lines else 0
        # Only lines containing a comma contribute to the column estimate.
        # Guard on that filtered list: np.mean([]) is NaN, which previously
        # leaked into the features for any file without a single comma.
        comma_line_widths = [line.count(",") + 1 for line in lines if "," in line]
        column_count = np.mean(comma_line_widths) if comma_line_widths else 0
    except Exception:
        # Best-effort text statistics; unlike a bare ``except`` this does not
        # swallow KeyboardInterrupt or SystemExit.
        avg_line_length = 0
        column_count = 0

    bit_depth = 8

    if byte_data:
        # Vectorised histogram of byte values; minlength pads unseen values.
        byte_counts = np.bincount(np.frombuffer(byte_data, dtype=np.uint8), minlength=256)
        byte_distribution = byte_counts / len(byte_data)
    else:
        # Empty file: avoid 0/0, which would fill the distribution with NaN.
        byte_distribution = np.zeros(256)

    # Extension without the leading dot, lower-cased ('' when there is none).
    ext = os.path.splitext(file_path)[1][1:].lower()

    if le_ext:
        if ext not in le_ext.classes_:
            raise ValueError(f"Unknown file extension: '{ext}'")
        ext_encoded = int(le_ext.transform([ext])[0])
    else:
        # Fallback: use dummy number (you can create a mapping if needed)
        ext_encoded = 0

    features = [ext_encoded, file_size, entropy, avg_line_length, column_count, bit_depth]
    features.extend(byte_distribution)

    columns = ['file_extension', 'file_size', 'entropy', 'avg_line_length', 'column_count', 'bit_depth'] + \
              [f'byte_{i}' for i in range(256)]

    return pd.DataFrame([features], columns=columns)

In [None]:
# File to run the prediction on.
# NOTE(review): absolute, machine-specific path -- point this at a local file
# before running the notebook on another machine.
file_path = r"C:\Users\prana\Downloads\WhatsApp Image 2025-04-18 at 22.51.05_ec2ebf93.jpg"

try:
    # One-row feature frame; le_ext is None unless the Decision Tree bundle
    # was selected, in which case the extension feature falls back to 0.
    X = extract_features_from_file(file_path, le_ext)
    # Predict the encoded class, then decode it with le_target.
    y_pred_encoded = model.predict(X)
    y_pred_label = le_target.inverse_transform(y_pred_encoded)

    print(f"Using the model {m} Predicted best compression tool for {file_path}: {y_pred_label[0]}")
except Exception as e:
    # Any failure in extraction, prediction or decoding is reported as a
    # one-line message; no traceback is shown.
    print(f"Error: {e}")