# In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer

# Load dataset
df = pd.read_csv("raw_dataset.csv")
print("Original Dataset:")
print(df.head())

# -----------------------------
# 1. Handling Missing Values
# -----------------------------
# scikit-learn transformers reject input that has zero columns, so every
# fit/transform step below is skipped when its column group is empty.
imputer = SimpleImputer(strategy='mean')
numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns
if len(numeric_cols) > 0:
    df[numeric_cols] = imputer.fit_transform(df[numeric_cols])

# For categorical columns:
imputer_cat = SimpleImputer(strategy='most_frequent')
cat_cols = df.select_dtypes(include=['object']).columns
if len(cat_cols) > 0:
    df[cat_cols] = imputer_cat.fit_transform(df[cat_cols])

# -----------------------------
# 2. Removing Duplicates
# -----------------------------
df.drop_duplicates(inplace=True)

# -----------------------------
# 3. Encoding Categorical Data
# -----------------------------
# One encoder per column, kept in a dict, so each column's integer codes can
# still be mapped back to the original categories after the loop finishes.
label_encoders = {}
for col in cat_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# -----------------------------
# 4. Masking Outliers (IQR)
# -----------------------------
# numeric_cols was computed before encoding, so the label-encoded categorical
# columns are not treated as numeric features here or in the scaling step.
for col in numeric_cols:
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    if IQR == 0:
        # Both fences collapse onto a single value, so every other value
        # would be flagged and the column flattened to a constant.
        continue
    mask = (df[col] < (Q1 - 1.5 * IQR)) | (df[col] > (Q3 + 1.5 * IQR))
    df.loc[mask, col] = np.nan

# Re-impute the values that were masked as outliers, then scale.
if len(numeric_cols) > 0:
    df[numeric_cols] = imputer.fit_transform(df[numeric_cols])

# -----------------------------
# 5. Scaling
# -----------------------------
scaler = StandardScaler()
if len(numeric_cols) > 0:
    df[numeric_cols] = scaler.fit_transform(df[numeric_cols])

# Save cleaned dataset
df.to_csv("clean_dataset.csv", index=False)

print("\nPreprocessing Completed. Clean dataset saved as clean_dataset.csv")


import os
import cv2
import numpy as np

# Root folder that load_images() scans; each entry inside it is used as one class folder.
DATASET_PATH = "dataset/"   # change this

def load_images(dataset_path=None, image_size=(128, 128)):
    """Load every readable image from a one-folder-per-class dataset.

    Each sub-directory of the dataset root is treated as one class. Class
    folders are processed in sorted order, so a folder's label index is its
    position in the returned ``class_names`` list and is reproducible
    between runs.

    Args:
        dataset_path: Dataset root directory. Defaults to the module-level
            ``DATASET_PATH``.
        image_size: ``(width, height)`` every image is resized to.

    Returns:
        A tuple ``(images, labels, class_names)``: the resized images and
        their integer labels as NumPy arrays, plus the list of class folder
        names.
    """
    if dataset_path is None:
        dataset_path = DATASET_PATH

    # os.listdir returns entries in arbitrary order and also lists plain
    # files; keep only directories and sort them so labels are stable.
    class_names = sorted(
        entry
        for entry in os.listdir(dataset_path)
        if os.path.isdir(os.path.join(dataset_path, entry))
    )

    images = []
    labels = []
    for label_index, folder in enumerate(class_names):
        folder_path = os.path.join(dataset_path, folder)

        for img_name in sorted(os.listdir(folder_path)):
            img = cv2.imread(os.path.join(folder_path, img_name))
            if img is None:
                # cv2.imread signals an unreadable or non-image file by
                # returning None; resizing it would raise, so skip it.
                continue

            images.append(cv2.resize(img, image_size))
            labels.append(label_index)

    return np.array(images), np.array(labels), class_names


# Load the whole dataset once and print how many images and which classes were found.
X, y, classes = load_images()
print("Total Images:", len(X))
print("Classes:", classes)


import os
import cv2

# INPUT is the folder preprocess() reads from; OUTPUT is the root it writes the processed copies under.
INPUT = "dataset/"
OUTPUT = "preprocessed/"

# Create the output root up front; exist_ok makes re-running this cell harmless.
os.makedirs(OUTPUT, exist_ok=True)

def preprocess(input_dir=None, output_dir=None, size=(256, 256)):
    """Resize every readable image in a dataset tree and save the copies.

    The sub-folder layout of the input root is mirrored under the output
    root, and each image keeps its original file name. Plain files directly
    inside the input root and files OpenCV cannot decode are skipped.

    Args:
        input_dir: Root folder to read from. Defaults to the module-level
            ``INPUT``.
        output_dir: Root folder to write to. Defaults to the module-level
            ``OUTPUT``. Missing folders are created.
        size: ``(width, height)`` every image is resized to.

    Returns:
        None. Results are written to disk and a completion message is
        printed.
    """
    if input_dir is None:
        input_dir = INPUT
    if output_dir is None:
        output_dir = OUTPUT

    for folder in sorted(os.listdir(input_dir)):
        input_path = os.path.join(input_dir, folder)
        if not os.path.isdir(input_path):
            # Stray files next to the class folders are not datasets.
            continue

        output_path = os.path.join(output_dir, folder)
        os.makedirs(output_path, exist_ok=True)

        for img_name in sorted(os.listdir(input_path)):
            img = cv2.imread(os.path.join(input_path, img_name))
            if img is None:
                # cv2.imread returns None when the file cannot be decoded;
                # passing that to cv2.resize would raise.
                continue

            # Resize
            img = cv2.resize(img, size)

            # Normalize to the [0, 1] range
            img = img / 255.0

            # Convert back to uint8 for saving: image files store 0-255
            # pixel values, so the normalization is undone before writing.
            img_uint8 = (img * 255).astype("uint8")

            # Save
            cv2.imwrite(os.path.join(output_path, img_name), img_uint8)

    print("Preprocessing Done ✔")


# Run the preprocessing pass: read from INPUT and write the results under OUTPUT.
preprocess()


import os
import cv2
import numpy as np
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras import layers, models

# Folder load_data() reads the training images from.
DATASET = "dataset/"
# Side length in pixels: images are resized to IMG_SIZE x IMG_SIZE, and the CNN input shape uses the same value.
IMG_SIZE = 128

def load_data(dataset_dir=None, img_size=None):
    """Load a one-folder-per-class image dataset, scaled for training.

    Each sub-directory of the dataset root is treated as one class. Class
    folders are processed in sorted order, so a folder's label is its
    position in the returned ``class_names`` list and is reproducible
    between runs.

    Args:
        dataset_dir: Dataset root directory. Defaults to the module-level
            ``DATASET``.
        img_size: Side length in pixels; images are resized to
            ``img_size x img_size``. Defaults to the module-level
            ``IMG_SIZE``.

    Returns:
        A tuple ``(images, labels, class_names)``: the resized images
        divided by 255.0 and their integer labels as NumPy arrays, plus the
        list of class folder names.
    """
    if dataset_dir is None:
        dataset_dir = DATASET
    if img_size is None:
        img_size = IMG_SIZE

    # os.listdir returns entries in arbitrary order and also lists plain
    # files; keep only directories and sort them so labels are stable.
    class_names = sorted(
        entry
        for entry in os.listdir(dataset_dir)
        if os.path.isdir(os.path.join(dataset_dir, entry))
    )

    images = []
    labels = []
    for label, folder in enumerate(class_names):
        folder_path = os.path.join(dataset_dir, folder)

        for img_name in sorted(os.listdir(folder_path)):
            img = cv2.imread(os.path.join(folder_path, img_name))
            if img is None:
                # cv2.imread signals an unreadable or non-image file by
                # returning None; resizing it would raise, so skip it.
                continue

            images.append(cv2.resize(img, (img_size, img_size)))
            labels.append(label)

    return np.array(images) / 255.0, np.array(labels), class_names


# Pull the full dataset into memory: scaled images, integer labels, class list.
X, y, class_names = load_data()

# Hold out 20% of the samples; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# CNN model, assembled layer by layer: three convolution + pooling stages
# feeding a dense head with one softmax output per class.
model = models.Sequential()
model.add(
    layers.Conv2D(
        32,
        (3,3),
        activation='relu',
        input_shape=(IMG_SIZE, IMG_SIZE, 3),
    )
)
model.add(layers.MaxPooling2D(2,2))
model.add(layers.Conv2D(64, (3,3), activation='relu'))
model.add(layers.MaxPooling2D(2,2))
model.add(layers.Conv2D(128, (3,3), activation='relu'))
model.add(layers.MaxPooling2D(2,2))
model.add(layers.Flatten())
model.add(layers.Dense(128, activation='relu'))
model.add(layers.Dense(len(class_names), activation='softmax'))

# Labels are plain integers, hence the sparse categorical loss.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# The held-out split is reported as validation data after every epoch.
model.fit(
    X_train,
    y_train,
    epochs=10,
    validation_data=(X_test, y_test),
)

model.save("image_classifier.h5")
print("Training complete! Model saved as image_classifier.h5")
