## Model Training

In [None]:
# NOTE: this cell (CLIP feature extraction over the dataset) takes about 35-37 minutes to run.

from transformers import CLIPProcessor, CLIPModel
from PIL import Image
import torch
import os

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Pretrained CLIP preprocessor and vision-language model; the model is moved
# to the selected device so it matches the tensors fed to it later.
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
model = model.to(device)

def extract_features(image_path):
    """Return the L2-normalized CLIP image embedding of an image file.

    Args:
        image_path: Path of the image file to embed.

    Returns:
        A flattened (1-D) NumPy array with the output of
        ``model.get_image_features`` scaled to unit L2 norm.

    Raises:
        Nothing is caught here: errors raised while opening, decoding or
        embedding the image propagate to the caller.
    """
    # A context manager closes the underlying file handle deterministically;
    # this function is called once per dataset image, so leaked handles
    # would otherwise pile up until garbage collection.
    with Image.open(image_path) as img:
        image = img.convert("RGB")
    inputs = processor(images=image, return_tensors="pt").to(device)
    with torch.no_grad():  # inference only, no gradients needed
        image_features = model.get_image_features(**inputs)
        # Normalize along the feature axis so every embedding has unit length.
        image_features = image_features / image_features.norm(p=2, dim=-1, keepdim=True)
    return image_features.cpu().numpy().flatten()

# Embeddings and their integer labels, filled in parallel (same length).
features = []
labels = []

data_root = "data"
# A label index is the position of the class folder in os.listdir(data_root).
classes = os.listdir(data_root)

for label_idx, class_name in enumerate(classes):
    class_dir = os.path.join(data_root, class_name)
    # A stray file in the dataset root would make os.listdir(class_dir) raise
    # and abort the whole (long) run. Skip it, but keep enumerating over the
    # full listing so the indices of the remaining entries do not shift.
    if not os.path.isdir(class_dir):
        print(f"Skipping non-directory entry: {class_dir}")
        continue
    for img_name in os.listdir(class_dir):
        img_path = os.path.join(class_dir, img_name)
        try:
            embedding = extract_features(img_path)
        except Exception as e:
            # Best effort: report files that cannot be embedded and move on.
            print(f"Error processing {img_path}: {e}")
        else:
            features.append(embedding)
            labels.append(label_idx)


## Model Testing

In [22]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Hold out 20% of the embeddings for evaluation; the fixed seed makes the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=42,
)

# Fit a logistic-regression classifier on the training embeddings
# (fit() returns the estimator itself).
clf = LogisticRegression(max_iter=5000).fit(X_train, y_train)

# Score the classifier on the held-out split.
y_pred = clf.predict(X_test)
acc = accuracy_score(y_true=y_test, y_pred=y_pred)

print(f"Classifier Accuracy: {acc * 100:.2f}%")


Classifier Accuracy: 97.72%


## Model Exporting

In [None]:
import os

from joblib import dump

# Save to the exact path the inference cell loads from
# ('model/human_animal_classifier.joblib'); saving to the working directory
# left that cell without a file to load on a fresh run.
MODEL_PATH = 'model/human_animal_classifier.joblib'
os.makedirs(os.path.dirname(MODEL_PATH), exist_ok=True)

dump(clf, MODEL_PATH)

## Extracting Classes

In [23]:
import os

data_root = "data"
# Class names are the entries of the dataset root, in os.listdir order;
# a name's position in the list is its label index.
classes = os.listdir(data_root)
label_classes = list(classes)

print(label_classes)

['Cat', 'Cow', 'Deer', 'Dog', 'Goat', 'Human', 'Sheep']


## Loading Model and Detecting Classes

In [24]:
import cv2
from PIL import Image
import torch
from transformers import CLIPProcessor, CLIPModel
from joblib import load

# Load the trained classifier from disk.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code. Only load model files you created or otherwise trust.
clf = load('model/human_animal_classifier.joblib')

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Pretrained CLIP preprocessor and model used to embed images and frames.
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
clip_model = clip_model.to(device)

# Label index -> class name lookup, taken from the class-listing cell.
class_labels = label_classes

def extract_features_pil(image):
    """Embed an in-memory image with CLIP.

    Args:
        image: The image to embed; it is handed to ``clip_processor`` as-is.

    Returns:
        A flattened NumPy array: the output of
        ``clip_model.get_image_features`` divided by its L2 norm.
    """
    batch = clip_processor(images=image, return_tensors="pt")
    batch = batch.to(device)
    with torch.no_grad():  # inference only
        embedding = clip_model.get_image_features(**batch)
        l2_norm = embedding.norm(p=2, dim=-1, keepdim=True)
        unit_embedding = embedding / l2_norm
    return unit_embedding.cpu().numpy().flatten()

def predict_class(features):
    """Predict the class name and its probability for one feature vector.

    Args:
        features: A single feature vector. It is wrapped in a list so the
            classifier receives a batch of one sample.

    Returns:
        A ``(label, confidence)`` tuple: the ``class_labels`` entry of the
        most probable class and the probability the classifier assigned to it.
    """
    prob = clf.predict_proba([features])[0]
    best_column = prob.argmax()
    confidence = prob[best_column]
    # predict_proba columns are ordered like clf.classes_ (the label values
    # seen during fit), which is not guaranteed to be 0..n-1: a label index
    # with no training samples leaves a gap. Translate the column back to the
    # trained label value before looking up the class name.
    label = class_labels[clf.classes_[best_column]]
    return label, confidence

def detect_from_path(path, is_image=True, alert_threshold=0.80):
    """Classify a single image file, or every frame of a video source.

    Args:
        path: Image file path when ``is_image`` is true; otherwise the value
            passed to ``cv2.VideoCapture``.
        is_image: Selects image mode (default) or video mode.
        alert_threshold: Video mode only. A frame whose confidence is strictly
            greater than this value prints an alert line. Defaults to 0.80.

    Returns:
        ``(label, confidence)`` in image mode; ``(None, None)`` in video mode,
        where results are only drawn on the preview window and printed.
    """
    if is_image:
        # Close the file handle as soon as the pixels have been converted.
        with Image.open(path) as img:
            image = img.convert("RGB")
        features = extract_features_pil(image)
        label, confidence = predict_class(features)
        print(f"[IMAGE] Detected: {label} (Confidence: {confidence*100:.2f}%)")
        return label, confidence

    cap = cv2.VideoCapture(path)
    try:
        if not cap.isOpened():
            # Previously this case returned silently, which looked like a
            # video with no detections.
            print(f"[VIDEO] Could not open video source: {path}")

        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break

            # OpenCV delivers BGR frames; convert to RGB before embedding.
            image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
            features = extract_features_pil(image)
            label, confidence = predict_class(features)

            cv2.putText(frame, f"{label} ({confidence*100:.1f}%)", (10, 30),
                        cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)
            cv2.imshow('Detection', frame)

            if confidence > alert_threshold:
                print(f"[VIDEO] ALERT: Detected {label} (Confidence: {confidence*100:.2f}%)")

            # Pressing "q" in the preview window stops playback early.
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break
    finally:
        # Release the capture and close the preview window even when
        # embedding or classification raises mid-video.
        cap.release()
        cv2.destroyAllWindows()
    return None, None

# === Usage ===
# For single image:
# detect_from_path("test_images/sample.jpg", is_image=True)

# For video:
# detect_from_path("test_videos/sample_video.mp4", is_image=False)


## Detecting From Video

In [25]:
# Forward slash instead of "\h": the backslash formed an invalid escape
# sequence and only resolved as a path separator on Windows.
detect_from_path("data_samples/human.mp4", is_image=False)

[VIDEO] ALERT: Detected Human (Confidence: 81.39%)
[VIDEO] ALERT: Detected Human (Confidence: 80.68%)
[VIDEO] ALERT: Detected Human (Confidence: 80.89%)
[VIDEO] ALERT: Detected Human (Confidence: 81.17%)
[VIDEO] ALERT: Detected Human (Confidence: 81.09%)
[VIDEO] ALERT: Detected Human (Confidence: 80.79%)
[VIDEO] ALERT: Detected Human (Confidence: 81.98%)
[VIDEO] ALERT: Detected Human (Confidence: 82.46%)
[VIDEO] ALERT: Detected Human (Confidence: 83.45%)
[VIDEO] ALERT: Detected Human (Confidence: 83.50%)
[VIDEO] ALERT: Detected Human (Confidence: 84.36%)
[VIDEO] ALERT: Detected Human (Confidence: 85.67%)
[VIDEO] ALERT: Detected Human (Confidence: 85.22%)
[VIDEO] ALERT: Detected Human (Confidence: 85.61%)
[VIDEO] ALERT: Detected Human (Confidence: 85.37%)
[VIDEO] ALERT: Detected Human (Confidence: 85.50%)
[VIDEO] ALERT: Detected Human (Confidence: 87.47%)
[VIDEO] ALERT: Detected Human (Confidence: 86.79%)
[VIDEO] ALERT: Detected Human (Confidence: 87.13%)
[VIDEO] ALERT: Detected Human (

(None, None)

In [26]:
# Forward slash instead of "\d": the backslash formed an invalid escape
# sequence and only resolved as a path separator on Windows.
detect_from_path("data_samples/dog.mp4", is_image=False)

[VIDEO] ALERT: Detected Dog (Confidence: 80.50%)
[VIDEO] ALERT: Detected Dog (Confidence: 82.74%)
[VIDEO] ALERT: Detected Dog (Confidence: 85.58%)
[VIDEO] ALERT: Detected Dog (Confidence: 85.25%)
[VIDEO] ALERT: Detected Dog (Confidence: 81.58%)
[VIDEO] ALERT: Detected Dog (Confidence: 88.46%)
[VIDEO] ALERT: Detected Dog (Confidence: 88.74%)
[VIDEO] ALERT: Detected Dog (Confidence: 90.55%)
[VIDEO] ALERT: Detected Dog (Confidence: 87.03%)
[VIDEO] ALERT: Detected Dog (Confidence: 89.02%)
[VIDEO] ALERT: Detected Dog (Confidence: 91.11%)
[VIDEO] ALERT: Detected Dog (Confidence: 88.66%)
[VIDEO] ALERT: Detected Dog (Confidence: 88.25%)
[VIDEO] ALERT: Detected Dog (Confidence: 83.07%)
[VIDEO] ALERT: Detected Dog (Confidence: 89.22%)
[VIDEO] ALERT: Detected Dog (Confidence: 84.67%)
[VIDEO] ALERT: Detected Dog (Confidence: 80.17%)
[VIDEO] ALERT: Detected Dog (Confidence: 83.53%)
[VIDEO] ALERT: Detected Dog (Confidence: 86.48%)
[VIDEO] ALERT: Detected Dog (Confidence: 88.50%)
[VIDEO] ALERT: Detec

(None, None)

## Detecting From Image

In [27]:
# Classify the sample human photo; the (label, confidence) result is the cell output.
detect_from_path(path="data_samples/human.jpg", is_image=True)

[IMAGE] Detected: Human (Confidence: 95.65%)


('Human', np.float64(0.9564624970180753))

In [28]:
# Classify the sample deer photo; the (label, confidence) result is the cell output.
detect_from_path(path="data_samples/deer.jpg", is_image=True)

[IMAGE] Detected: Deer (Confidence: 99.58%)


('Deer', np.float64(0.9957980291114441))

In [29]:
# Classify the sample cat photo; the (label, confidence) result is the cell output.
detect_from_path(path="data_samples/cat.jpg", is_image=True)

[IMAGE] Detected: Cat (Confidence: 97.97%)


('Cat', np.float64(0.9796721101137771))