<a href="https://colab.research.google.com/github/sowmyavyk/Bot-LLM/blob/master/Video_LLM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [22]:
import cv2
import torch
import numpy as np
from PIL import Image
from torchvision import transforms, models
import os
from sentence_transformers import SentenceTransformer, util
from transformers import pipeline
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
import nltk

# Download required NLTK data
nltk.download('punkt')
nltk.download('punkt_tab')

# Configuration
PARTS_DIR = "/content/images"
EXPLANATION_TEXT = """
    Once the model has been setup as per the instruction manual, connect the battery as indicated next to the DPDT terminals.

    As the supply from the battery starts, the force generated by the DC motor rotates the connected gears. This turning of the gear, in turn rotates the associated gear and pulls the weight against gravity. Electric energy is converted to mechanical energy during this process because it involves movement caused by a force generated by a DC motor .

    The DC motor in the model also works as a generator here, lets see how?

    When the supply from the battery is turned off, the force of gravity acts on the weight and causes the gears to rotate in another direction. The DC motor contains coils of wire and magnets, when the gear turns, it makes the coils of wire spin inside the magnetic field created by the magnets.This spinning motion induces an electric current to flow through the wires into the LED ,which glows!! Converting the mechanical energy of the gear rotation into the electrical energy through the generator.

    This way! We can harness the force of gravity to generate electricity."""

class EnhancedPartClassifier(nn.Module):
    def __init__(self, num_classes):
        super().__init__()
        self.model = models.resnet18(pretrained=True)
        self.model.fc = nn.Linear(self.model.fc.in_features, num_classes)

    def forward(self, x):
        return self.model(x)

class PartDataset(Dataset):
    def __init__(self, root_dir, transform=None):
        self.root_dir = root_dir
        self.transform = transform
        self.image_paths = []
        self.labels = []

        images = sorted([f for f in os.listdir(root_dir) if f.lower().endswith(('.png', '.jpg', '.jpeg'))])
        for idx, filename in enumerate(images):
            self.image_paths.append(os.path.join(root_dir, filename))
            self.labels.append(idx)

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        image = Image.open(self.image_paths[idx]).convert('RGB')
        if self.transform:
            image = self.transform(image)
        return image, self.labels[idx]

class VideoUnderstandingSystem:
    def __init__(self, parts_dir, explanation_text):
        self.parts_dir = parts_dir
        self.explanation_text = explanation_text
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

        # Image preprocessing transforms
        self.classifier_transform = transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
        ])

        # Initialize components
        self.component_db = self.create_component_database()
        self.classifier = self.train_part_classifier()
        self.qa_processor = self.create_knowledge_graph()

        # Separate feature extractor for images
        self.image_feature_extractor = models.resnet18(pretrained=True).to(self.device)
        self.image_feature_extractor.eval()

        # Text feature extractor
        self.text_feature_extractor = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

        # Question generator
        self.question_generator = pipeline(
            "text2text-generation",
            model="mrm8488/t5-base-finetuned-question-generation-ap",
            device=0 if torch.cuda.is_available() else -1,
            num_beams=4,
            do_sample=True,
            temperature=0.7
        )

    def train_part_classifier(self):
        transform = transforms.Compose([
            transforms.Resize(256),
            transforms.RandomRotation(30),
            transforms.RandomHorizontalFlip(),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
        ])

        dataset = PartDataset(PARTS_DIR, transform=transform)
        loader = DataLoader(dataset, batch_size=32, shuffle=True)

        model = EnhancedPartClassifier(len(dataset)).to(self.device)
        criterion = nn.CrossEntropyLoss()
        optimizer = optim.Adam(model.parameters(), lr=0.0001)

        # Training loop
        for epoch in range(15):
            model.train()
            for images, labels in loader:
                images, labels = images.to(self.device), labels.to(self.device)
                optimizer.zero_grad()
                outputs = model(images)
                loss = criterion(outputs, labels)
                loss.backward()
                optimizer.step()

        return model.eval()
    def analyze_movement(self, prev_detection, current_detection):
        """Simple movement analysis between frames"""
        if prev_detection['part_id'] == current_detection['part_id']:
            return "Stationary" if abs(current_detection['similarity'] - prev_detection['similarity']) < 0.1 else "Moving"
        return "Different component detected"

    def get_image_features(self, image):
        """Extract features using ResNet18"""
        if isinstance(image, Image.Image):
            image = self.classifier_transform(image).unsqueeze(0)

        with torch.no_grad():
            features = self.image_feature_extractor(image.to(self.device))
        return features.squeeze()

    def compare_with_reference(self, image, part_id):
        """Compare with reference image using ResNet features"""
        try:
            # Process input image
            query_features = self.get_image_features(image)

            # Load reference image
            ref_path = os.path.join(self.parts_dir, f"{part_id}.jpg")
            if not os.path.exists(ref_path):
                return 0.0

            ref_image = Image.open(ref_path).convert('RGB')
            ref_features = self.get_image_features(ref_image)

            # Calculate cosine similarity
            cos_sim = torch.nn.CosineSimilarity(dim=0)
            return cos_sim(query_features, ref_features).item()
        except Exception as e:
            print(f"Error comparing images: {e}")
            return 0.0

    def create_component_database(self):
        return {
          "part_01": {
              "name": "DC Motor/Generator",
              "function": "Converts electrical energy to mechanical energy and vice versa",
              "video_behavior": "Rotates when powered, generates electricity when spun"
          },
          "part_02": {
              "name": "Battery",
              "function": "Provides electrical power to the system",
              "video_behavior": "Supplies energy to the circuit, enabling the motor to rotate"
          },
          "part_03": {
              "name": "LED Indicator",
              "function": "Indicates power status or voltage level",
              "video_behavior": "Glows when the circuit is complete and power is supplied"
          },
          "part_04": {
              "name": "Switch",
              "function": "Controls the flow of electricity in the circuit",
              "video_behavior": "Turns the motor on or off when toggled"
          },
          "part_05": {
              "name": "Resistor",
              "function": "Limits current flow and protects components",
              "video_behavior": "Remains stationary, ensures stable current supply"
          }
      }

    def create_knowledge_graph(self):
        sentences = nltk.sent_tokenize(self.explanation_text)
        cleaned_sentences = [s.strip() for s in sentences if len(s) > 15]

        for part in self.component_db.values():
            cleaned_sentences.append(f"{part['name']}: {part['function']}")
            cleaned_sentences.append(f"Behavior: {part['video_behavior']}")

        return QAProcessor(cleaned_sentences)

    def analyze_video(self, video_path):
      cap = cv2.VideoCapture(video_path)
      results = []

      if not cap.isOpened():
          raise ValueError(f"Could not open video file: {video_path}")

      try:
          while cap.isOpened():
              ret, frame = cap.read()
              if not ret:
                  break

              pil_img = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
              detection = self.detect_components(pil_img)

              if len(results) > 0:
                  detection['movement'] = self.analyze_movement(results[-1], detection)

              results.append(detection)
      except Exception as e:
          print(f"Error processing video: {e}")
      finally:
          cap.release()

      if not results:
          print("No frames processed. Check video file format and contents.")

      return results

    def detect_components(self, image):
        transformed = self.classifier_transform(image).unsqueeze(0).to(self.device)

        with torch.no_grad():
            outputs = self.classifier(transformed)
            probs = torch.nn.functional.softmax(outputs, dim=1)
            conf, pred = torch.max(probs, 1)

        part_id = f"part_{pred.item()+1:02d}"
        return {
            'part_id': part_id,
            'confidence': conf.item(),
            'component_info': self.component_db.get(part_id, {}),
            'similarity': self.compare_with_reference(image, part_id)
        }
    def generate_questions(self, num=5):
        # Context-aware question generation
        return self.question_generator(
            "Generate technical questions about this physics demonstration: " + self.explanation_text,
            max_length=128,
            num_return_sequences=num,
            temperature=0.9,
            repetition_penalty=2.0
        )

    # Rest of the methods...

class QAProcessor:
    def __init__(self, knowledge_base):
        self.model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
        self.knowledge = knowledge_base
        self.embeddings = self.model.encode(self.knowledge)

    def answer_question(self, question):
        question_embed = self.model.encode(question)
        scores = util.cos_sim(question_embed, self.embeddings)[0]
        top_3 = torch.topk(scores, k=3)
        return "\n".join([self.knowledge[i] for i in top_3.indices])

def main():
    system = VideoUnderstandingSystem(PARTS_DIR, EXPLANATION_TEXT)

    # Example usage
    video_results = system.analyze_video("/content/video.mp4")
    print("First frame:", video_results[0])

    questions = system.generate_questions(3)
    print("\nGenerated Questions:")
    for i, q in enumerate(questions, 1):
        print(f"{i}. {q['generated_text']}")

    # Q&A loop
    while True:
        question = input("\nAsk about the system (q to quit): ")
        if question.lower() == 'q':
            break
        print("Answer:", system.qa_processor.answer_question(question))

if __name__ == "__main__":
    main()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
Device set to use cuda:0


First frame: {'part_id': 'part_04', 'confidence': 0.43060410022735596, 'component_info': {'name': 'Switch', 'function': 'Controls the flow of electricity in the circuit', 'video_behavior': 'Turns the motor on or off when toggled'}, 'similarity': 0.0}

Generated Questions:
1. question: What are the technical questions about this model?
2. question: What are the technical questions about the DC motor?
3. question: What are the technical questions about the model?

Ask about the system (q to quit): WHAT DOES THE MOTOR DOES IN THIS MODEL
Answer: The DC motor in the model also works as a generator here, lets see how?
Behavior: Supplies energy to the circuit, enabling the motor to rotate
DC Motor/Generator: Converts electrical energy to mechanical energy and vice versa

Ask about the system (q to quit): WHAT IS WEIGHT DOING IN THIS
Answer: This turning of the gear, in turn rotates the associated gear and pulls the weight against gravity.
When the supply from the battery is turned off, the fo