<a href="https://colab.research.google.com/github/rianachatterjee04/GenAssist/blob/main/Initial_VLM_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Installing and importing dependencies

In [None]:
# Install required packages
!pip install transformers torch torchvision pillow timm
!pip install opencv-python matplotlib
!pip install numpy pandas tqdm

import torch
import time
from transformers import AutoProcessor, AutoModelForVision2Seq
import matplotlib.pyplot as plt
from PIL import Image
import cv2
import numpy as np
import pandas as pd
from datetime import datetime

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

Model Implementation

In [None]:
# Install required packages
!pip install transformers torch torchvision pillow timm
!pip install opencv-python matplotlib
!pip install numpy pandas tqdm

# Import necessary libraries
import torch
import time
from transformers import AutoProcessor, AutoModelForVision2Seq
import matplotlib.pyplot as plt
from PIL import Image
import cv2
import numpy as np
import pandas as pd
from datetime import datetime

class TerrainVQA:
    """Visual question answering helper for terrain and walking-safety questions.

    Wraps a Hugging Face processor/model pair, answers free-form questions
    about an image file, and records how long every inference took.
    """

    def __init__(self, model_name="Efficient-Large-Model/VILA-2.7b"):  # Using VILA-2.7b
        """Initialize the VQA model.

        Args:
            model_name: Hugging Face model id (or local path) handed to
                ``AutoProcessor`` and ``AutoModelForVision2Seq``.
        """
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f"Using device: {self.device}")

        print("Loading model...")
        self.processor = AutoProcessor.from_pretrained(model_name)
        if self.device == "cuda":
            # 4-bit quantized load: device_map="auto" already places the weights,
            # so no .to() call follows (it is redundant here and is rejected by
            # some transformers/bitsandbytes version combinations).
            self.model = AutoModelForVision2Seq.from_pretrained(
                model_name,
                load_in_4bit=True,  # Enable 4-bit quantization
                device_map="auto"
            )
        else:
            # NOTE(review): bitsandbytes 4-bit quantization generally expects a
            # CUDA GPU, so the CPU path loads the unquantized weights instead.
            self.model = AutoModelForVision2Seq.from_pretrained(model_name).to(self.device)
        print("Model loaded successfully!")

        # Wall-clock duration (seconds) of every get_answer() call, in call order.
        self.inference_times = []

    def get_answer(self, image_path, question):
        """Answer one question about the image stored at ``image_path``.

        Args:
            image_path: Path of the image file to load.
            question: Question text passed to the processor with the image.

        Returns:
            dict with ``"answer"`` (decoded model output) and
            ``"inference_time"`` (seconds, including image loading).
        """
        start_time = time.time()

        # Load inside a context manager so the file handle is released, and
        # normalise to RGB so grayscale/RGBA/palette files are handled uniformly.
        with Image.open(image_path) as img:
            image = img.convert("RGB")

        # Prepare inputs
        inputs = self.processor(image, question, return_tensors="pt").to(self.device)

        # Generate answer (sampling is enabled, so output is not deterministic)
        with torch.no_grad():
            outputs = self.model.generate(
                **inputs,
                max_new_tokens=100,
                do_sample=True,
                temperature=0.7,
                top_p=0.9,
            )

        answer = self.processor.decode(outputs[0], skip_special_tokens=True)
        inference_time = time.time() - start_time
        self.inference_times.append(inference_time)

        return {
            "answer": answer,
            "inference_time": inference_time
        }

    def analyze_terrain(self, image_path):
        """Ask a fixed set of terrain/safety questions about one image.

        Each question and its answer are printed as they are produced.

        Returns:
            dict mapping every question to the dict returned by ``get_answer``.
        """
        terrain_questions = [
            "Describe the terrain and surface you see in the image.",
            "List any hazards or obstacles visible in the image.",
            "Describe any wet, muddy, or slippery conditions in the image.",
            "What safety precautions should someone take when walking here?"
        ]

        results = {}
        print("\nAnalyzing terrain...")
        for question in terrain_questions:
            print(f"\nQ: {question}")
            result = self.get_answer(image_path, question)
            results[question] = result
            print(f"A: {result['answer']}")
            print(f"Time: {result['inference_time']:.2f}s")

        return results

    def get_performance_stats(self):
        """Summarise the recorded inference times.

        Returns:
            The string ``"No inference times recorded"`` when nothing has been
            timed yet; otherwise a dict with ``average_time``, ``max_time``,
            ``min_time`` and ``total_inferences``.
        """
        if not self.inference_times:
            return "No inference times recorded"
        return {
            "average_time": np.mean(self.inference_times),
            "max_time": np.max(self.inference_times),
            "min_time": np.min(self.inference_times),
            "total_inferences": len(self.inference_times)
        }

def display_image(image_path):
    """Show the image file at ``image_path`` in a matplotlib figure with the axes hidden."""
    picture = Image.open(image_path)
    canvas_size = (10, 8)  # passed to plt.figure as figsize
    plt.figure(figsize=canvas_size)
    plt.imshow(picture)
    plt.axis('off')
    plt.show()

def save_results(results, filename=None):
    """Write analysis results to a CSV file and echo them to stdout.

    Args:
        results: Mapping of question text to a dict containing ``'answer'``
            and ``'inference_time'`` entries.
        filename: Destination CSV path. When ``None`` a name of the form
            ``terrain_analysis_<YYYYmmdd_HHMMSS>.csv`` is built from the
            current local time.
    """
    if filename is None:
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = f"terrain_analysis_{timestamp}.csv"

    # One record per question, in the mapping's iteration order.
    records = [
        {
            'Question': question,
            'Answer': outcome['answer'],
            'Inference_Time': outcome['inference_time'],
        }
        for question, outcome in results.items()
    ]

    # Persist the table
    table = pd.DataFrame(records)
    table.to_csv(filename, index=False)
    print(f"\nResults saved to {filename}")

    # Human-readable recap of what was written
    print("\nAnalysis Results:")
    for record in table.to_dict("records"):
        print(f"\nQuestion: {record['Question']}")
        print(f"Answer: {record['Answer']}")
        print(f"Time: {record['Inference_Time']:.2f}s")

def create_performance_report(model):
    """Build a formatted timing report from ``model.get_performance_stats()``.

    Args:
        model: Object exposing ``get_performance_stats()``.

    Returns:
        The stats value unchanged when it is a string (the "nothing recorded"
        message); otherwise ``{"Performance Metrics": {...}}`` with the three
        timings rendered to three decimals plus the raw inference count.
    """
    stats = model.get_performance_stats()
    if isinstance(stats, str):
        return stats

    # (report label, stats key) pairs, in the order they appear in the report.
    timing_fields = (
        ("Average Inference Time", 'average_time'),
        ("Maximum Inference Time", 'max_time'),
        ("Minimum Inference Time", 'min_time'),
    )
    metrics = {label: f"{stats[key]:.3f} seconds" for label, key in timing_fields}
    metrics["Total Inferences"] = stats['total_inferences']
    return {"Performance Metrics": metrics}

def test_single_image(image_path, custom_question=None):
    """Run the whole VQA demo on one image.

    Builds a ``TerrainVQA`` instance, shows the image, optionally answers
    ``custom_question``, runs the terrain analysis, saves its results and
    prints the performance report.

    Args:
        image_path: Path of the image to analyse.
        custom_question: Optional extra question; skipped when falsy.
    """
    print("\nInitializing VQA system...")
    engine = TerrainVQA()

    print("\nDisplaying test image...")
    display_image(image_path)

    if custom_question:
        print(f"\nTesting custom question: {custom_question}")
        custom_reply = engine.get_answer(image_path, custom_question)
        print(f"Answer: {custom_reply['answer']}")
        print(f"Inference time: {custom_reply['inference_time']:.2f} seconds")

    print("\nRunning terrain analysis...")
    # Results go straight to the CSV/console writer.
    save_results(engine.analyze_terrain(image_path))

    print("\nPerformance Report:")
    summary = create_performance_report(engine)
    print(summary)

# Main execution: request an upload through Colab, then analyse the first uploaded file
print("Please upload your image...")
from google.colab import files
uploaded = files.upload()
# Iterating the upload mapping yields its keys, which are used here as the image path.
image_path = next(iter(uploaded))
test_single_image(image_path)

Collecting bitsandbytes
  Downloading bitsandbytes-0.45.2-py3-none-manylinux_2_24_x86_64.whl.metadata (5.8 kB)
Downloading bitsandbytes-0.45.2-py3-none-manylinux_2_24_x86_64.whl (69.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m69.7/69.7 MB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.45.2
Please upload your image...


Saving girl.jpg to girl (5).jpg

Initializing VQA system...
Using device: cpu
Loading model...


tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/470 [00:00<?, ?B/s]

OSError: NousResearch/Obsidian-3B-V0.5 does not appear to have a file named preprocessor_config.json. Checkout 'https://huggingface.co/NousResearch/Obsidian-3B-V0.5/tree/main' for available files.

ViLT-B/32 (`dandelin/vilt-b32-finetuned-vqa`):

In [None]:
!pip install transformers torch torchvision pillow accelerate bitsandbytes

import torch
from transformers import ViltProcessor, ViltForQuestionAnswering, AutoProcessor, AutoModelForVision2Seq
from PIL import Image
import time
import pandas as pd
from datetime import datetime

class TerrainAnalyzer:
    """Terrain question answering with the ``dandelin/vilt-b32-finetuned-vqa`` checkpoint.

    Each answer is the single label from ``model.config.id2label`` whose
    logit is highest for the given image/question pair.
    """

    def __init__(self):
        """Pick the device (CUDA when available, else CPU) and load processor and model."""
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f"Using device: {self.device}")

        print("Loading model...")
        # Use a stable VQA model that's publicly available
        self.processor = ViltProcessor.from_pretrained("dandelin/vilt-b32-finetuned-vqa")
        self.model = ViltForQuestionAnswering.from_pretrained("dandelin/vilt-b32-finetuned-vqa").to(self.device)
        print("Model loaded successfully!")

    def analyze_image(self, image_path):
        """Answer a fixed list of terrain questions about one image.

        Prints every question/answer/time, writes them to
        ``terrain_analysis_<YYYYmmdd_HHMMSS>.csv`` in the working directory,
        then prints a recap. Returns nothing.

        Args:
            image_path: Path of the image file to analyse.
        """
        questions = [
            "Describe the terrain and surface you see in the image.",
            "List any hazards or obstacles visible in the image.",
            "Describe any wet, muddy, or slippery conditions in the image.",
            "What safety precautions should someone take when walking here?"
        ]

        results = []
        print("\nAnalyzing terrain...")

        # Load the image once for all questions. The context manager releases
        # the file handle, and convert("RGB") gives the processor a 3-channel
        # image even when the file is grayscale, RGBA or palette based.
        with Image.open(image_path) as img:
            image = img.convert("RGB")

        for question in questions:
            start_time = time.time()

            # Process image and question
            encoding = self.processor(image, question, return_tensors="pt")
            encoding = {k: v.to(self.device) for k, v in encoding.items()}

            # Get answer: highest-scoring label
            with torch.no_grad():
                outputs = self.model(**encoding)
                answer = self.model.config.id2label[outputs.logits.argmax(-1).item()]

            inference_time = time.time() - start_time

            print(f"\nQ: {question}")
            print(f"A: {answer}")
            print(f"Time: {inference_time:.2f}s")

            # "Time" is stored already formatted, e.g. "1.71s".
            results.append({
                "Question": question,
                "Answer": answer,
                "Time": f"{inference_time:.2f}s"
            })

        # Save results
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = f"terrain_analysis_{timestamp}.csv"
        df = pd.DataFrame(results)
        df.to_csv(filename, index=False)
        print(f"\nResults saved to {filename}")

        print("\nAnalysis Results:")
        for result in results:
            print(f"\nQuestion: {result['Question']}")
            print(f"Answer: {result['Answer']}")
            print(f"Time: {result['Time']}")

# Run the analysis: request an upload through Colab and analyse the first uploaded file
print("Please upload your image...")
from google.colab import files
uploaded = files.upload()
# Iterating the upload mapping yields its keys, which are used here as the image path.
image_path = next(iter(uploaded))

analyzer = TerrainAnalyzer()
analyzer.analyze_image(image_path)

Please upload your image...


Saving girl.jpg to girl (8).jpg
Using device: cpu
Loading model...


preprocessor_config.json:   0%|          | 0.00/251 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/320 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/136k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/470M [00:00<?, ?B/s]

Model loaded successfully!

Analyzing terrain...


model.safetensors:   0%|          | 0.00/470M [00:00<?, ?B/s]


Q: Describe the terrain and surface you see in the image.
A: sidewalk
Time: 1.71s

Q: List any hazards or obstacles visible in the image.
A: no
Time: 1.76s

Q: Describe any wet, muddy, or slippery conditions in the image.
A: no
Time: 0.88s

Q: What safety precautions should someone take when walking here?
A: safety
Time: 1.24s

Results saved to terrain_analysis_20250224_001227.csv

Analysis Results:

Question: Describe the terrain and surface you see in the image.
Answer: sidewalk
Time: 1.71s

Question: List any hazards or obstacles visible in the image.
Answer: no
Time: 1.76s

Question: Describe any wet, muddy, or slippery conditions in the image.
Answer: no
Time: 0.88s

Question: What safety precautions should someone take when walking here?
Answer: safety
Time: 1.24s


New approach: BLIP image captioning (`Salesforce/blip-image-captioning-base`)

In [None]:
import torch
from transformers import AutoProcessor, AutoModelForVision2Seq
from PIL import Image
import time
import pandas as pd
from datetime import datetime
import os

class TerrainAnalyzer:
    """Prompted image description using an ``AutoModelForVision2Seq`` checkpoint.

    Every question is passed to the processor as text alongside the image and
    the generated token sequence is decoded as the answer.
    """

    def __init__(self, model_name="Salesforce/blip-image-captioning-base"):
        """Load the processor and model for ``model_name``.

        Args:
            model_name: Hugging Face model id or local path.

        Raises:
            Exception: Any loading error is printed and then re-raised.
        """
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f"Using device: {self.device}")

        print(f"Loading model {model_name}...")

        try:
            self.processor = AutoProcessor.from_pretrained(model_name)
            self.model = AutoModelForVision2Seq.from_pretrained(model_name).to(self.device)
            print("Model loaded successfully!")
        except Exception as e:
            print(f"Error loading model: {e}")
            raise

    def analyze_image(self, image_path):
        """Generate an answer for each built-in question about one image.

        Prints every question/answer/time, writes them to
        ``image_analysis_<YYYYmmdd_HHMMSS>.csv`` in the working directory,
        then prints a recap. Returns nothing.

        Args:
            image_path: Path of the image file to analyse.
        """
        questions = [
            "Describe the terrain and surface in this image.",
            "What potential hazards or obstacles can be seen?",
            "Are there any notable environmental conditions?",
            "What safety considerations should be taken into account?"
        ]

        results = []
        print("\nAnalyzing image...")

        # Open the image inside a context manager so the file handle is
        # released; convert("RGB") yields an in-memory copy that remains valid
        # after the file is closed and normalises grayscale/RGBA/palette input.
        with Image.open(image_path) as img:
            image = img.convert("RGB")

        for question in questions:
            start_time = time.time()

            # Prepare inputs
            inputs = self.processor(images=image, text=question, return_tensors="pt").to(self.device)

            # Generate response
            with torch.no_grad():
                outputs = self.model.generate(**inputs, max_new_tokens=50)

            # Decode the response
            answer = self.processor.decode(outputs[0], skip_special_tokens=True)
            inference_time = time.time() - start_time

            print(f"\nQ: {question}")
            print(f"A: {answer}")
            print(f"Time: {inference_time:.2f}s")

            # "Time" is stored already formatted, e.g. "4.16s".
            results.append({
                "Question": question,
                "Answer": answer,
                "Time": f"{inference_time:.2f}s"
            })

        # Save results
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = f"image_analysis_{timestamp}.csv"
        df = pd.DataFrame(results)
        df.to_csv(filename, index=False)
        print(f"\nResults saved to {filename}")

        print("\nAnalysis Results:")
        for result in results:
            print(f"\nQuestion: {result['Question']}")
            print(f"Answer: {result['Answer']}")
            print(f"Time: {result['Time']}")

def main():
    """Prompt for an image path and run the terrain analysis on it.

    Prints an error and stops when the path does not exist. Any exception
    raised while building or running the analyzer is reported on stdout
    instead of propagating.
    """
    # Ask the user where the image lives
    requested_path = input("Please enter the path to the image: ")

    if os.path.exists(requested_path):
        # Build the analyzer and run it, reporting failures instead of raising
        try:
            TerrainAnalyzer().analyze_image(requested_path)
        except Exception as err:
            print(f"An error occurred: {err}")
    else:
        # Nothing to analyse
        print(f"Error: Image file {requested_path} does not exist.")

# Run main() only when this code executes as the top-level module.
if __name__ == "__main__":
    main()

Please enter the path to the image: girl.jpg
Using device: cpu
Loading model Salesforce/blip-image-captioning-base...


preprocessor_config.json:   0%|          | 0.00/287 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/506 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/4.56k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

Model loaded successfully!

Analyzing image...

Q: Describe the terrain and surface in this image.
A: describe the terrain and surface in this image. girl crossing the street
Time: 4.16s

Q: What potential hazards or obstacles can be seen?
A: what potential hazards or obstacles can be seen? a girl crossing a street
Time: 3.22s

Q: Are there any notable environmental conditions?
A: are there any notable environmental conditions? school uniforms, school uniforms, school uniforms, school uniforms, school uniform, school uniform, school uniforms, school uniforms for girls, school uniforms, school uniform design, school uniform design, school uniform design, school, school uniform design, school, school
Time: 14.90s

Q: What safety considerations should be taken into account?
A: what safety considerations should be taken into account?
Time: 1.89s

Results saved to image_analysis_20250224_002229.csv

Analysis Results:

Question: Describe the terrain and surface in this image.
Answer: describ