In [None]:
%pip install sentence_transformers umap umap-learn langchain cohere faiss textract moviepy google-cloud-speech pandas

In [None]:
%pip install opencv-python-headless ipywidgets pytube

In [None]:
!pip install git+https://github.com/openai/whisper.git
!sudo apt update && sudo apt install ffmpeg
!pip install --upgrade moviepy
!pip install librosa

import whisper
import time
import librosa
import soundfile as sf
import re
import os
import pandas as pd

# Load the `base.en` and `small.en` Whisper checkpoints, in that order.
base_model, small_model = (
    whisper.load_model(checkpoint) for checkpoint in ("base.en", "small.en")
)

In [None]:
import os
from moviepy.video.io.ffmpeg_tools import ffmpeg_extract_subclip
from moviepy.editor import VideoFileClip

# Source videos are listed from input_folder; extracted clips are written to output_folder.
input_folder, output_folder = "files/", "fragments/"
# exist_ok keeps this cell re-runnable once the directory is already there.
os.makedirs(output_folder, exist_ok=True)

def split_video_into_fragments(input_video_file, fragment_duration, start_index=61):
    """Cut a video into consecutive clips of ``fragment_duration`` seconds.

    Each clip is written to ``output_folder`` as ``fragment_<n>.mov``, with ``n``
    counting up from ``start_index``. Only whole clips are produced: a remainder
    shorter than ``fragment_duration`` at the end of the video is not exported.
    The video length is truncated to whole seconds before splitting.

    Args:
        input_video_file: Path of the video to split.
        fragment_duration: Length of every clip in seconds; must be positive.
        start_index: Number given to the first clip. Defaults to 61.

    Returns:
        The number the next clip would receive. Passing it as ``start_index``
        for another video keeps that video's clips from overwriting these.

    Raises:
        ValueError: If ``fragment_duration`` is not positive.
    """
    if fragment_duration <= 0:
        # A non-positive step would never advance past the end of the video.
        raise ValueError(
            f"fragment_duration must be positive, got {fragment_duration!r}"
        )

    # The clip is opened only to read its length, so release it right away,
    # even if reading the duration fails.
    clip = VideoFileClip(input_video_file)
    try:
        duration = int(clip.duration)
    finally:
        clip.close()

    fragment_num = start_index
    fragment_start = 0

    while fragment_start + fragment_duration <= duration:
        fragment_end = fragment_start + fragment_duration
        output_file = os.path.join(output_folder, f"fragment_{fragment_num}.mov")
        # The output path is passed positionally because the keyword for this
        # argument is named differently in moviepy 1.x and 2.x.
        ffmpeg_extract_subclip(input_video_file, fragment_start, fragment_end, output_file)

        print(f"Processed fragment {fragment_num}")

        fragment_start = fragment_end
        fragment_num += 1

    return fragment_num

# os.listdir returns names in arbitrary order, and every call below numbers its clips from the
# same starting index. Sorting makes the processing order, and therefore which video's clips
# end up on disk, reproducible between runs.
for filename in sorted(os.listdir(input_folder)):
    if filename.endswith(".MOV"):
        input_video_file = os.path.join(input_folder, filename)
        # Each video is cut into 10-second clips.
        split_video_into_fragments(input_video_file, 10)

In [None]:
# Transcribe the audio of fragments 61-65 with the small Whisper model, collecting one
# transcript and one file name per fragment (same order in both lists).
transcriptions = []
file_names = []

# sf.write does not create missing directories, so make sure the target folder exists.
os.makedirs("audio", exist_ok=True)

for i in range(61, 66):
    video_file = f'fragment_{i}.mov'
    video_path = os.path.join("fragments", video_file)
    # splitext drops the extension without assuming it is exactly four characters long.
    stem = os.path.splitext(video_file)[0]
    audio_path = os.path.join("audio", stem + ".wav")

    # Load the fragment's audio resampled to 16 kHz and store it as a WAV file.
    y, sr = librosa.load(video_path, sr=16000)
    sf.write(audio_path, y, sr)

    result = small_model.transcribe(audio_path)
    text = result["text"].strip()

    transcriptions.append(text)
    file_names.append(video_file)

In [None]:
%pip install sentence-transformers
%pip install umap-learn opencv-python Pillow
import umap.umap_ as umap

In [None]:
from transformers import pipeline
import cv2
from PIL import Image
import os

captioner = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")
interval = 10  # seconds between two captioned frames
output_directory = "frame_images"
os.makedirs(output_directory, exist_ok=True)


def _caption_video(video_path, seconds_between_frames):
    """Caption frames sampled from a video and join the captions into one string.

    The first frame is always captioned; after that one frame is captioned every
    ``int(fps) * seconds_between_frames`` frames. If that stride is not positive
    (OpenCV returns 0 for a property the backend cannot report), only the first
    frame is captioned instead of dividing by zero.

    Args:
        video_path: Path of the video file to read.
        seconds_between_frames: Sampling interval in seconds.

    Returns:
        The generated captions separated by single spaces; an empty string when
        no frame could be read.
    """
    cap = cv2.VideoCapture(video_path)
    try:
        stride = int(cap.get(cv2.CAP_PROP_FPS)) * seconds_between_frames
        texts = []
        frame_count = 0

        while True:
            ret, frame = cap.read()
            if not ret:
                break
            is_sampled = frame_count == 0 or (stride > 0 and frame_count % stride == 0)
            if is_sampled:
                # OpenCV decodes to BGR; convert to RGB before building the PIL image.
                image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
                captions = captioner(image)
                if captions:
                    texts.append(captions[0]['generated_text'])
            frame_count += 1
    finally:
        # Release the capture handle even if decoding or captioning raises.
        cap.release()

    return ' '.join(texts)


# One caption string per fragment, in the same 61-65 order as the transcription cell.
clip_results = []

for i in range(61, 66):
    video_file = f'fragment_{i}.mov'
    video_path = "fragments/" + video_file
    clip_results.append(_caption_video(video_path, interval))

In [None]:
# Inspect the caption strings collected for each fragment.
clip_results

In [None]:
# Put the file names, transcripts and frame captions side by side in one table,
# save it as CSV, and leave the table as the cell's displayed value.
df = pd.DataFrame(
    data={
        'File Name': file_names,
        'Transcription': transcriptions,
        'Context': clip_results,
    }
)
df.to_csv(path_or_buf='df2.csv')
df