In [3]:
from create_captions import CreateCaption
from pyannote.audio import Pipeline
from utils import download_youtube_video
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel, PeftConfig
from utils import SpeakerDiarization
import whisper
import torch

# Please provide Video details and tokens below

In [4]:
# Video link to summarize
VIDEO_LINK = "https://www.youtube.com/watch?v=O-IitVoKASo"
# Authentication token from hugging face to load VLM
PYAN_AUTH_TOKEN = "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"
# VLM model to be used 'Salesforce/blip2-flan-t5-xxl' or '"liuhaotian/llava-v1.5-7b'
VLM_MODEL = "Salesforce/blip2-flan-t5-xxl"

# Loading Speaker diarization module

In [5]:
device = torch.device("cuda")


def load_models():
    """
    Load Whisper and pyannote audio models for processing transcript.
    """

    model = whisper.load_model("base")
    pipeline = Pipeline.from_pretrained(
        "pyannote/speaker-diarization",
        use_auth_token=PYAN_AUTH_TOKEN,
    ).to(device)
    sd = SpeakerDiarization(model, pipeline)
    return sd


sd = load_models()

In [None]:
video_path = download_youtube_video(VIDEO_LINK, "video.mp4")

In [4]:
# Loading VLM will take some time for the first time run to download weights

cc = CreateCaption(
    device,
    gm_loc="gmflow_sintel-0c07dcb3.pth",
    model_type=VLM_MODEL,
    frames_to_skip=25,  # number of frames to skip before analysing default is 5 but increase to process faster
    batch_size=64,  # Decrease batch size if you have low gpu VRAM
)
get_captions = lambda video_loc: cc.captions(video_loc)

Loading GMFLow
Loading VLM


The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]

Loaded Models


In [5]:
# get captions for important frames
caps = get_captions(r"videos/video.mp4")

[---------------------------------------->] 
  
             Total frames read 1120/5668 
             time = 186.85333333333332
             good frames =45
             1/1 videos 
time 0.0s 
time_spent 199.0641849040985s


In [6]:
# Delete CreateCaptions object to free VRAM if needed
# del cc
# torch.cuda.empty_cache()

In [7]:
# Get access to LLama 2 - 13B
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-13b-chat-hf")
original_model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-13b-chat-hf", load_in_4bit=True, device_map="auto"
)

# Load PEFT adapter
peft_model_id = "Basha738/outputs"
config = PeftConfig.from_pretrained(peft_model_id)
adapter_model = PeftModel.from_pretrained(original_model, peft_model_id)

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [9]:
from transformers import set_seed


def gen(text):
    toks = tokenizer(text, return_tensors="pt").to("cuda")

    set_seed(32)
    adapter_model.eval()
    with torch.no_grad():
        out = adapter_model.generate(
            **toks,
            max_new_tokens=350,
            top_k=5,
            do_sample=True,
        )
    return tokenizer.decode(
        out[0][len(toks["input_ids"][0]) :], skip_special_tokens=True
    )

In [10]:
template = """### Instruction
Given the transcript of a video with identified speakers as "Speaker 1: ..." and "Speaker 2: ...", along with captions highlighting key moments in the video, your objective is to create a concise summary of the video's content. Your analysis will consider both the transcript and the captions, without explicitly referring to them.

Please pay attention to the primary speaker. If speaker names are mentioned in the captions, please infer them. However, if names are not clear, proceed to generate the summary without assuming them as "Speaker 1" or "Speaker 2".

Based on the tone of the video, change the tone in the square brackets to one of the following:
['Informative', 'Neutral', 'Relaxation', 'Encouraging', 'Enthusiastic', 'Frustration', 'Cautious', 'Sarcasm', 'Optimistic']

Based on the category of the video, change the category in the square brackets to one of the following:
['Finance', 'Investment', 'Business', 'Entrepreneurship', 'Branding', 'Macroeconomics', 'Real Estate ', 'Economical Situation', 'Entertainment Market', 'Business Ethics', 'Real Estate']

# Transcript:
{transcript}

# Captions:
{captions}

### Response:
Summary:
```"""

In [12]:
script = sd.get_script("videos/video.mp4")

In [13]:
print(script)


SPEAKER_00: just a few minutes away from the closing bell now. Let's bring in Yahoo Finance's year at Blikery to break down the market action. Hello, my friend. What do we
SPEAKER_01: here? We're seeing some outperformance by the NASDAQ yet again. And this is what's happening this year. NASDAQ up 2 % more than doubling the Dow's performance of only 0 .8 % S &P 500 up 1 and a quarter percent Russell 2000 somewhere in the middle. Now you're going to see tech that's XLK and the upper left there. That is up over 2 % followed by communication services. And communication, excuse me, discretionary. There we go. Discretionary category is up 1 .6%. Now this is interesting because guess what? The best performing sector of the year is it's XLC. That's communication services. Yet on a trailing one -year basis, you'll see that this is the worst sectors, down 25 % down considerably more last year. So let me just show you what's going on inside the sector here. Also formally the telecommunication se

In [17]:
print(gen(template.format(transcript=script, captions={x[0] for x in caps})))


The market is seeing outperformance by the NASDAQ, with the communication services sector leading the way. The best performing sector of the year is XLC, despite being one of the worst performers last year. Bitcoin has seen a significant rally, up nearly 50% from its lows. E-commerce is picking up, with Shopify having its best day since November. The retail sector is also doing well, with Shopify up 25% this year and 85% off its fourth-quarter low.
```


```
Category : 'Finance'
Tone : 'Informative'
```



# Enter your youtube link below

In [None]:
VIDEO_LINK = "<ENTER VIDEO LINK TO PROCESS>"
video_path = download_youtube_video(VIDEO_LINK, "video_2.mp4")
caps = get_captions(r"videos/video_2.mp4")
script = sd.get_script("videos/video_2.mp4")
print(gen(template.format(transcript=script, captions={x[0] for x in caps})))