<a href="https://colab.research.google.com/github/sangramkesharidash/Google-Colab-files/blob/main/TradeVision_YouTube_ConCall_Sentiment1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# -------------------------------------------------
# 1. Install & Import
# -------------------------------------------------
!pip install -q yt-dlp openai-whisper ffmpeg-python

import yt_dlp, os, re, shutil, subprocess
from google.colab import drive
drive.mount('/content/drive')

# -------------------------------------------------
# 2. SETTINGS
# -------------------------------------------------
base_directory = "/content/drive/My Drive/DREAM-PROJECT/YOUTUBE/FILES"
os.makedirs(base_directory, exist_ok=True)

youtube_url = "https://www.youtube.com/watch?v=XEV2Owi-7B4"

# -------------------------------------------------
# 3. Extract Video ID
# -------------------------------------------------
video_id = re.search(r"v=([a-zA-Z0-9_-]{11})", youtube_url).group(1)
print(f"Video ID: {video_id}")

# -------------------------------------------------
# 4. CREATE FRESH FOLDER
# -------------------------------------------------
video_dir = os.path.join(base_directory, video_id)
if os.path.exists(video_dir):
    print(f"Removing old data: {video_dir}")
    shutil.rmtree(video_dir)
os.makedirs(video_dir, exist_ok=True)

# -------------------------------------------------
# 5. DOWNLOAD FRESH AUDIO (FORCE NO CACHE)
# -------------------------------------------------
def download_audio(url, path):
    ydl_opts = {
        'format': 'bestaudio/best',
        'outtmpl': os.path.join(path, '%(title)s.%(ext)s'),
        'quiet': False,
        'no_warnings': False,
        'ignoreerrors': False,
        'retries': 10,
        'fragment_retries': 10,
        'extractor_retries': 10,
    }
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        info = ydl.extract_info(url, download=True)
        return info['title'], info['ext']

print("Downloading fresh audio...")
title, ext = download_audio(youtube_url, base_directory)
print(f"Downloaded: {title}.{ext}")

# -------------------------------------------------
# 6. Move to video folder
# -------------------------------------------------
downloaded_file = f"{title}.{ext}"
old_path = os.path.join(base_directory, downloaded_file)
new_path = os.path.join(video_dir, downloaded_file.replace(" ", "_"))
shutil.move(old_path, new_path)
print(f"Moved to: {new_path}")

# -------------------------------------------------
# 7. Convert to MP3
# -------------------------------------------------
def to_mp3(inp, out):
    subprocess.run([
        'ffmpeg', '-i', inp, '-q:a', '0', '-map', 'a', out, '-y'
    ], check=True, stdout=subprocess.DEVNULL)

audio_mp3 = os.path.join(video_dir, "audio.mp3")
to_mp3(new_path, audio_mp3)
print(f"MP3: {audio_mp3}")

# -------------------------------------------------
# 8. Split & Transcribe
# -------------------------------------------------
def split_audio(mp3, sec=600, prefix="seg"):
    pattern = os.path.join(video_dir, f"{prefix}%03d.mp3")
    subprocess.run([
        'ffmpeg', '-i', mp3, '-f', 'segment', '-segment_time', str(sec),
        '-c', 'copy', pattern, '-y'
    ], check=True, stdout=subprocess.DEVNULL)

split_audio(audio_mp3)

def transcribe(prefix="seg"):
    i = 0
    while True:
        f = os.path.join(video_dir, f"{prefix}{i:03d}.mp3")
        if not os.path.exists(f): break
        print(f"Transcribing {os.path.basename(f)}...")
        subprocess.run([
            'whisper', f, '--model', 'base', '--language', 'en',
            '--output_dir', video_dir, '--output_format', 'txt'
        ], check=True)
        i += 1
    print("Done!")

transcribe()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Video ID: XEV2Owi-7B4
Removing old data: /content/drive/My Drive/DREAM-PROJECT/YOUTUBE/FILES/XEV2Owi-7B4
Downloading fresh audio...
[youtube] Extracting URL: https://www.youtube.com/watch?v=XEV2Owi-7B4
[youtube] XEV2Owi-7B4: Downloading webpage
[youtube] XEV2Owi-7B4: Downloading android sdkless player API JSON
[youtube] XEV2Owi-7B4: Downloading tv client config
[youtube] XEV2Owi-7B4: Downloading tv player API JSON
[youtube] XEV2Owi-7B4: Downloading web safari player API JSON
[youtube] XEV2Owi-7B4: Downloading player c6d7bdc9-main


         player = https://www.youtube.com/s/player/c6d7bdc9/player_ias.vflset/en_US/base.js
         n = i9jePnzzQFuBjAoN ; player = https://www.youtube.com/s/player/c6d7bdc9/player_ias.vflset/en_US/base.js
         Please report this issue on  https://github.com/yt-dlp/yt-dlp/issues?q= , filling out the appropriate issue template. Confirm you are on the latest version using  yt-dlp -U


[youtube] XEV2Owi-7B4: Downloading m3u8 information
[info] XEV2Owi-7B4: Downloading 1 format(s): 251
[download] /content/drive/My Drive/DREAM-PROJECT/YOUTUBE/FILES/Happiest Minds Technologies  Q2FY26 Earnings Concall.webm has already been downloaded
[download] 100% of   49.29MiB
Downloaded: Happiest Minds Technologies  Q2FY26 Earnings Concall.webm
Moved to: /content/drive/My Drive/DREAM-PROJECT/YOUTUBE/FILES/XEV2Owi-7B4/Happiest_Minds_Technologies__Q2FY26_Earnings_Concall.webm
MP3: /content/drive/My Drive/DREAM-PROJECT/YOUTUBE/FILES/XEV2Owi-7B4/audio.mp3
Transcribing seg000.mp3...
Transcribing seg001.mp3...
Transcribing seg002.mp3...
Transcribing seg003.mp3...
Transcribing seg004.mp3...
Transcribing seg005.mp3...
Transcribing seg006.mp3...
Done!


In [2]:
import os

video_dir = "/content/drive/My Drive/DREAM-PROJECT/YOUTUBE/FILES/XEV2Owi-7B4"

if not os.path.exists(video_dir):
    print("FOLDER NOT FOUND!")
    print("Run the FULL pipeline again (download + transcribe) WITHOUT deleting the folder.")
else:
    print(f"Folder exists: {video_dir}")
    print("Files inside:")
    !ls -la "{video_dir}" | head -20

FOLDER NOT FOUND!
Run the FULL pipeline again (download + transcribe) WITHOUT deleting the folder.


In [6]:
# =============================================
# LIST ALL FILES & FOLDERS UNDER YOUTUBE/FILES
# =============================================
import os
from datetime import datetime
import os, re, shutil, subprocess
from google.colab import drive
drive.mount('/content/drive')
base_path = "/content/drive/My Drive/DREAM-PROJECT/YOUTUBE/FILES"

print(f"Scanning: {base_path}\n")
print("=" * 80)
print(f"{'TYPE':<6} {'SIZE (MB)':>10} {'MODIFIED':<20} {'NAME'}")
print("=" * 80)

total_dirs = 0
total_files = 0
total_size = 0

# Walk through all subdirectories
for root, dirs, files in os.walk(base_path):
    # Count directories
    total_dirs += len(dirs)

    # Print each file
    for f in files:
        file_path = os.path.join(root, f)
        try:
            stat = os.stat(file_path)
            size_mb = stat.st_size / (1024 * 1024)  # MB
            mtime = datetime.fromtimestamp(stat.st_mtime).strftime("%b %d %I:%M %p")
            rel_path = os.path.relpath(file_path, base_path)
            file_type = "FILE" if os.path.isfile(file_path) else "DIR "

            print(f"{file_type:<6} {size_mb:9.2f}  {mtime:<20} {rel_path}")
            total_files += 1
            total_size += stat.st_size
        except:
            print(f"{'ERR ':<6} {'-':>10}  {'-':<20} {rel_path}")

# Print summary
print("=" * 80)
print(f"TOTAL DIRECTORIES : {total_dirs}")
print(f"TOTAL FILES       : {total_files}")
print(f"TOTAL SIZE        : {total_size / (1024*1024):.2f} MB")
print(f"LOCATION          : {base_path}")

Mounted at /content/drive
Scanning: /content/drive/My Drive/DREAM-PROJECT/YOUTUBE/FILES

TYPE    SIZE (MB) MODIFIED             NAME
FILE        9.25  Dec 15 01:25 PM      audio.mp3
FILE       19.30  Nov 01 06:50 PM      What is MCP？ Integrate AI Agents with Databases & APIs.mp4
FILE        5.17  Nov 01 07:09 PM      What_is_MCP_Integrate_AI_Agents_with_Databases__APIs.mp3
FILE       40.11  Nov 01 07:16 PM      TTK Prestige Q2FY26 Earnings Concall.webm
FILE        7.89  Nov 01 07:16 PM      CaeFzbJynWY/Best_Stocks_To_Buy_Now____YesBank_Ready_To_Blast_____Biggest_Swing_Trade_Opportunity_in_year__24-25.webm
FILE       10.79  Nov 01 07:16 PM      CaeFzbJynWY/audio.mp3
FILE       10.79  Nov 01 07:16 PM      CaeFzbJynWY/seg000.mp3
FILE        0.00  Nov 01 07:18 PM      CaeFzbJynWY/seg000.txt
FILE       49.29  Nov 01 07:28 PM      XEV2Owi-7B4/Happiest_Minds_Technologies__Q2FY26_Earnings_Concall.webm
FILE       42.53  Nov 01 07:29 PM      XEV2Owi-7B4/audio.mp3
FILE        6.80  Nov 01 07:29 P

In [7]:
# =============================================
# COMBINE ALL TRANSCRIPTION SEGMENTS
# =============================================
import os

# --- CONFIGURATION ---
video_id = "XEV2Owi-7B4"
base_dir = "/content/drive/My Drive/DREAM-PROJECT/YOUTUBE/FILES"
video_dir = os.path.join(base_dir, video_id)

# Output file
full_transcript_path = os.path.join(video_dir, "FULL_HAPPIEST_MINDS_Q2FY26_TRANSCRIPT.txt")

print("Combining all .txt segments into one file...\n")

# --- COMBINE LOGIC ---
with open(full_transcript_path, "w", encoding="utf-8") as full_file:
    full_file.write("HAPPIEST MINDS TECHNOLOGIES\n")
    full_file.write("Q2 FY26 EARNINGS CONFERENCE CALL - FULL TRANSCRIPT\n")
    full_file.write("=" * 85 + "\n\n")

    segment_count = 0
    total_chars = 0

    for i in range(100):  # Support up to 100 segments
        txt_file = os.path.join(video_dir, f"seg{i:03d}.txt")
        if not os.path.exists(txt_file):
            break

        with open(txt_file, "r", encoding="utf-8") as seg:
            content = seg.read().strip()
            if not content:
                continue

            start_min = i * 10
            end_min = (i + 1) * 10
            full_file.write(f"[{start_min:02d}:00 - {end_min:02d}:00]  SEGMENT {i:03d}\n")
            full_file.write("-" * 60 + "\n")
            full_file.write(content + "\n\n")

            segment_count += 1
            total_chars += len(content)

    # --- SUMMARY ---
    full_file.write("\n" + "=" * 85 + "\n")
    full_file.write("TRANSCRIPTION SUMMARY\n")
    full_file.write("=" * 85 + "\n")
    full_file.write(f"Total Segments: {segment_count}\n")
    full_file.write(f"Total Duration: {segment_count * 10} minutes\n")
    full_file.write(f"Total Characters: {total_chars:,}\n")
    full_file.write(f"Estimated Words: {total_chars // 5:,}\n")
    full_file.write(f"Generated on: {__import__('datetime').datetime.now().strftime('%Y-%m-%d %I:%M %p')}\n")

print(f"Combined transcript saved!\n")
print(f"File: {full_transcript_path}\n")
print(f"Segments: {segment_count} | Duration: {segment_count * 10} min | Words: ~{total_chars // 5:,}")

Combining all .txt segments into one file...

Combined transcript saved!

File: /content/drive/My Drive/DREAM-PROJECT/YOUTUBE/FILES/XEV2Owi-7B4/FULL_HAPPIEST_MINDS_Q2FY26_TRANSCRIPT.txt

Segments: 7 | Duration: 70 min | Words: ~10,354


In [13]:
# =============================================
# DUAL LLM SUMMARISER – AUTO MODEL NAME IN FILENAME
# =============================================
!pip install -q google-generativeai transformers torch accelerate

import os, textwrap, time, torch
import google.generativeai as genai
from transformers import pipeline

# -------------------------------------------------
# 1. CONFIG
# -------------------------------------------------
GEMINI_API_KEY = "AIzaSyBwues9VXjn5ZOG3wu8asYlNW_JV9mKKq0"
genai.configure(api_key=GEMINI_API_KEY)

VIDEO_ID = "XEV2Owi-7B4"
BASE_DIR = "/content/drive/My Drive/DREAM-PROJECT/YOUTUBE/FILES"
VIDEO_DIR = os.path.join(BASE_DIR, VIDEO_ID)
TRANSCRIPT = os.path.join(VIDEO_DIR, "FULL_HAPPIEST_MINDS_Q2FY26_TRANSCRIPT.txt")

# -------------------------------------------------
# 2. AUTO-DETECT GEMINI MODEL
# -------------------------------------------------
print("Detecting available Gemini model...")
models = [m for m in genai.list_models() if 'generateContent' in m.supported_generation_methods]
gemini_model_name = None
for m in models:
    if "flash" in m.name.lower() or "pro" in m.name.lower():
        gemini_model_name = m.name
        break
if not gemini_model_name and models:
    gemini_model_name = models[0].name

if gemini_model_name:
    print(f"Using Gemini: {gemini_model_name}")
    gemini_model = genai.GenerativeModel(gemini_model_name)
    gemini_clean_name = gemini_model_name.split("/")[-1].replace(":", "_")  # e.g., gemini-1.5-flash-001
else:
    print("No Gemini model available. Skipping Gemini.")
    gemini_model = None
    gemini_clean_name = "gemini_unavailable"

# Flan model name
flan_model_name = "flan-t5-large"

# Output paths with MODEL NAME
GEMINI_OUT = os.path.join(VIDEO_DIR, f"EXECUTIVE_SUMMARY_{gemini_clean_name}.txt")
FLAN_OUT   = os.path.join(VIDEO_DIR, f"EXECUTIVE_SUMMARY_{flan_model_name}.txt")

# -------------------------------------------------
# 3. LOAD & SPLIT
# -------------------------------------------------
with open(TRANSCRIPT, "r", encoding="utf-8") as f:
    full_text = f.read()
print(f"\nTranscript: {len(full_text):,} chars")

CHUNK_SIZE = 4000
chunks = textwrap.wrap(full_text, CHUNK_SIZE)
print(f"Split: {len(chunks)} chunks\n")

# -------------------------------------------------
# 4. GEMINI SUMMARISATION
# -------------------------------------------------
if gemini_model:
    print(f"GEMINI ({gemini_clean_name}) summarising...")
    gemini_sums = []
    for i, c in enumerate(chunks, 1):
        print(f"  Chunk {i}/{len(chunks)}")
        prompt = f"Summarise in 3-5 bullets. Keep revenue, EBITDA, PAT, growth, guidance:\n\n{c}"
        try:
            resp = gemini_model.generate_content(prompt)
            gemini_sums.append(resp.text.strip())
        except Exception as e:
            gemini_sums.append(f"[ERROR: {e}]")
        time.sleep(1.5)

    final_prompt = "Combine into ONE executive summary (max 300 words):\n\n" + "\n\n".join(gemini_sums)
    print("Final Gemini summary...")
    try:
        final_gemini = gemini_model.generate_content(final_prompt).text.strip()
    except Exception as e:
        final_gemini = f"[FINAL ERROR: {e}]"
else:
    final_gemini = "[GEMINI UNAVAILABLE]"

# Save Gemini
with open(GEMINI_OUT, "w", encoding="utf-8") as f:
    f.write(f"HAPPIEST MINDS Q2 FY26 – {gemini_clean_name.upper()}\n")
    f.write("="*70 + "\n\n" + final_gemini)
    f.write(f"\n\nGenerated: {time.strftime('%Y-%m-%d %H:%M')} UTC")
print(f"GEMINI SAVED: {GEMINI_OUT}")

# -------------------------------------------------
# 5. FLAN-T5 SUMMARISATION
# -------------------------------------------------
print(f"\nFLAN-T5 ({flan_model_name}) summarising...")
flan_summarizer = pipeline("summarization", model="google/flan-t5-large", device=0 if torch.cuda.is_available() else -1)

flan_sums = []
for i, c in enumerate(chunks, 1):
    print(f"  Chunk {i}/{len(chunks)}")
    try:
        out = flan_summarizer(c[:1000], max_length=150, min_length=50, do_sample=False)[0]["summary_text"]
        flan_sums.append(out)
    except Exception as e:
        flan_sums.append(f"[ERROR: {e}]")
    time.sleep(0.3)

final_flan = "\n\n".join(flan_sums)

# Save Flan
with open(FLAN_OUT, "w", encoding="utf-8") as f:
    f.write(f"HAPPIEST MINDS Q2 FY26 – {flan_model_name.upper()}\n")
    f.write("="*70 + "\n\n" + final_flan)
    f.write(f"\n\nGenerated: {time.strftime('%Y-%m-%d %H:%M')} UTC")
print(f"FLAN SAVED: {FLAN_OUT}")

# -------------------------------------------------
# 6. PREVIEW
# -------------------------------------------------
print("\n" + "="*90)
print(f"GEMINI ({gemini_clean_name}) PREVIEW:")
print("="*90)
print(final_gemini[:1200] + ("..." if len(final_gemini) > 1200 else ""))

print("\n" + "="*90)
print(f"FLAN ({flan_model_name}) PREVIEW:")
print("="*90)
print(final_flan[:1200] + ("..." if len(final_flan) > 1200 else ""))

print(f"\nDONE! Two model-named files saved in:\n{VIDEO_DIR}")

Detecting available Gemini model...
Using Gemini: models/gemini-2.5-pro-preview-03-25

Transcript: 52,906 chars
Split: 14 chunks

GEMINI (gemini-2.5-pro-preview-03-25) summarising...
  Chunk 1/14




  Chunk 2/14




  Chunk 3/14




  Chunk 4/14




  Chunk 5/14




  Chunk 6/14




  Chunk 7/14




  Chunk 8/14




  Chunk 9/14




  Chunk 10/14




  Chunk 11/14




  Chunk 12/14




  Chunk 13/14




  Chunk 14/14




Final Gemini summary...




GEMINI SAVED: /content/drive/My Drive/DREAM-PROJECT/YOUTUBE/FILES/XEV2Owi-7B4/EXECUTIVE_SUMMARY_gemini-2.5-pro-preview-03-25.txt

FLAN-T5 (flan-t5-large) summarising...


Device set to use cpu


  Chunk 1/14


Both `max_new_tokens` (=256) and `max_length`(=150) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


  Chunk 2/14


Both `max_new_tokens` (=256) and `max_length`(=150) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


  Chunk 3/14


Both `max_new_tokens` (=256) and `max_length`(=150) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


  Chunk 4/14


Both `max_new_tokens` (=256) and `max_length`(=150) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


  Chunk 5/14


Both `max_new_tokens` (=256) and `max_length`(=150) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


  Chunk 6/14


Both `max_new_tokens` (=256) and `max_length`(=150) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


  Chunk 7/14


Both `max_new_tokens` (=256) and `max_length`(=150) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


  Chunk 8/14


Both `max_new_tokens` (=256) and `max_length`(=150) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


  Chunk 9/14


Both `max_new_tokens` (=256) and `max_length`(=150) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


  Chunk 10/14


Both `max_new_tokens` (=256) and `max_length`(=150) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


  Chunk 11/14


Both `max_new_tokens` (=256) and `max_length`(=150) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


  Chunk 12/14


Both `max_new_tokens` (=256) and `max_length`(=150) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


  Chunk 13/14


Both `max_new_tokens` (=256) and `max_length`(=150) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


  Chunk 14/14


Both `max_new_tokens` (=256) and `max_length`(=150) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


FLAN SAVED: /content/drive/My Drive/DREAM-PROJECT/YOUTUBE/FILES/XEV2Owi-7B4/EXECUTIVE_SUMMARY_flan-t5-large.txt

GEMINI (gemini-2.5-pro-preview-03-25) PREVIEW:
[FINAL ERROR: 429 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-pro-preview-03-25:generateContent?%24alt=json%3Benum-encoding%3Dint: You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. To monitor your current usage, head to: https://ai.dev/usage?tab=rate-limit. 
* Quota exceeded for metric: generativelanguage.googleapis.com/generate_content_free_tier_requests, limit: 0
* Quota exceeded for metric: generativelanguage.googleapis.com/generate_content_free_tier_requests, limit: 0
* Quota exceeded for metric: generativelanguage.googleapis.com/generate_content_free_tier_input_token_count, limit: 0
* Quota exceeded for metric: generativelanguage.googleapis.com/generate_content_free_tier_inpu