# In [None]:
import os
import re
import time
import warnings
# Suppress urllib3 OpenSSL warning (common on macOS with LibreSSL)
# This warning appears when urllib3 v2 is used with LibreSSL instead of OpenSSL
# It's safe to ignore as it doesn't affect functionality
warnings.filterwarnings('ignore', message='.*urllib3.*OpenSSL.*')
warnings.filterwarnings('ignore', category=UserWarning, module='urllib3')

import requests
from bs4 import BeautifulSoup
import yt_dlp  # More up-to-date alternative to youtube_dl

def normalize_channel_url(url):
    """
    Normalize a YouTube channel URL so it points at the channel's Videos tab.

    Recognised formats:
    - https://www.youtube.com/channel/CHANNEL_ID
    - https://www.youtube.com/@username
    - https://www.youtube.com/user/USERNAME
    - https://www.youtube.com/c/CHANNEL_NAME

    For these formats any trailing slash, tab segment (e.g. ``/featured``),
    query string or fragment is discarded and ``/videos`` is appended.
    Channel-ID URLs are additionally rebuilt on ``https://www.youtube.com``.

    Any other URL is returned unchanged (apart from whitespace stripping) so
    that yt-dlp can try to resolve it directly.
    """
    url = url.strip()

    # Query strings / fragments (e.g. "?si=..." share parameters) are not part
    # of the channel path; appending "/videos" after them would yield a
    # broken URL such as ".../@name?si=abc/videos".
    path = re.split(r'[?#]', url, maxsplit=1)[0]

    channel_id_match = re.search(r'/channel/([^/]+)', path)
    if channel_id_match:
        return f"https://www.youtube.com/channel/{channel_id_match.group(1)}/videos"

    # Keep everything up to and including the handle / user / custom name and
    # drop whatever tab segment follows it.
    channel_root_match = re.match(r'(.*?/(?:@|user/|c/)[^/]+)', path)
    if channel_root_match:
        return f"{channel_root_match.group(1)}/videos"

    # Unknown format (playlist, watch URL, ...): leave it intact, query
    # string included, since the query may be the meaningful part.
    return url

def get_video_ids_and_channel_name(channel_url):
    """
    Use yt-dlp to list a channel's videos and find the channel's name.

    The URL is first passed through normalize_channel_url. If yt-dlp raises
    while extracting, the channel page is fetched with requests and its
    <head> is scanned for a link containing "/channel/<id>", which is then
    retried with yt-dlp.

    Returns:
        (channel_name, video_ids, video_urls) - the two lists are parallel,
        contain no duplicate IDs and keep the order reported by yt-dlp.

    Raises:
        ValueError: if the channel information cannot be extracted or no
            videos are found.
    """
    normalized_url = normalize_channel_url(channel_url)

    # Handle / channel-ID URLs must target the Videos tab.
    if not normalized_url.endswith('/videos') and (
        '/@' in normalized_url or '/channel/' in normalized_url
    ):
        normalized_url = f"{normalized_url.rstrip('/')}/videos"

    ydl_opts = {
        'extract_flat': True,      # Only fetch basic info (no download)
        'skip_download': True,     # Do not download videos
        'quiet': False,            # Set to False to see progress
        'ignoreerrors': True,
        'playlistend': None,       # Get all videos
    }
    video_ids = []
    video_urls = []
    seen_ids = set()  # O(1) duplicate check; the lists preserve order

    def add_video(video_id):
        """Record video_id and its watch URL unless empty or already seen."""
        if video_id and video_id not in seen_ids:
            seen_ids.add(video_id)
            video_ids.append(video_id)
            video_urls.append(f"https://www.youtube.com/watch?v={video_id}")

    print(f"Fetching channel info from: {normalized_url}")

    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        try:
            info = ydl.extract_info(normalized_url, download=False)
        except Exception as e:
            print(f"Error extracting channel info: {e}")
            # Fallback: look for a "/channel/<id>" link in the page head and
            # retry with the canonical channel-ID URL.
            try:
                response = requests.get(normalized_url, timeout=10)
                soup = BeautifulSoup(response.text, 'html.parser')
                head_links = soup.head.find_all("link") if soup.head else []
                alt_url = None
                for link in head_links:
                    match = re.search(r'/channel/([^/?]+)', link.get("href", ""))
                    if match:
                        alt_url = f"https://www.youtube.com/channel/{match.group(1)}/videos"
                        break
                if alt_url is None:
                    raise LookupError("no channel ID link found in the page head")
                print(f"Trying alternative URL: {alt_url}")
                info = ydl.extract_info(alt_url, download=False)
            except Exception as e2:
                print(f"Alternative method also failed: {e2}")
                raise ValueError(f"Could not extract channel information: {e}") from e2

        # With 'ignoreerrors' enabled, extract_info may return None instead
        # of raising.
        if not info:
            raise ValueError("No channel information found")

        # The keys may be present but None, so chain with `or` rather than
        # relying on dict.get defaults.
        channel_name = (
            info.get("channel")
            or info.get("uploader")
            or info.get("title")
            or "Channel"
        )

        # Remove common suffixes
        for suffix in [" - Videos", " - YouTube"]:
            if channel_name.endswith(suffix):
                channel_name = channel_name[:-len(suffix)].strip()

        # Extract video IDs and URLs
        if 'entries' in info and info['entries']:
            for entry in info['entries']:
                if entry is not None:
                    # Fall back to the last path segment of the entry URL
                    # when no explicit ID is present.
                    add_video(entry.get('id') or (entry.get('url') or '').split('/')[-1])
        elif 'id' in info:
            add_video(info['id'])

        # If no videos found in entries, try to get from playlist
        if not video_ids and 'webpage_url' in info:
            print("No videos found in entries, trying playlist extraction...")
            try:
                playlist_info = ydl.extract_info(info['webpage_url'], download=False)
            except Exception as e:
                # Best effort: the "no videos" error below still fires.
                print(f"Playlist extraction failed: {e}")
                playlist_info = None
            if playlist_info and playlist_info.get('entries'):
                for entry in playlist_info['entries']:
                    if entry and 'id' in entry:
                        add_video(entry['id'])

    if not video_ids:
        raise ValueError("No videos found for this channel")

    return channel_name, video_ids, video_urls

def sanitize_filename(filename, max_length=100):
    """
    Turn an arbitrary string into a safe file name.

    Characters that are invalid in file names (< > : " / \\ | ? *) are
    removed, every run of whitespace is collapsed into a single underscore,
    and the result is cut to at most ``max_length`` characters.
    """
    # One pass that deletes every forbidden character.
    forbidden = str.maketrans('', '', '<>:"/\\|?*')
    cleaned = filename.translate(forbidden)

    # split() with no argument drops leading/trailing whitespace and
    # collapses inner runs, so joining yields single underscores.
    underscored = '_'.join(cleaned.split())

    # Slicing is a no-op when the name is already short enough.
    return underscored[:max_length].strip()

def get_video_title(video_id):
    """
    Look up a video's title with yt-dlp (metadata only, nothing is downloaded).

    Returns:
        The title string ('' when the metadata carries no title), or None
        when the lookup fails for any reason.
    """
    ydl_opts = {
        'quiet': True,
        'no_warnings': True,
        'skip_download': True,
    }
    try:
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            info = ydl.extract_info(f"https://www.youtube.com/watch?v={video_id}", download=False)
    except Exception:
        # Best-effort lookup: any failure means "title unknown". Catching
        # Exception (not a bare except) lets Ctrl-C / SystemExit propagate,
        # so a long batch run can still be interrupted.
        return None

    # extract_info may yield nothing instead of raising.
    if not info:
        return None
    return info.get('title', '')

def _collect_segment_text(segments):
    """Return the text of each transcript segment in *segments*.

    Segments may be plain strings or dicts; for dicts the first non-empty of
    'text', 'content' or 'transcript' is used. Other item types are ignored.
    """
    lines = []
    for segment in segments:
        if isinstance(segment, dict):
            text = segment.get('text') or segment.get('content') or segment.get('transcript')
            if text:
                lines.append(str(text))
        elif isinstance(segment, str):
            lines.append(segment)
    return lines


def download_transcript_from_tactiq(video_url, lang_code="pt"):
    """
    Download the transcript of a YouTube video from the Tactiq API.

    The response schema is not fixed, so several shapes are tried in order:
      1. a dict whose 'transcript' / 'text' / 'content' / 'data' value is a
         string or a list of segments;
      2. a dict with a 'segments', 'items' or 'captions' list;
      3. any dict value that is a long string (> 50 chars) or a list of
         segments;
      4. a bare string, or a bare list of segments.

    Args:
        video_url: URL of the video, sent as "videoUrl".
        lang_code: transcript language, sent as "langCode".

    Returns:
        The transcript text (segments joined with newlines), or None when the
        request or the parsing fails. If no known shape matches, the string
        form of the parsed JSON is returned and a warning is printed.
    """
    url = "https://tactiq-apps-prod.tactiq.io/transcript"

    headers = {
        'sec-ch-ua-platform': '"macOS"',
        'Referer': 'https://tactiq.io/',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/142.0.0.0 Safari/537.36',
        'sec-ch-ua': '"Chromium";v="142", "Google Chrome";v="142", "Not_A Brand";v="99"',
        'content-type': 'application/json',
        'sec-ch-ua-mobile': '?0'
    }

    payload = {
        "videoUrl": video_url,
        "langCode": lang_code
    }

    try:
        response = requests.post(url, json=payload, headers=headers, timeout=30)
        response.raise_for_status()
        data = response.json()

        if isinstance(data, str):
            return data

        if isinstance(data, list):
            text_lines = _collect_segment_text(data)
            if text_lines:
                return "\n".join(text_lines)

        elif isinstance(data, dict):
            # 1. Common fields where the whole transcript might be stored.
            transcript_text = (
                data.get('transcript')
                or data.get('text')
                or data.get('content')
                or data.get('data')
            )
            if transcript_text:
                if isinstance(transcript_text, str):
                    return transcript_text
                if isinstance(transcript_text, list):
                    # Prefer the segments' text; fall back to str() per item
                    # so lists of other values are still returned.
                    text_lines = (
                        _collect_segment_text(transcript_text)
                        or [str(item) for item in transcript_text]
                    )
                    return "\n".join(text_lines)

            # 2. Well-known segment containers. These are checked before the
            #    generic scan so a long metadata string (e.g. a title) is not
            #    mistaken for the transcript.
            #    NOTE(review): 'captions' is assumed from observed Tactiq
            #    responses - confirm against the live API.
            for key in ('segments', 'items', 'captions'):
                value = data.get(key)
                if isinstance(value, list):
                    text_lines = _collect_segment_text(value)
                    if text_lines:
                        return "\n".join(text_lines)

            # 3. Last resort: the first value that looks like a transcript.
            for value in data.values():
                if isinstance(value, str) and len(value) > 50:
                    return value
                if isinstance(value, list):
                    text_lines = _collect_segment_text(value)
                    if text_lines:
                        return "\n".join(text_lines)

        # If we can't parse it, return the JSON as string (for debugging).
        # NOTE(review): callers will save this text as if it were a
        # transcript; consider returning None here instead.
        print(f"Warning: Unexpected response format. Returning JSON string.")
        return str(data)

    except requests.exceptions.RequestException as e:
        print(f"Error fetching transcript from Tactiq: {e}")
        return None
    except Exception as e:
        print(f"Error parsing transcript response: {e}")
        return None

def download_transcripts(video_ids, video_urls, start_index=0, output_dir="transcripts", lang_code="pt"):
    """
    Download (or reuse) the transcript of every video from ``start_index`` on.

    For each video:
      - a transcript already on disk is reused, whether it is named
        ``<video_id>.txt`` (old format) or
        ``<sanitized_title>_<video_id>.txt``, which makes runs resumable;
      - otherwise it is fetched through the Tactiq API and saved as a plain
        text (.txt) file.

    After all videos are processed, a combined file "all_transcripts.txt" is
    written to the output directory. A progress counter is printed as
    transcripts are collected.

    Args:
        video_ids: video IDs, parallel to ``video_urls``.
        video_urls: watch URLs, parallel to ``video_ids``.
        start_index: 0-based index of the first video to process.
        output_dir: directory for the transcript files (created if missing).
        lang_code: language code passed to the Tactiq API.
    """
    os.makedirs(output_dir, exist_ok=True)

    combined_transcripts = []  # To accumulate all transcript texts.
    total_videos = len(video_ids) - start_index
    downloaded_count = 0

    # Delay between requests to avoid rate limiting (in seconds)
    delay_between_requests = 1.5

    def reuse_saved_transcript(path, header):
        """Add an already-saved transcript to the combined list.

        Returns True on success, False if the file could not be read.
        """
        try:
            with open(path, 'r', encoding='utf-8') as f:
                existing_text = f.read()
        except (OSError, UnicodeDecodeError) as e:
            print(f"  Warning: could not read existing transcript {path}: {e}")
            return False
        combined_transcripts.append(f"{header}\n{existing_text}\n")
        return True

    pending = zip(video_ids[start_index:], video_urls[start_index:])
    for idx, (video_id, video_url) in enumerate(pending, start=start_index + 1):
        # Check the old file format first (backward compatibility); this also
        # avoids a title lookup for those videos.
        old_format_file = os.path.join(output_dir, f"{video_id}.txt")
        if os.path.exists(old_format_file):
            print(f"[{downloaded_count}/{total_videos}] Skipping video {idx} ({video_id}): already downloaded (old format)")
            header = f"==== Video ID: {video_id} | Título: (arquivo antigo) ===="
            if reuse_saved_transcript(old_format_file, header):
                downloaded_count += 1
            continue

        # The file name depends on the title, so it is needed even to detect
        # an existing file. Small delay to avoid rate limiting.
        time.sleep(0.5)
        video_title = get_video_title(video_id)
        if video_title:
            filename = f"{sanitize_filename(video_title)}_{video_id}.txt"
        else:
            filename = f"{video_id}.txt"
        output_file = os.path.join(output_dir, filename)

        title_display = video_title if video_title else "Sem título"
        # Same header for reused and fresh transcripts, so the combined file
        # does not depend on whether the run was resumed.
        header = f"==== Video ID: {video_id} | Título: {title_display} | Language: {lang_code} ===="

        # Check if transcript file already exists (allows resuming)
        if os.path.exists(output_file):
            print(f"[{downloaded_count}/{total_videos}] Skipping video {idx} ({video_id}): already downloaded")
            if reuse_saved_transcript(output_file, header):
                downloaded_count += 1
            continue

        try:
            # Add delay to avoid rate limiting
            time.sleep(delay_between_requests)

            transcript_text = download_transcript_from_tactiq(video_url, lang_code=lang_code)
            if not transcript_text:
                # The API helper reports its own error details and returns
                # None on any failure.
                print(f"[{downloaded_count}/{total_videos}] ✗ Erro no vídeo {idx} ({video_id}): Transcript não disponível")
                continue

            # Save individual transcript file.
            with open(output_file, 'w', encoding='utf-8') as f:
                f.write(transcript_text)
        except Exception as e:
            # Per-video boundary: one failure (typically a file-system error)
            # must not abort the batch. The real message is shown, because
            # keyword matching on it misfires when the path itself contains
            # words such as "transcripts".
            error_msg = str(e)
            if len(error_msg) > 150:
                error_msg = error_msg[:150] + "..."
            print(f"[{downloaded_count}/{total_videos}] ✗ Erro no vídeo {idx} ({video_id}): {error_msg}")
            continue

        # Append to the combined transcripts list (with a header).
        combined_transcripts.append(f"{header}\n{transcript_text}\n")
        downloaded_count += 1
        if video_title and len(video_title) > 50:
            title_short = video_title[:50] + "..."
        else:
            title_short = video_title or "Sem título"
        print(f"[{downloaded_count}/{total_videos}] ✓ Downloaded: {title_short} ({video_id}) - Language: {lang_code}")

    # Write all transcripts into one combined file.
    combined_file = os.path.join(output_dir, "all_transcripts.txt")
    with open(combined_file, 'w', encoding='utf-8') as cf:
        cf.write("\n\n".join(combined_transcripts))
    print(f"Combined transcript file created: {combined_file}")

# ============================================================================
# CONFIGURATION - set these variables before running
# ============================================================================
# ⚠️ REQUIRED: paste the YouTube channel URL below
#
# HOW TO:
# 1. Open the channel on YouTube
# 2. Copy the full URL from the address bar
# 3. Paste it between the quotes ""
#
# Examples of valid URLs:
#   input_url = "https://www.youtube.com/@channel_name"
#   input_url = "https://www.youtube.com/channel/UCxxxxxxxxxxxxxxxxxxxxxx"
#   input_url = "https://www.youtube.com/user/user_name"
#   input_url = "https://www.youtube.com/c/channel_name"
#
# ⬇️ PASTE YOUR URL HERE ⬇️
input_url = "https://www.youtube.com/@samuelmeller"  # ⬅️ replace this example value with the channel URL

# Optional: Starting video index (1-based, so 1 = first video, 2 = second video, etc.)
# Set to None to start from the beginning, or enter a number like 1, 2, 3, etc.
start_index_input = None

# Language code requested from the transcript API.
transcript_lang_code = "pt"

# ============================================================================
# MAIN EXECUTION
# ============================================================================
if not input_url:
    raise ValueError(
        "❌ ERRO: Você precisa definir a URL do canal!\n\n"
        "COMO FAZER:\n"
        "1. Encontre a linha que diz: input_url = \"\"\n"
        "2. Cole a URL do canal entre as aspas\n"
        "3. Exemplo: input_url = \"https://www.youtube.com/@nome_do_canal\"\n\n"
        "Procure a seção CONFIGURATION acima e substitua as aspas vazias pela URL do canal."
    )

print("=" * 60)
print("STEP 1: Extracting video IDs, URLs and channel name...")
print("=" * 60)
print(f"Processing channel URL: {input_url}")
channel_name, video_ids, video_urls = get_video_ids_and_channel_name(input_url)
if not video_ids or not video_urls:
    raise ValueError("No videos found for this channel.")

print(f"\nChannel Name: {channel_name}")
print(f"Found {len(video_ids)} videos.")
print("All video links collected successfully!\n")

# Display first few video URLs as confirmation
print("Sample video URLs (first 5):")
for i, url in enumerate(video_urls[:5], 1):
    print(f"  {i}. {url}")
if len(video_urls) > 5:
    print(f"  ... and {len(video_urls) - 5} more videos\n")

# Convert the 1-based starting index from the configuration into a 0-based
# one, falling back to the first video on invalid input.
if start_index_input is None:
    start_index = 0
else:
    try:
        start_index = int(start_index_input) - 1
        if start_index < 0 or start_index >= len(video_ids):
            print("Invalid starting index. Starting from the first video.")
            start_index = 0
    except (ValueError, TypeError):
        print("Invalid input. Starting from the first video.")
        start_index = 0

# Create the output folder as transcripts/{channel_name}.
# A channel name may contain a path separator (e.g. "AC/DC"), which would
# silently create nested folders; replace separators so the name stays a
# single path component. Names without separators are unchanged.
folder_name = channel_name
for separator in filter(None, (os.sep, os.altsep)):
    folder_name = folder_name.replace(separator, "_")
output_folder = os.path.join("transcripts", folder_name)
print(f"Saving transcripts to folder: {output_folder}")
os.makedirs(output_folder, exist_ok=True)

# Save all video URLs to a file
urls_file = os.path.join(output_folder, "all_video_urls.txt")
print(f"\nSaving all video URLs to: {urls_file}")
with open(urls_file, 'w', encoding='utf-8') as f:
    f.write(f"Channel: {channel_name}\n")
    f.write(f"Total videos: {len(video_urls)}\n")
    f.write(f"Channel URL: {input_url}\n")
    f.write("=" * 80 + "\n\n")
    for i, url in enumerate(video_urls, 1):
        f.write(f"{i}. {url}\n")
print(f"✓ All {len(video_urls)} URLs saved to {urls_file}")

print("\n" + "=" * 60)
print("STEP 2: Downloading transcripts using Tactiq API...")
print(f"Starting from video {start_index + 1}...")
print("=" * 60)
download_transcripts(
    video_ids,
    video_urls,
    start_index=start_index,
    output_dir=output_folder,
    lang_code=transcript_lang_code,
)
print("\nDone.")
