In [None]:
import os
import re
import time
import warnings
from datetime import datetime
# Suppress urllib3 OpenSSL warning (common on macOS with LibreSSL)
# This warning appears when urllib3 v2 is used with LibreSSL instead of OpenSSL
# It's safe to ignore as it doesn't affect functionality
warnings.filterwarnings('ignore', message='.*urllib3.*OpenSSL.*')
warnings.filterwarnings('ignore', category=UserWarning, module='urllib3')

import requests
import yt_dlp  # More up-to-date alternative to youtube_dl

def extract_video_id_from_url(url):
    """
    Extract the 11-character video ID from a YouTube URL.

    Recognised forms:
    - https://www.youtube.com/watch?v=VIDEO_ID
    - https://www.youtube.com/watch?v=VIDEO_ID&list=...
    - https://www.youtube.com/watch?list=...&v=VIDEO_ID  (``v`` not first)
    - https://youtu.be/VIDEO_ID
    - https://www.youtube.com/shorts/VIDEO_ID
    - https://www.youtube.com/live/VIDEO_ID

    The host is not validated; only the path/query shape is matched.

    Returns the video ID, or None if `url` is not a string or no ID is found.
    """
    if not isinstance(url, str):
        return None

    # `[?&]v=` accepts the `v` parameter at any position in the query string,
    # which also covers the plain `watch?v=` form.
    match = re.search(
        r'(?:[?&]v=|youtu\.be/|/(?:shorts|live)/)([a-zA-Z0-9_-]{11})',
        url.strip(),
    )
    return match.group(1) if match else None

def read_video_urls_from_file(file_path="list.txt"):
    """
    Read YouTube URLs from a text file, one URL per line.

    Blank lines and lines starting with '#' are skipped. Lines from which no
    video ID can be extracted produce a printed warning and are ignored.
    Duplicate video IDs are kept only once (first occurrence wins), and each
    URL is normalised to ``https://www.youtube.com/watch?v=<id>``.

    Returns:
        A tuple ``(video_ids, video_urls)`` of two parallel lists.

    Raises:
        FileNotFoundError: if `file_path` does not exist.
        ValueError: if the file contains no valid YouTube URL.
    """
    video_ids = []
    video_urls = []
    seen_ids = set()  # O(1) duplicate detection; the lists keep the order

    try:
        # utf-8-sig drops a leading BOM (written by some Windows editors), so
        # a '#' comment on the first line is still recognised as a comment.
        list_file = open(file_path, 'r', encoding='utf-8-sig')
    except FileNotFoundError as err:
        raise FileNotFoundError(f"Arquivo não encontrado: {file_path}") from err

    with list_file:
        for line_num, line in enumerate(list_file, 1):
            line = line.strip()
            if not line or line.startswith('#'):  # Skip empty lines and comments
                continue

            video_id = extract_video_id_from_url(line)
            if not video_id:
                print(f"⚠️  Aviso: Linha {line_num} não contém uma URL válida do YouTube: {line}")
                continue

            if video_id in seen_ids:  # Avoid duplicates
                continue
            seen_ids.add(video_id)
            video_ids.append(video_id)
            # Normalize URL to standard format
            video_urls.append(f"https://www.youtube.com/watch?v={video_id}")

    if not video_ids:
        raise ValueError(f"Nenhuma URL válida encontrada no arquivo {file_path}")

    return video_ids, video_urls

def sanitize_filename(filename, max_length=100):
    """
    Turn arbitrary text into a name that is safe to use as a file name.

    The characters < > : " / \\ | ? * are dropped, every run of whitespace
    becomes a single underscore (leading/trailing whitespace disappears), and
    the result is cut to `max_length` characters when it is longer.
    """
    # One pass that deletes every character that is invalid in file names.
    forbidden = str.maketrans('', '', '<>:"/\\|?*')
    cleaned = filename.translate(forbidden)

    # Collapse whitespace runs into single underscores.
    cleaned = '_'.join(cleaned.split())

    # Enforce the length limit.
    shortened = cleaned[:max_length] if len(cleaned) > max_length else cleaned
    return shortened.strip()

def get_video_title(video_id):
    """
    Fetch the title of a YouTube video with yt-dlp, without downloading it.

    Args:
        video_id: The YouTube video ID; it is inserted into a standard
            ``watch?v=`` URL.

    Returns:
        The title string ('' when the metadata has no 'title' entry), or
        None when the lookup fails for any reason.
    """
    ydl_opts = {
        'quiet': True,
        'no_warnings': True,
        'skip_download': True,
    }
    try:
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            info = ydl.extract_info(f"https://www.youtube.com/watch?v={video_id}", download=False)
            return info.get('title', '')
    except Exception:
        # Best-effort lookup: any failure means "no title". `Exception` (not
        # a bare except) lets KeyboardInterrupt/SystemExit propagate so the
        # user can still abort a long run.
        return None

def _transcript_item_text(item):
    """Return the text of one transcript entry (dict or str), or None if it carries none."""
    if isinstance(item, dict):
        text = item.get('text') or item.get('content') or item.get('transcript')
        return str(text) if text else None
    if isinstance(item, str):
        return item
    return None


def _collect_transcript_lines(items):
    """Return, in order, the text of every entry in `items` that carries any."""
    lines = []
    for item in items:
        text = _transcript_item_text(item)
        if text is not None:
            lines.append(text)
    return lines


def _extract_transcript_text(data):
    """Find the transcript in a decoded response; return None when no known shape matches."""
    if isinstance(data, str):
        return data

    if isinstance(data, list):
        lines = _collect_transcript_lines(data)
        return "\n".join(lines) if lines else None

    if not isinstance(data, dict):
        return None

    # 1. Common top-level fields where the transcript might be stored.
    primary = data.get('transcript') or data.get('text') or data.get('content') or data.get('data')
    if primary:
        if isinstance(primary, str):
            return primary
        if isinstance(primary, list):
            # Use an entry's text field when it has one; otherwise fall back
            # to str(entry) so no entry is silently dropped.
            lines = []
            for item in primary:
                text = _transcript_item_text(item)
                lines.append(str(item) if text is None else text)
            return "\n".join(lines)

    # 2. Transcript split into a 'segments' or 'items' array.
    for key in ('segments', 'items'):
        if key in data:
            lines = _collect_transcript_lines(data[key])
            if lines:
                return "\n".join(lines)

    # 3. Heuristic fallback: the first long string, or the first list that
    #    yields text, among the values (in the response's key order).
    for value in data.values():
        if isinstance(value, str) and len(value) > 50:  # Likely a transcript if long string
            return value
        if isinstance(value, list) and len(value) > 0:
            lines = _collect_transcript_lines(value)
            if lines:
                return "\n".join(lines)

    return None


def download_transcript_from_tactiq(video_url, lang_code="pt"):
    """
    Download the transcript of a YouTube video through the Tactiq endpoint.

    The response schema is not assumed: several plausible layouts are tried
    in turn (see `_extract_transcript_text`).
    NOTE(review): the real response layout has not been confirmed here.

    Args:
        video_url: URL of the YouTube video.
        lang_code: Language code sent to the service as "langCode".

    Returns:
        The transcript text. When the response matches no known layout, a
        warning is printed and ``str(data)`` is returned for debugging.
        Returns None when the request fails or parsing raises.
    """
    url = "https://tactiq-apps-prod.tactiq.io/transcript"

    headers = {
        'sec-ch-ua-platform': '"macOS"',
        'Referer': 'https://tactiq.io/',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/142.0.0.0 Safari/537.36',
        'sec-ch-ua': '"Chromium";v="142", "Google Chrome";v="142", "Not_A Brand";v="99"',
        'content-type': 'application/json',
        'sec-ch-ua-mobile': '?0'
    }

    payload = {
        "videoUrl": video_url,
        "langCode": lang_code
    }

    try:
        response = requests.post(url, json=payload, headers=headers, timeout=30)
        response.raise_for_status()

        data = response.json()

        transcript = _extract_transcript_text(data)
        if transcript is not None:
            return transcript

        # If we can't parse it, return the JSON as string (for debugging)
        print("Warning: Unexpected response format. Returning JSON string.")
        return str(data)

    except requests.exceptions.RequestException as e:
        print(f"Error fetching transcript from Tactiq: {e}")
        return None
    except Exception as e:
        print(f"Error parsing transcript response: {e}")
        return None

def _append_error_entry(error_log_file, idx, total, video_id, video_url, video_title, error_msg):
    """Append one failure record for a video to the error log file."""
    error_timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    with open(error_log_file, 'a', encoding='utf-8') as f:
        f.write(f"[{error_timestamp}] Vídeo {idx}/{total}\n")
        f.write(f"Video ID: {video_id}\n")
        f.write(f"URL: {video_url}\n")
        if video_title:
            f.write(f"Título: {video_title}\n")
        f.write(f"Erro: {error_msg}\n")
        f.write("-" * 80 + "\n\n")


def download_transcripts(video_ids, video_urls, start_index=0, output_dir="transcripts", lang_code="pt"):
    """
    Download and save the transcript of every video from `start_index` on.

    For each video this function:
      - skips it when ``<video_id>.txt`` (old naming) or the new-style file
        ``<NNN>_<TOTAL>[_<title>]_<video_id>.txt`` already exists in
        `output_dir`, reusing the saved text for the combined file;
      - otherwise fetches the transcript through the Tactiq API and saves it
        as a plain text (.txt) file.

    After all videos are processed, "all_transcripts.txt" (every transcript,
    each preceded by a header line) is written to `output_dir`. Failures are
    appended to "errors.txt" as they happen, followed by a final summary.
    An existing transcript file that cannot be read is reported as an error
    instead of being silently ignored. A progress line is printed per video.

    Args:
        video_ids: List of YouTube video IDs.
        video_urls: List of URLs, parallel to `video_ids`.
        start_index: 0-based index of the first video to process.
        output_dir: Directory for all output files (created if missing).
        lang_code: Language code passed to the transcript service.
    """
    os.makedirs(output_dir, exist_ok=True)

    combined_transcripts = []  # To accumulate all transcript texts.
    total_videos_all = len(video_ids)  # Total number of videos (used in file names)
    total_videos = total_videos_all - start_index
    downloaded_count = 0  # Videos whose transcript is available (fresh or reused)
    error_count = 0  # Videos that ended in an error

    # Create error log file (truncates the log of any previous run).
    error_log_file = os.path.join(output_dir, "errors.txt")
    with open(error_log_file, 'w', encoding='utf-8') as f:
        f.write(f"Arquivo de erros - Início: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
        f.write(f"Total de vídeos: {total_videos_all}\n")
        f.write(f"Índice inicial: {start_index + 1}\n")
        f.write("=" * 80 + "\n\n")

    # Delay between requests to avoid rate limiting (in seconds)
    delay_between_requests = 1.5
    delay_on_error = 5.0  # Extra wait when the error looks like an IP block

    # Loop-invariant part of every file name prefix (zero-padded for sorting).
    total_videos_str = f"{total_videos_all:03d}"

    remaining = zip(video_ids[start_index:], video_urls[start_index:])
    for processed_count, (video_id, video_url) in enumerate(remaining, start=1):
        idx = start_index + processed_count  # 1-based position in the full list
        progress = f"[{processed_count}/{total_videos}]"

        # Check if old format file exists first (for backward compatibility)
        old_format_file = os.path.join(output_dir, f"{video_id}.txt")
        if os.path.exists(old_format_file):
            print(f"{progress} Skipping video {idx} ({video_id}): already downloaded (old format)")
            try:
                with open(old_format_file, 'r', encoding='utf-8') as f:
                    existing_text = f.read()
            except (OSError, UnicodeError) as e:
                # The file exists but cannot be read: record it and move on.
                error_count += 1
                _append_error_entry(
                    error_log_file, idx, total_videos_all, video_id, video_url, None,
                    f"Não foi possível ler a transcrição existente: {e}",
                )
                print(f"{progress} ✗ Erro no vídeo {idx} ({video_id}): {e}")
            else:
                combined_transcripts.append(f"==== Video ID: {video_id} | Título: (arquivo antigo) ====\n{existing_text}\n")
                downloaded_count += 1
            continue

        # Get video title (with small delay to avoid rate limiting)
        time.sleep(0.5)
        video_title = get_video_title(video_id)
        title_display = video_title or "Sem título"

        # File name: <video number>_<total videos>[_<title>]_<video id>.txt
        number_prefix = f"{idx:03d}_{total_videos_str}"
        if video_title:
            sanitized_title = sanitize_filename(video_title)
            filename = f"{number_prefix}_{sanitized_title}_{video_id}.txt"
        else:
            filename = f"{number_prefix}_{video_id}.txt"
        output_file = os.path.join(output_dir, filename)

        # Same header for resumed and freshly downloaded transcripts.
        header = f"==== Video ID: {video_id} | Título: {title_display} | Language: {lang_code} ===="

        # Check if transcript file already exists (allows resuming)
        if os.path.exists(output_file):
            print(f"{progress} Skipping video {idx} ({video_id}): already downloaded")
            try:
                with open(output_file, 'r', encoding='utf-8') as f:
                    existing_text = f.read()
            except (OSError, UnicodeError) as e:
                error_count += 1
                _append_error_entry(
                    error_log_file, idx, total_videos_all, video_id, video_url, video_title,
                    f"Não foi possível ler a transcrição existente: {e}",
                )
                print(f"{progress} ✗ Erro no vídeo {idx} ({video_id}): {e}")
            else:
                combined_transcripts.append(f"{header}\n{existing_text}\n")
                downloaded_count += 1
            continue

        try:
            # Add delay to avoid rate limiting
            time.sleep(delay_between_requests)

            # Download transcript using Tactiq API
            transcript_text = download_transcript_from_tactiq(video_url, lang_code=lang_code)
            if not transcript_text:
                raise RuntimeError("No transcript returned from API")

            # Save individual transcript file.
            with open(output_file, 'w', encoding='utf-8') as f:
                f.write(transcript_text)

            # Append to the combined transcripts list (with a header).
            combined_transcripts.append(f"{header}\n{transcript_text}\n")
            downloaded_count += 1
            title_short = title_display[:50] + "..." if len(title_display) > 50 else title_display
            print(f"{progress} ✓ Downloaded: {title_short} ({video_id}) - Language: {lang_code}")

        except Exception as e:
            error_msg = str(e)
            # Shorten error messages for display
            if "blocking requests" in error_msg or "IP" in error_msg:
                error_msg_display = "IP bloqueado - aguardando antes de continuar..."
                time.sleep(delay_on_error)
            elif "No transcript" in error_msg or "transcript" in error_msg.lower():
                error_msg_display = "Transcript não disponível"
            elif len(error_msg) > 150:
                # Truncate very long error messages for display
                error_msg_display = error_msg[:150] + "..."
            else:
                error_msg_display = error_msg

            # Log the full error to file as it happens.
            _append_error_entry(
                error_log_file, idx, total_videos_all, video_id, video_url, video_title, error_msg,
            )
            error_count += 1
            print(f"{progress} ✗ Erro no vídeo {idx} ({video_id}): {error_msg_display}")

    # Write all transcripts into one combined file.
    combined_file = os.path.join(output_dir, "all_transcripts.txt")
    with open(combined_file, 'w', encoding='utf-8') as cf:
        cf.write("\n\n".join(combined_transcripts))
    print(f"Combined transcript file created: {combined_file}")

    # Add summary to error log file
    with open(error_log_file, 'a', encoding='utf-8') as f:
        f.write("=" * 80 + "\n")
        f.write("RESUMO\n")
        f.write(f"Fim: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
        f.write(f"Total de vídeos processados: {total_videos}\n")
        f.write(f"Vídeos baixados com sucesso: {downloaded_count}\n")
        f.write(f"Vídeos com erro: {error_count}\n")
        f.write("=" * 80 + "\n")

    if error_count > 0:
        print(f"\n⚠️  {error_count} vídeo(s) com erro. Verifique o arquivo: {error_log_file}")
    else:
        print("\n✓ Todos os vídeos foram processados com sucesso!")

# ============================================================================
# CONFIGURATION - Set these variables before running
# ============================================================================
# ⚠️ REQUIRED: Specify the path to the list.txt file
#
# HOW TO:
# 1. Create a file named "list.txt" in the same directory as this script
# 2. Add one YouTube video URL per line
# 3. Examples of valid URLs:
#    https://www.youtube.com/watch?v=VIDEO_ID
#    https://youtu.be/VIDEO_ID
#    https://www.youtube.com/watch?v=VIDEO_ID&list=...
#
# ⬇️ SET THE FILE PATH HERE ⬇️
list_file_path = "list.txt"  # ⬅️ Path to the list.txt file

# Optional: Starting video index (1-based, so 1 = first video, 2 = second video, etc.)
# Set to None to start from the beginning, or enter a number like 1, 2, 3, etc.
start_index_input = None  # Set to None to start from beginning, or enter a number like 1, 2, 3, etc.

# Optional: Output folder name (default: "transcripts").
# A timestamp suffix is appended to this name when the folder is created.
output_folder_name = "transcripts"  # Folder where the transcripts are saved

# Language code for transcripts (default: "pt" for Portuguese)
lang_code = "pt"  # Language code (pt, en, es, etc.)

# ============================================================================
# MAIN EXECUTION
# ============================================================================
# Fail fast when the configuration section left the list path empty.
if not list_file_path:
    raise ValueError(
        "❌ ERRO: Você precisa especificar o caminho do arquivo list.txt!\n\n"
        "COMO FAZER:\n"
        "1. Crie um arquivo 'list.txt' com URLs de vídeos do YouTube (uma por linha)\n"
        "2. Configure a variável list_file_path acima com o caminho do arquivo\n"
        "3. Exemplo: list_file_path = 'list.txt'\n\n"
        "Procure a seção CONFIGURATION acima e configure o caminho do arquivo."
    )

print("=" * 60)
print("STEP 1: Reading video URLs from list.txt...")
print("=" * 60)
print(f"Reading from file: {list_file_path}")

try:
    video_ids, video_urls = read_video_urls_from_file(list_file_path)
except FileNotFoundError as e:
    # Re-raise with step-by-step guidance; `from e` keeps the original error
    # as the explicit cause in the traceback.
    raise FileNotFoundError(
        f"❌ ERRO: {e}\n\n"
        "COMO RESOLVER:\n"
        "1. Crie um arquivo 'list.txt' no mesmo diretório deste script\n"
        "2. Adicione URLs de vídeos do YouTube, uma por linha\n"
        "3. Exemplo de conteúdo do list.txt:\n"
        "   https://www.youtube.com/watch?v=VIDEO_ID_1\n"
        "   https://www.youtube.com/watch?v=VIDEO_ID_2\n"
    ) from e

# Defensive guard in case the reader returned empty lists instead of raising.
if not video_ids or not video_urls:
    raise ValueError("Nenhuma URL válida encontrada no arquivo list.txt.")

print(f"\n✓ Found {len(video_ids)} video URLs in the file.")
print("All video links loaded successfully!\n")

# Display first few video URLs as confirmation
print("Sample video URLs (first 5):")
for i, url in enumerate(video_urls[:5], 1):
    print(f"  {i}. {url}")
if len(video_urls) > 5:
    print(f"  ... and {len(video_urls) - 5} more videos\n")

# Convert the 1-based start_index_input into a 0-based start_index; anything
# missing, non-numeric or out of range falls back to the first video.
if start_index_input is None:
    start_index = 0
else:
    try:
        start_index = int(start_index_input) - 1
        if start_index < 0 or start_index >= len(video_ids):
            print("Invalid starting index. Starting from the first video.")
            start_index = 0
    except (ValueError, TypeError):
        print("Invalid input. Starting from the first video.")
        start_index = 0

# Create the output folder with timestamp.
# NOTE(review): a new timestamped folder is used on every run, so the
# "already downloaded" checks in download_transcripts never find files from
# an earlier run; start_index_input is the only way to resume.
timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
output_folder = f"{output_folder_name}_{timestamp}"
print(f"Saving transcripts to folder: {output_folder}")
os.makedirs(output_folder, exist_ok=True)

print("\n" + "=" * 60)
print("STEP 2: Downloading transcripts using Tactiq API...")
print(f"Starting from video {start_index + 1}...")
print(f"Language: {lang_code}")
print("=" * 60)
download_transcripts(video_ids, video_urls, start_index=start_index, output_dir=output_folder, lang_code=lang_code)
print("\n✓ Done!")
