## Setup & Configuration

In [None]:
# Install required packages (run once per environment, then leave commented out).
# NOTE(review): inside Jupyter, `%pip install ...` installs into the running
# kernel's environment, whereas `!pip` may resolve to a different interpreter.
# !pip install google-api-python-client google-generativeai pandas yt-dlp transformers torch python-docx openpyxl tqdm matplotlib seaborn

In [None]:
import os
import sys

# Make the project's local `src/` directory importable with top priority.
# Any earlier entry for the same path is removed first, so re-running this
# cell does not stack duplicate entries on sys.path.
SRC_DIR = os.path.join(os.getcwd(), 'src')
if SRC_DIR in sys.path:
    sys.path.remove(SRC_DIR)
sys.path.insert(0, SRC_DIR)

# Read the Google API key from the environment; nothing is hard-coded here.
# Surrounding whitespace is stripped so a blank or whitespace-only value is
# rejected right away instead of being passed on as a key.
API_KEY = (os.getenv('GOOGLE_API_KEY') or '').strip()

if not API_KEY:
    raise ValueError("Please set the GOOGLE_API_KEY environment variable")

print("✓ API Key loaded successfully")

In [None]:
from config import PipelineConfig, CarModel, SCENIC_CONFIG, KOLEOS_CONFIG, TORRES_CONFIG
from pipeline import YouTubeAnalysisPipeline

# Build the run configuration. Every value below is handed to PipelineConfig
# as-is; see that class (imported from config) for how each field is used.
config = PipelineConfig(
    google_api_key=API_KEY,  # read from the GOOGLE_API_KEY environment variable
    max_search_results=50,
    published_after="2024-04-01T00:00:00Z",
    max_comments_per_video=100,
    output_dir="output",  # the summary cell at the end lists this same folder
)

# Create the pipeline object that every later stage cell calls into.
pipeline = YouTubeAnalysisPipeline(config)

print("✓ Pipeline initialized")

## Select Car Model to Analyze

Choose from predefined models or create a custom configuration.

In [None]:
# Pick the car model whose videos and comments will be analyzed.
# Default: the predefined Renault Scenic E-Tech configuration.
car_model = SCENIC_CONFIG

# Alternative: build a CarModel by hand instead of using a preset.
# car_model = CarModel(
#     company="르노",                  # "Renault"
#     model="Scenic E-Tech",
#     search_queries=[
#         "르노 세닉 E-Tech",           # "Renault Scenic E-Tech"
#         "세닉 전기차 시승기",          # "Scenic EV test drive"
#         "르노 세닉 전기차 리뷰",       # "Renault Scenic EV review"
#     ]
# )

# Echo the selection so the cell output records what was chosen.
print(
    f"Selected: {car_model.company} {car_model.model}",
    f"Search queries: {car_model.search_queries}",
    sep="\n",
)

---
## Stage 1-2: Video Discovery & Comment Collection

Search YouTube for relevant videos and collect comments.

**Expected time:** ~30 seconds for 50 videos

In [None]:
%%time

# Run the discovery stage for the selected model. It returns two objects, one
# for videos and one for comments, which the cells below use as DataFrames.
videos_df, comments_df = pipeline.run_discovery(car_model)

# Report how many rows each result holds.
print("\n📊 Results:")
print(f"   Videos found: {len(videos_df)}")
print(f"   Comments collected: {len(comments_df)}")

In [None]:
# Show the first 10 discovered videos, limited to the headline columns.
# NOTE(review): nothing is sorted here, so this is "top 10 by views" only if
# run_discovery already returns rows ordered by views — confirm.
preview_columns = ['Title', 'Channel Title', 'Views', 'Likes', 'Duration']
videos_df.head(10)[preview_columns]

In [None]:
# Optional: narrow the video list to titles that match a regex.
# Uncomment and adapt the pattern as needed. The two lookaheads below require
# both 르노 ("Renault") and 세닉 ("Scenic") to appear somewhere in the title,
# in either order; na=False treats missing titles as non-matching.

# filtered_videos = videos_df[videos_df['Title'].str.contains(r"(?=.*르노)(?=.*세닉)", na=False, regex=True)]
# print(f"Filtered to {len(filtered_videos)} videos")
# filtered_videos.head()

In [None]:
# Peek at the first ten collected comments.
comments_df.iloc[:10]

In [None]:
# Back up the collected comments as a CSV file in the current working
# directory. The file name is built once so the path that is written and the
# path that is reported cannot drift apart.
comments_csv_path = f"{car_model.identifier}_comments.csv"
comments_df.to_csv(comments_csv_path, index=False)  # index=False: omit the row index column
print(f"✓ Comments saved to {comments_csv_path}")

---
## Stage 3: Video Transcription (Optional)

Download audio and transcribe using Whisper.

**Expected time:** ~3-4 minutes per video (10 videos ≈ 30-40 minutes)

⚠️ **Note:** This stage requires significant compute resources and time. You can skip it if you only want to analyze comments.

In [None]:
# Transcription settings
SKIP_TRANSCRIPTION = True  # flip to False to run the transcription stage
MAX_VIDEOS_TO_TRANSCRIBE = 10  # cap on videos, keeps processing time down

# Announce which path the following cells will take.
if SKIP_TRANSCRIPTION:
    print("Transcription skipped - analysis will be based on comments only")
else:
    print(f"Will transcribe up to {MAX_VIDEOS_TO_TRANSCRIBE} videos")

In [None]:
%%time

# Transcribe only when enabled in the settings cell; otherwise fall back to an
# empty dict so `transcriptions` is defined on both paths.
if not SKIP_TRANSCRIPTION:
    transcriptions = pipeline.run_transcription(
        car_model,
        max_videos=MAX_VIDEOS_TO_TRANSCRIBE,
        whisper_model="large-v3"  # Options: tiny, base, small, medium, large-v3
    )
    print(f"\n✓ Successfully transcribed {len(transcriptions)} videos")
else:
    transcriptions = {}
    print("Transcription skipped")

---
## Stage 4: AI-Powered Analysis

Analyze transcripts and comments using Google Gemini for:
- Sentiment analysis
- Key strengths & weaknesses
- Competitor mentions
- User persona generation

**Expected time:** ~1-2 seconds per video

In [None]:
%%time

# Run the analysis stage for the selected model. It returns two sized
# collections: one of per-video analyses and one of comment analyses.
video_analyses, comment_analyses = pipeline.run_analysis(car_model)

# Report how many results each collection holds.
print("\n📊 Analysis Results:")
print(f"   Video analyses: {len(video_analyses)}")
print(f"   Comment analyses: {len(comment_analyses)}")

In [None]:
# Preview the per-video analysis as a table, if any was produced.
from analysis import analysis_to_dataframe

if not video_analyses:
    # Nothing to tabulate; the message names skipped transcription as the cause.
    print("No video analyses available (transcription was skipped)")
else:
    summary_columns = [
        'Video URL',
        'Overall Sentiment',
        'Sentiment Score',
        'Key Strengths',
        'Key Weaknesses',
    ]
    analysis_df = analysis_to_dataframe(video_analyses)
    display(analysis_df[summary_columns].head())

---
## Stage 5: Report Generation

Generate comprehensive reports in multiple formats:
- Word document with executive summary
- Excel file with detailed data
- CSV export of comments

In [None]:
%%time

# Generate the Word and Excel reports for the selected model. The result is
# used below as a mapping of report name -> generated file path.
output_files = pipeline.run_reporting(
    car_model,
    generate_word=True,
    generate_excel=True
)

# List every generated report with its path.
print("\n📁 Generated Files:")
for name, path in output_files.items():
    print(f"   {name}: {path}")

---
## Alternative: Run Full Pipeline in One Command

Use this for automated end-to-end processing.

In [None]:
# Run the whole pipeline in a single call instead of the stage-by-stage cells
# above (uncomment to use).
# output_files = pipeline.run_full_pipeline(
#     car_model,
#     max_videos_to_transcribe=10,
#     skip_transcription=True  # Set to False to include transcription
# )

---
## Multi-Model Comparison (Optional)

Compare sentiment analysis across multiple car models.

In [None]:
# Analyze multiple car models (uncomment to use).
# NOTE(review): the setup cell imports only SCENIC_CONFIG, KOLEOS_CONFIG and
# TORRES_CONFIG; confirm that config also defines SORENTO_CONFIG and
# SANTAFE_CONFIG. SANTAFE_CONFIG is imported here but not used below.
# from config import KOLEOS_CONFIG, SORENTO_CONFIG, SANTAFE_CONFIG
#
# models_to_compare = [SCENIC_CONFIG, KOLEOS_CONFIG, SORENTO_CONFIG]
#
# # Run the full pipeline once per model and keep that model's entry from
# # pipeline.results, keyed by the model identifier.
# all_results = {}
# for model in models_to_compare:
#     pipeline.run_full_pipeline(model, skip_transcription=True)
#     all_results[model.identifier] = pipeline.results[model.identifier]

In [None]:
# Generate comparison visualization (uncomment to use).
# Depends on `all_results` from the multi-model cell above and on
# `analysis_to_dataframe`, which is imported in the Stage 4 preview cell.
# from reports import MultiModelReportGenerator
#
# multi_report = MultiModelReportGenerator(pipeline.gemini_client, config.output_dir)
#
# # Build one analysis DataFrame per model; models whose results have no
# # 'video_analyses' entry are left out of the comparison.
# model_analyses = {}
# for model_id, results in all_results.items():
#     if 'video_analyses' in results:
#         model_analyses[model_id] = analysis_to_dataframe(results['video_analyses'])
#
# # Generate sentiment comparison
# sentiment_comparison = multi_report.generate_sentiment_comparison(model_analyses)
# display(sentiment_comparison)
#
# # Create visualization
# multi_report.visualize_sentiment(sentiment_comparison)

---
## Summary

Pipeline execution complete! Check the `output/` directory for generated reports.

In [None]:
# List generated files
import os
from pathlib import Path

def describe_output_files(directory):
    """Return one display line per regular file directly inside ``directory``.

    Args:
        directory: ``pathlib.Path`` of the folder to inspect.

    Returns:
        A list of strings shaped like ``"   name (12.3 KB)"``, ordered by
        sorted path. The list is empty when ``directory`` is missing, is not
        a directory, or contains no regular files.
    """
    if not directory.is_dir():
        return []
    return [
        f"   {entry.name} ({entry.stat().st_size / 1024:.1f} KB)"
        for entry in sorted(directory.iterdir())
        # Sub-directories are skipped: their st_size is not a content size.
        if entry.is_file()
    ]


# Must match the output_dir value given to PipelineConfig in the setup section.
output_dir = Path("output")
output_lines = describe_output_files(output_dir)
if output_lines:
    print("📁 Output files:")
    print("\n".join(output_lines))
else:
    print("No output files generated yet.")