In [None]:
import os
import pandas as pd
import requests
import pickle
import time
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from dotenv import load_dotenv

# Populate the process environment from the .env file one directory up
load_dotenv(dotenv_path='../.env')

# TMDB API key used by every request below (None when the variable is unset)
api_key = os.environ.get("REACT_APP_TMDB_API_KEY")

def fetch_with_retry(url, retries=3, delay=5, timeout=10):
    """Fetch a URL with GET, retrying when the request fails.

    Args:
        url: Address to request.
        retries: Maximum number of attempts before giving up.
        delay: Seconds to wait between two attempts.
        timeout: Seconds to wait on the server before an attempt counts as
            failed; without it a stalled connection would block forever.

    Returns:
        The response of the first attempt that did not raise.

    Raises:
        Exception: If every attempt failed (or ``retries`` is not positive).
            The last request error, if any, is attached as the cause.
    """
    last_error = None
    for attempt in range(1, retries + 1):
        try:
            response = requests.get(url, timeout=timeout)
            response.raise_for_status()  # Raise an exception for bad status codes
            return response
        except requests.exceptions.RequestException as e:
            last_error = e
            if attempt < retries:
                print(f"Request failed: {e}. Retrying in {delay} seconds...")
                time.sleep(delay)
            else:
                # No further attempt follows, so do not announce or wait
                # for a retry that will never happen.
                print(f"Request failed: {e}.")
    raise Exception(f"Failed to fetch {url} after {retries} retries.") from last_error

def fetch_movie_data(total_pages=250):
    """Fetch movies from the TMDB "Top Rated" endpoint into a DataFrame.

    Args:
        total_pages: Number of result pages to request, starting at page 1.
            The default of 250 pages targets roughly 5000 movies at
            20 results per page.

    Returns:
        A DataFrame with one row per movie as returned by the API,
        de-duplicated by the ``id`` column and with a fresh 0..n-1 index.

    Raises:
        Exception: If the first page cannot be fetched. Failures on later
            pages are reported and the page is skipped.
    """
    print("Fetching movie data...")
    # Using the "Top Rated" endpoint to get a good base of movies
    base_url = (
        'https://api.themoviedb.org/3/movie/top_rated'
        f'?api_key={api_key}&language=en-US'
    )

    # The first page is mandatory: an error here propagates to the caller.
    response = fetch_with_retry(f'{base_url}&page=1')
    data = response.json()['results']

    # Fetch more pages to get a larger dataset
    if total_pages > 1:
        print(f"Fetching pages 2 to {total_pages}...")
    for page in range(2, total_pages + 1):
        print(f"Fetching page {page}...")
        try:
            response = fetch_with_retry(f'{base_url}&page={page}')
            data.extend(response.json()['results'])
        except Exception as e:
            print(f"Warning: {e}. Skipping page {page}.")
        time.sleep(0.1)  # Short delay to be polite to the API

    df = pd.DataFrame(data)
    # A movie can show up on more than one page if the ranking shifts
    # between requests; keep the first occurrence only. The index is reset
    # so that row positions stay contiguous for downstream consumers.
    if 'id' in df.columns:
        df = df.drop_duplicates(subset='id').reset_index(drop=True)
    print(f"Fetched {len(df)} movies.")
    return df

def fetch_credits_and_keywords(movie_id):
    """Fetch director(s), leading cast and keywords for a single movie.

    Args:
        movie_id: TMDB identifier of the movie.

    Returns:
        A tuple ``(director, cast, keywords)`` of three lists of names:
        crew members whose job is 'Director', the first three cast entries,
        and all keywords. Three empty lists are returned when the details
        cannot be fetched or decoded.
    """
    credits_url = f'https://api.themoviedb.org/3/movie/{movie_id}/credits?api_key={api_key}'
    keywords_url = f'https://api.themoviedb.org/3/movie/{movie_id}/keywords?api_key={api_key}'

    try:
        # Decode inside the try block so that a malformed body is handled
        # like any other fetch failure instead of aborting the whole run.
        credits_data = fetch_with_retry(credits_url).json()
        keywords_data = fetch_with_retry(keywords_url).json()
    except Exception as e:
        print(f"Warning: Could not fetch details for movie_id {movie_id}. {e}")
        return [], [], []

    # Extract director from crew; entries lacking a job or name are ignored
    director = [
        member['name']
        for member in credits_data.get('crew', [])
        if member.get('job') == 'Director' and member.get('name')
    ]

    # Extract top 3 cast members (in the order the API lists them)
    cast = [
        member['name']
        for member in credits_data.get('cast', [])[:3]
        if member.get('name')
    ]

    # Extract all keywords
    keywords = [
        entry['name']
        for entry in keywords_data.get('keywords', [])
        if entry.get('name')
    ]

    return director, cast, keywords

def process_data(df):
    """Build a 'tags' text column for vectorization from the movie data.

    For every movie the overview words, genre names, keywords, top cast and
    director(s) are combined into one space-separated string. Multi-word
    names have their spaces removed so each name becomes a single token.

    Args:
        df: DataFrame containing at least the columns 'id', 'title',
            'overview' and 'genre_ids'.

    Returns:
        A new DataFrame with the columns 'id', 'title' and 'tags'.
    """
    print("Processing data...")
    # We only need a few columns
    df = df[['id', 'title', 'overview', 'genre_ids']].copy()

    # For simplicity, we'll map genre_ids to genre names manually from TMDB's genre list
    genre_map = {28:'Action',12:'Adventure',16:'Animation',35:'Comedy',80:'Crime',99:'Documentary',18:'Drama',10751:'Family',14:'Fantasy',36:'History',27:'Horror',10402:'Music',9648:'Mystery',10749:'Romance',878:'Science Fiction',10770:'TV Movie',53:'Thriller',10752:'War',37:'Western'}

    def squash(names):
        # Collapse each multi-word name into a single token
        return [name.replace(" ", "") for name in names]

    # Fetch credits and keywords for each movie with progress updates
    directors = []
    casts = []
    keywords_list = []
    total_movies = len(df)
    print(f"Fetching details for {total_movies} movies...")
    # Count positions explicitly: the DataFrame index is not guaranteed to
    # be 0..n-1, so it cannot serve as a progress counter.
    for position, movie_id in enumerate(df['id'], start=1):
        director, cast, keywords = fetch_credits_and_keywords(movie_id)
        directors.append(director)
        casts.append(cast)
        keywords_list.append(keywords)
        if position % 500 == 0:
            print(f"  Processed {position} / {total_movies} movies...")

    # Clean and combine features into a single 'tags' string
    tags = []
    rows = zip(df['overview'], df['genre_ids'], keywords_list, casts, directors)
    for overview, genre_ids, keywords, cast, director in rows:
        # A missing overview (NaN/None) contributes no words
        words = overview.split() if isinstance(overview, str) else []
        if not isinstance(genre_ids, (list, tuple)):
            genre_ids = []
        # Ids absent from the map are dropped instead of adding empty tokens
        genres = [genre_map[g] for g in genre_ids if g in genre_map]
        tokens = words + squash(genres) + squash(keywords) + squash(cast) + squash(director)
        tags.append(" ".join(tokens))
    df['tags'] = tags

    processed_df = df[['id', 'title', 'tags']].copy()
    print("Data processing complete.")
    return processed_df

def create_model(df):
    """Create the TF-IDF vectors and the pairwise cosine similarity matrix.

    Args:
        df: DataFrame with a 'tags' column holding one text string per movie.

    Returns:
        The square similarity matrix produced by ``cosine_similarity``; the
        entry at (i, j) is the similarity between the tags of rows i and j.
    """
    print("Creating model...")
    tfidf = TfidfVectorizer(max_features=5000, stop_words='english')
    # Keep the vectors sparse: cosine_similarity accepts sparse input and
    # still returns a dense array by default, so converting the
    # (movies x features) matrix to a dense array first only costs memory.
    vectors = tfidf.fit_transform(df['tags'])

    similarity = cosine_similarity(vectors)
    print("Model creation complete.")
    return similarity

def main():
    """Run data fetching, processing and model creation, then save the results.

    Writes 'movies.pkl' (the processed DataFrame as a dict) and
    'similarity.pkl' (the similarity matrix) to the current working
    directory. Any error is reported on stdout instead of being raised.
    """
    try:
        movie_df = fetch_movie_data()
        processed_df = process_data(movie_df)
        similarity_matrix = create_model(processed_df)

        # Save the processed dataframe and similarity matrix; the context
        # managers make sure both files are flushed and closed.
        with open('movies.pkl', 'wb') as movies_file:
            pickle.dump(processed_df.to_dict(), movies_file)
        with open('similarity.pkl', 'wb') as similarity_file:
            pickle.dump(similarity_matrix, similarity_file)

        print("\nSuccessfully created and saved 'movies.pkl' and 'similarity.pkl'")
        print("You can now run the Flask server using 'python app.py'")

    except Exception as e:
        print(f"An error occurred: {e}")
        print("Please ensure your TMDB API key is correct in the .env file.")

# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
