In [1]:
import hdbscan
import joblib
import logging
import matplotlib.pyplot as plt
import numpy as np
import os
import pickle
import pandas as pd
import seaborn as sns
import sqlite3

from collections import Counter
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler, MultiLabelBinarizer, MinMaxScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, accuracy_score, f1_score, roc_auc_score, classification_report, confusion_matrix, silhouette_score
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.neighbors import NearestNeighbors
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Configure logging: INFO and above on the root logger, timestamped records.
_LOG_FORMAT = '%(asctime)s - %(levelname)s - %(message)s'
logging.basicConfig(format=_LOG_FORMAT, level=logging.INFO)

# Logger used by the pipeline functions defined in this notebook.
logger = logging.getLogger(__name__)

In [3]:
# Get data in memory
def load_data(db_path, query, params=None):
    """Run ``query`` against a SQLite database and return the rows as a DataFrame.

    Args:
        db_path: Path passed to ``sqlite3.connect``.
        query: SQL text to execute.
        params: Optional bind parameters forwarded to ``pd.read_sql_query``.
            Defaults to ``None`` (no parameters).

    Returns:
        pandas.DataFrame holding the query result.

    Raises:
        Whatever ``sqlite3.connect`` or ``pd.read_sql_query`` raise; the
        connection is closed before the exception propagates.
    """
    logger.info("Loading data from database...")

    conn = sqlite3.connect(db_path)
    try:
        df = pd.read_sql_query(query, conn, params=params)
    finally:
        # Always release the handle, even when the query itself fails.
        conn.close()

    logger.info(f"Data loaded successfully with shape: {df.shape}")

    return df

In [4]:
# Preprocess data
def preprocess(df):
    """Clean the raw movie frame and return a new, de-duplicated frame.

    The input must carry the columns referenced below: ``movieId``,
    ``movie_name``, ``imdb_rating``, ``imdb_votes``, ``imdb_runtime``,
    ``year``, ``tmdb_vote_average``, ``tmdb_votes``, ``language``,
    ``release_year``, ``budget``, ``revenue``, ``genres``, ``directors``,
    ``stars`` and ``production_house``.

    Args:
        df: Raw DataFrame. It is left unmodified.

    Returns:
        A new DataFrame with a contiguous ``0..n-1`` index, one row per
        ``movieId``, averaged ``rating`` / ``votes`` columns, and the columns
        ``movie_name``, ``imdb_runtime``, ``release_year`` and
        ``production_house`` renamed to ``title``, ``runtime``, ``year`` and
        ``productions``.

    Raises:
        KeyError: If one of the expected columns is missing.
    """
    # Work on a private copy so the caller's raw frame is never altered.
    df = df.copy()

    # budget / revenue are dropped below, so they are not cleaned here.

    # Fill missing release_year with year column
    df['release_year'] = df['release_year'].fillna(df['year'])

    # Drop rows still missing a year, then duplicated movies (first one wins).
    # The index is reset only AFTER both filters so the result is contiguous;
    # a gapped index would misalign later column-wise concatenation.
    df = (
        df.dropna(subset=['release_year', 'year'])
        .drop_duplicates(subset=['movieId'])
        .reset_index(drop=True)
    )

    # Cast data types
    df['release_year'] = df['release_year'].astype(int)
    df['year'] = df['year'].astype(int)

    # Strip whitespace from the label columns.
    # NOTE(review): astype(str) turns missing values into the literal strings
    # 'None' / 'nan', which is why these columns report zero nulls afterwards.
    for col in ['genres', 'directors', 'stars', 'production_house']:
        df[col] = df[col].astype(str).str.strip()

    # Take average of IMDb and TMDb ratings
    df['rating'] = df[['imdb_rating', 'tmdb_vote_average']].mean(axis=1)

    # Take average of IMDb and TMDb votes
    df['votes'] = df[['imdb_votes', 'tmdb_votes']].mean(axis=1)

    # Drop the columns made redundant by rating / votes, the mostly-empty
    # budget / revenue, and year / language; then rename for later stages.
    df = df.drop(
        columns=['imdb_rating', 'tmdb_vote_average', 'imdb_votes',
                 'tmdb_votes', 'budget', 'revenue', 'year', 'language']
    ).rename(columns={
        'movie_name': 'title',
        'imdb_runtime': 'runtime',
        'release_year': 'year',
        'production_house': 'productions'
    })

    # Final check
    print("Data cleaned successfully.")
    print("Remaining nulls:\n", df.isna().sum())

    return df

In [5]:
# Perform feature engineering before training models
def feature_engineering(processed_df, n_top_directors=50, n_top_stars=100):
    """Add genre one-hot columns and top-K director / star indicator columns.

    Args:
        processed_df: DataFrame with comma-separated string (or list) columns
            ``genres``, ``directors`` and ``stars``. It is left unmodified.
        n_top_directors: Number of most frequent directors to encode.
        n_top_stars: Number of most frequent stars to encode.

    Returns:
        A new DataFrame with a contiguous ``0..n-1`` index containing, in
        order: the input columns (``genres`` / ``directors`` / ``stars``
        converted to lists), ``genres_raw`` / ``directors_raw`` /
        ``stars_raw`` string backups, ``genre_*`` columns, ``director_*``
        columns, ``has_top_director``, ``star_*`` columns and
        ``has_top_star``.
    """
    # Literal strings left behind when a missing value is cast with
    # astype(str); they must not be ranked as if they were real people.
    missing_tokens = {'', 'None', 'nan'}

    def to_list(value):
        """Return the stripped names from a comma-separated string or a list."""
        parts = value if isinstance(value, list) else str(value).split(',')
        return [part.strip() for part in parts]

    def top_names(series, limit):
        """Return the ``limit`` most frequent names, most frequent first."""
        counts = Counter(
            name
            for names in series
            for name in names
            if name not in missing_tokens
        )
        # A list (not a set) keeps the feature column order reproducible
        # between runs instead of depending on string hash randomisation.
        return [name for name, _ in counts.most_common(limit)]

    def indicator_frame(series, names, prefix):
        """Build one 0/1 column per name, indexed like ``series``."""
        columns = {
            f'{prefix}_{name.lower().replace(" ", "_")}':
                series.apply(lambda lst, name=name: int(name in lst))
            for name in names
        }
        return pd.DataFrame(columns, index=series.index)

    # New frame with a contiguous index: the input stays untouched and every
    # encoded block built below lines up row-for-row in the final concat.
    df = processed_df.reset_index(drop=True)

    print(df.shape)

    for col in ('genres', 'directors', 'stars'):
        # Backup raw columns for UI filtering
        df[f'{col}_raw'] = df[col].apply(lambda x: ','.join(x) if isinstance(x, list) else x)
        # Convert strings to lists
        df[col] = df[col].apply(to_list)

    # Encode genres... encode all of them as only 24 distinct genres are there
    mlb_genre = MultiLabelBinarizer()
    genre_encoded = mlb_genre.fit_transform(df['genres'])
    df_genre = pd.DataFrame(
        genre_encoded,
        columns=[f"genre_{g}" for g in mlb_genre.classes_],
        index=df.index,
    )

    # Select top-K directors and stars... distinct count of stars is around 40k
    # and for directors it's around 20k. Encoding all of them would be far too
    # sparse, so only the most frequent names get their own column.
    df_directors = indicator_frame(
        df['directors'], top_names(df['directors'], n_top_directors), 'director')
    df_stars = indicator_frame(
        df['stars'], top_names(df['stars'], n_top_stars), 'star')

    # 1 when the movie has at least one encoded name; any() also yields a
    # clean 0 when no name was encoded at all (K == 0).
    has_top_director = df_directors.any(axis=1).astype(int).rename('has_top_director')
    has_top_star = df_stars.any(axis=1).astype(int).rename('has_top_star')

    df = pd.concat(
        [df, df_genre, df_directors, has_top_director, df_stars, has_top_star],
        axis=1,
    )

    print("Feature engineering complete. Final shape:", df.shape)

    return df

In [6]:
# Silence every warning category for the rest of the kernel session.
import warnings
warnings.simplefilter('ignore')

In [7]:
def train_knn(df, output_dir="models/knn_model_files",
              sample_title="The Godfather", sample_top_n=50):
    """Fit a cosine nearest-neighbour model on the movie features and save it.

    Args:
        df: Feature-engineered DataFrame with ``genre_*``, ``director_*`` and
            ``star_*`` columns plus ``runtime``, ``popularity``, ``rating``,
            ``votes``, ``title``, ``year``, ``genres`` and the ``*_raw``
            columns. It is left unmodified.
        output_dir: Directory that receives the saved artefacts.
        sample_title: Title used for a printed sanity-check query; pass
            ``None`` to skip it.
        sample_top_n: Number of neighbours requested by the sanity check.

    Returns:
        None. Writes ``movies_dataset.csv``, ``feature_matrix.csv``,
        ``scaler.pkl`` and ``knn_model.pkl`` into ``output_dir``.
    """
    # --- Step 1: Prepare features ---
    genre_cols = [col for col in df.columns if col.startswith('genre_')]
    director_cols = [col for col in df.columns if col.startswith('director_')]
    star_cols = [col for col in df.columns if col.startswith('star_')]
    numerical_cols = ['runtime', 'popularity', 'rating', 'votes']

    feature_cols = genre_cols + director_cols + star_cols + numerical_cols

    # Remove unusable rows BEFORE scaling (the scaler cannot be fitted on
    # inf), then keep the metadata frame and the feature matrix row-aligned so
    # a position in one always refers to the same movie in the other.
    X = df[feature_cols].replace([np.inf, -np.inf], np.nan).dropna()
    movies = df.loc[X.index].reset_index(drop=True)
    X = X.reset_index(drop=True)

    # Normalize numeric features
    scaler = MinMaxScaler()
    X[numerical_cols] = scaler.fit_transform(X[numerical_cols])

    # Fit NearestNeighbors model
    knn_model = NearestNeighbors(metric='cosine', algorithm='brute')
    knn_model.fit(X)

    def genre_overlap_score(base_pos, candidate_positions):
        """Mean Jaccard overlap between the base movie's genres and each candidate's."""
        base_genres = set(movies['genres'].iloc[base_pos])
        scores = []

        for pos in candidate_positions:
            rec_genres = set(movies['genres'].iloc[pos])
            union = base_genres | rec_genres
            # Two empty genre sets have nothing in common to measure.
            scores.append(len(base_genres & rec_genres) / len(union) if union else 0.0)

        return float(np.mean(scores)) if scores else float('nan')

    def get_similar_movies(movie_title, top_n=5):
        """Return the ``top_n`` nearest movies to ``movie_title`` (case-insensitive)."""
        title_matches = (movies['title'].str.lower() == movie_title.lower()).to_numpy()
        matches = np.flatnonzero(title_matches)
        if len(matches) == 0:
            print("Movie not found.")
            return pd.DataFrame()
        pos = int(matches[0])

        # Ask for one extra neighbour (the query itself), capped at the
        # number of fitted rows so kneighbors never rejects the request.
        n_neighbors = min(top_n + 1, len(X))
        distances, indices = knn_model.kneighbors(X.iloc[[pos]], n_neighbors=n_neighbors)

        # Drop the query by position instead of assuming it is ranked first:
        # a movie with identical features can tie at distance 0.
        keep = indices[0] != pos
        result_indices = indices[0][keep][:top_n]
        result_distances = distances[0][keep][:top_n]

        # Genre Overlap Score
        genre_score = genre_overlap_score(pos, result_indices)

        # Display distances and score
        print(f"🔍 Average Genre Overlap Score: {genre_score:.3f}")
        print("📏 Cosine Distances (lower is better):", result_distances)

        # Final result
        return movies.iloc[result_indices][
            ['title', 'rating', 'year', 'genres_raw', 'stars_raw', 'directors_raw']]

    # Sanity-check the fitted model on a known title
    if sample_title is not None:
        get_similar_movies(sample_title, top_n=sample_top_n)

    # save model
    # Make sure a directory exists
    os.makedirs(output_dir, exist_ok=True)
    # Save the DataFrame with raw columns (row-aligned with the feature matrix)
    movies.to_csv(os.path.join(output_dir, "movies_dataset.csv"), index=False)
    # Save the feature matrix
    X.to_csv(os.path.join(output_dir, "feature_matrix.csv"), index=False)
    # Save the scaler
    joblib.dump(scaler, os.path.join(output_dir, "scaler.pkl"))
    # Save the KNN model
    joblib.dump(knn_model, os.path.join(output_dir, "knn_model.pkl"))

    print(f"All models and data saved to '{output_dir}/'")

In [8]:
# Train kmean clustering model
def train_kmean(df, k_range=range(2, 26), output_dir="models/kmeans_model_files"):
    """Tune ``k`` for KMeans by silhouette score, label the movies and save everything.

    The chosen ``k`` is the smallest candidate whose silhouette score lies
    within 5% of the best score observed.

    Args:
        df: Feature-engineered DataFrame with ``genre_*``, ``director_*`` and
            ``star_*`` columns plus ``runtime``, ``popularity``, ``rating``
            and ``votes``. A ``kmeans_cluster`` column is added to it IN
            PLACE (rows excluded from clustering receive NaN).
        k_range: Candidate cluster counts, tried in iteration order.
        output_dir: Directory that receives the saved artefacts.

    Returns:
        None. Writes ``kmeans_model.pkl``, ``kmeans_scaler.pkl``,
        ``kmeans_features.csv``, ``movies_with_kmeans.csv`` and
        ``kmeans_metadata.txt`` into ``output_dir``.

    Raises:
        ValueError: If ``k_range`` is empty.
    """
    k_values = list(k_range)
    if not k_values:
        raise ValueError("k_range must contain at least one candidate k")

    # Get required features for the model
    genre_cols = [col for col in df.columns if col.startswith('genre_')]
    director_cols = [col for col in df.columns if col.startswith('director_')]
    star_cols = [col for col in df.columns if col.startswith('star_')]
    numerical_cols = ['runtime', 'popularity', 'rating', 'votes']

    feature_cols = genre_cols + director_cols + star_cols + numerical_cols

    # Remove unusable rows before scaling; X keeps df's index labels so the
    # cluster labels can be written back to the right rows.
    X = df[feature_cols].replace([np.inf, -np.inf], np.nan).dropna().copy()

    # Normalize numerical columns
    scaler = MinMaxScaler()
    X[numerical_cols] = scaler.fit_transform(X[numerical_cols])

    # Tune k using silhouette score
    print("Tuning KMeans...")
    candidates = []  # (k, silhouette, fitted model, labels)
    for k in k_values:
        kmeans = KMeans(n_clusters=k, random_state=42, n_init='auto')
        labels = kmeans.fit_predict(X)
        score = silhouette_score(X, labels)
        candidates.append((k, score, kmeans, labels))

        print(f"k = {k} | inertia = {kmeans.inertia_:.2f} | silhouette = {score:.4f}")

    # Pick smallest k that is within 5% of the best silhouette. The tolerance
    # is taken on |best| so it still works for a negative best score, where
    # 0.95 * best would be ABOVE the best and nothing would qualify.
    max_score = max(score for _, score, _, _ in candidates)
    threshold = max_score - 0.05 * abs(max_score)
    best_k, best_score, final_kmeans, best_labels = next(
        candidate for candidate in candidates if candidate[1] >= threshold
    )

    # The model fitted during tuning is reused: with a fixed random_state a
    # refit on the same data would reproduce it.
    df['kmeans_cluster'] = pd.Series(best_labels, index=X.index)

    print(f"Best KMeans: k={best_k}, silhouette={best_score:.4f}")

    # Save models, labels, and feature matrix
    os.makedirs(output_dir, exist_ok=True)
    # Save the model
    joblib.dump(final_kmeans, os.path.join(output_dir, "kmeans_model.pkl"))
    # Save the scaler
    joblib.dump(scaler, os.path.join(output_dir, "kmeans_scaler.pkl"))
    # Save the feature matrix (optional)
    X.to_csv(os.path.join(output_dir, "kmeans_features.csv"), index=False)
    # Save the clustered rows, in the same order as the feature matrix
    df.loc[X.index].to_csv(os.path.join(output_dir, "movies_with_kmeans.csv"), index=False)

    # Save best k value and score (metadata)
    with open(os.path.join(output_dir, "kmeans_metadata.txt"), "w") as f:
        f.write(f"Best k: {best_k}\nSilhouette Score: {best_score:.4f}")
    


In [9]:
# Train HDBScan model
def train_hdbscan(df, min_cluster_sizes=range(10, 101, 10),
                  output_dir="models/hdbscan_model_files"):
    """Tune HDBSCAN's ``min_cluster_size`` by silhouette score and save the best model.

    Noise points (label ``-1``) are excluded when scoring a configuration.

    Args:
        df: Feature-engineered DataFrame with ``genre_*``, ``director_*`` and
            ``star_*`` columns plus ``runtime``, ``popularity``, ``rating``
            and ``votes``. When a valid clustering is found, a
            ``hdbscan_cluster`` column is added to it IN PLACE (rows excluded
            from clustering receive NaN).
        min_cluster_sizes: Candidate ``min_cluster_size`` values.
        output_dir: Directory that receives the saved artefacts.

    Returns:
        None. On success writes ``hdbscan_model.pkl``, ``hdbscan_scaler.pkl``,
        ``hdbscan_features.csv``, ``movies_with_hdbscan.csv`` and
        ``hdbscan_metadata.txt`` into ``output_dir``. When no configuration
        yields at least two clusters, nothing is written.
    """
    # Prepare Feature Matrix X
    genre_cols = [col for col in df.columns if col.startswith('genre_')]
    director_cols = [col for col in df.columns if col.startswith('director_')]
    star_cols = [col for col in df.columns if col.startswith('star_')]
    numerical_cols = ['runtime', 'popularity', 'rating', 'votes']

    feature_cols = genre_cols + director_cols + star_cols + numerical_cols

    # Remove unusable rows before scaling; X keeps df's index labels so the
    # cluster labels can be written back to the right rows.
    X = df[feature_cols].replace([np.inf, -np.inf], np.nan).dropna().copy()

    # Normalize numeric features
    scaler = MinMaxScaler()
    X[numerical_cols] = scaler.fit_transform(X[numerical_cols])

    # Tune min_cluster_size via silhouette score
    print("Tuning HDBSCAN...")
    best_hdb_score = -1
    best_min_cluster = None
    best_hdb_labels = None
    best_hdb_model = None

    for min_cluster_size in min_cluster_sizes:
        hdb = hdbscan.HDBSCAN(min_cluster_size=min_cluster_size, prediction_data=True)
        labels = hdb.fit_predict(X)

        valid = labels != -1  # Ignore noise points

        # silhouette_score needs at least two distinct clusters among the
        # scored points; "some non-noise point exists" is not enough.
        if np.unique(labels[valid]).size < 2:
            print(f"min_cluster_size = {min_cluster_size} → fewer than 2 clusters, skipped")
            continue

        score = silhouette_score(X[valid], labels[valid])
        print(f"min_cluster_size = {min_cluster_size} → silhouette = {score:.4f}")

        if score > best_hdb_score:
            best_hdb_score = score
            best_min_cluster = min_cluster_size
            best_hdb_labels = labels
            best_hdb_model = hdb

    if best_hdb_model is None:
        # Nothing usable was fitted: do not pickle a None "model".
        print("HDBSCAN couldn't find valid clusters. All are noise.")
        return

    # Assign labels to df, aligned on the rows that were actually clustered
    df['hdbscan_cluster'] = pd.Series(best_hdb_labels, index=X.index)

    print(f"Best HDBSCAN: min_cluster_size = {best_min_cluster} with silhouette = {best_hdb_score:.4f}")

    # Save model and results
    os.makedirs(output_dir, exist_ok=True)
    # Save HDBSCAN model
    joblib.dump(best_hdb_model, os.path.join(output_dir, "hdbscan_model.pkl"))
    # Save the scaler too, as the KNN and KMeans trainers do, so new rows can
    # be transformed the same way before prediction
    joblib.dump(scaler, os.path.join(output_dir, "hdbscan_scaler.pkl"))
    # Save feature matrix
    X.to_csv(os.path.join(output_dir, "hdbscan_features.csv"), index=False)
    # Save the clustered rows, in the same order as the feature matrix
    df.loc[X.index].to_csv(os.path.join(output_dir, "movies_with_hdbscan.csv"), index=False)
    # Save metadata
    with open(os.path.join(output_dir, "hdbscan_metadata.txt"), "w") as f:
        f.write(f"Best min_cluster_size: {best_min_cluster}\nSilhouette Score: {best_hdb_score:.4f}")

    print(f"HDBSCAN model and data saved to '{output_dir}/'")

In [10]:
# Main Function
def main(db_path='../Data/movies.db'):
    """Run the whole pipeline: load, preprocess, engineer features, train three models.

    Args:
        db_path: Path of the SQLite database to read.

    Returns:
        None. The training functions write their artefacts to disk.
    """
    # Database query: one row per movie, with genres / directors / stars /
    # production companies collapsed into comma-separated strings.
    # NOTE(review): the WHERE clause filters on the bare name `language`;
    # confirm it resolves to t.original_language and not to a same-named
    # column of another joined table.
    query = """SELECT
        l.movieid,
        i.movie_name,
        i.rating AS imdb_rating,
        i.votes AS imdb_votes,
        i.runtime AS imdb_runtime,
        i.year AS year,
        t.vote_average AS tmdb_vote_average,
        t.vote_count AS tmdb_votes,
        t.original_language as language,
        t.popularity as popularity,
        t.release_year,
        t.budget as budget,
        t.revenue as revenue,
        GROUP_CONCAT(DISTINCT g.genre_name) AS genres,
        GROUP_CONCAT(DISTINCT d.director_name) AS directors,
        GROUP_CONCAT(DISTINCT s.star_name) AS stars,
        GROUP_CONCAT(DISTINCT p.production_companies_name) AS production_house
    FROM links l
    JOIN imdb i ON l.imdbid = i.movie_id
    LEFT JOIN tmdb t ON l.tmdbid = t.id
    LEFT JOIN genre_imdb gi ON i.movie_id = gi.movie_id
    LEFT JOIN genre g ON gi.genre_id = g.genre_id
    LEFT JOIN director_imdb di ON i.movie_id = di.movie_id
    LEFT JOIN director d ON di.director_id = d.director_id
    LEFT JOIN star_imdb si ON i.movie_id = si.movie_id
    LEFT JOIN star s ON si.star_id = s.star_id
    LEFT JOIN production_companies_tmdb pi ON t.id = pi.id
    LEFT JOIN production_companies p ON pi.production_companies_id = p.production_companies_id 
    WHERE language = 'English'
    GROUP BY l.movieid
    """

    # Execute pipeline
    df = load_data(db_path, query)

    processed_df = preprocess(df)

    updated_df = feature_engineering(processed_df)

    train_knn(updated_df)

    train_kmean(updated_df)

    train_hdbscan(updated_df)

In [11]:
# Run the end-to-end pipeline; the log lines and printed text below are the output captured from this call.
main()

2025-04-23 08:45:05,951 - INFO - Loading data from database...
2025-04-23 08:45:09,285 - INFO - Data loaded successfully with shape: (28508, 17)


Data cleaned successfully.
Remaining nulls:
 movieId        0
title          0
runtime        0
popularity     0
year           0
genres         0
directors      0
stars          0
productions    0
rating         0
votes          0
dtype: int64
(28403, 11)
Feature engineering complete. Final shape: (28403, 190)
🔍 Average Genre Overlap Score: 0.960
📏 Cosine Distances (lower is better): [0.06333983 0.12675562 0.13239012 0.15214444 0.15265836 0.15417086
 0.1595321  0.16557903 0.16878389 0.17228716 0.17332812 0.17339539
 0.17558436 0.1761561  0.17632592 0.17690186 0.17699316 0.17779729
 0.17968413 0.1800957  0.18057617 0.18090114 0.18180631 0.18193097
 0.18210249 0.18248988 0.18268445 0.18295089 0.18297554 0.1830694
 0.18331371 0.18357737 0.18367011 0.1837718  0.18394948 0.18395535
 0.184012   0.1842189  0.18435058 0.18449357 0.18449861 0.18461865
 0.18468832 0.18533345 0.18556607 0.18569331 0.18628593 0.18636269
 0.18652965 0.18657372]
All models and data saved to 'models/knn_model_files/