In [None]:
import pickle

import numpy as np
import pandas as pd

from sklearn.cluster import DBSCAN
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

In [None]:

# Read imdb_top_1000.csv into a DataFrame and preview its first rows
df = pd.read_csv(filepath_or_buffer="imdb_top_1000.csv")
df.head(n=5)

In [None]:

# Sanity check: (number of rows, number of columns) of the loaded frame
df.shape

In [None]:

# Build one lower-cased text document per movie from its genre, plot overview,
# director and all four billed stars (Star1 through Star4, each exactly once).
text_columns = ['Genre', 'Overview', 'Director', 'Star1', 'Star2', 'Star3', 'Star4']

# Missing values become empty strings so a single NaN cannot turn the whole
# document into NaN (which TfidfVectorizer would reject later on).
content_parts = df[text_columns].fillna('').astype(str)

df['content'] = (
    content_parts[text_columns[0]]
    .str.cat(content_parts[text_columns[1:]], sep=' ')
    .str.lower()
)

df.head()

In [None]:

# TF-IDF over the combined `content` text (not just the overview):
# English stop words are dropped and the vocabulary is capped at 300 terms.
tfidf = TfidfVectorizer(max_features=300, stop_words='english')
overview_tfidf = tfidf.fit_transform(df['content'])

# Feature matrix: the IMDB rating as the first column, followed by the
# dense TF-IDF term weights.
features = np.concatenate(
    (df[['IMDB_Rating']].to_numpy(), overview_tfidf.toarray()),
    axis=1,
)


features

In [None]:

# Standardise every feature column so the rating and the TF-IDF weights
# contribute on a comparable scale.
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features)

# Project onto 50 principal components.
# With the default svd_solver='auto', scikit-learn may pick the randomized
# solver for a matrix of this size; fixing random_state keeps the projection
# (and the clustering built on top of it) repeatable across kernel restarts.
pca = PCA(n_components=50, random_state=42)
features_reduced = pca.fit_transform(features_scaled)

features_reduced.shape

In [None]:

# Density-based clustering of the PCA-reduced feature vectors.
# DBSCAN assigns the label -1 to samples it considers noise.
dbscan = DBSCAN(metric='euclidean', min_samples=10, eps=10)
dbscan.fit(features_reduced)
df['Cluster'] = dbscan.labels_

# Size of each cluster
df['Cluster'].value_counts()

In [None]:

def recommend_movies(title, df, num_recommendations=6):
    """Recommend movies from the same cluster as ``title``, ranked by text similarity.

    Candidates are the other rows of ``df`` that share the query movie's
    ``Cluster`` label. They are ranked by cosine similarity between their
    TF-IDF vectors and the query's vector, read from the module-level
    ``overview_tfidf`` matrix.

    Parameters
    ----------
    title : str
        Exact ``Series_Title`` of the movie to find recommendations for. If the
        title occurs more than once, the first occurrence is used.
    df : pandas.DataFrame
        Must contain ``Series_Title``, ``Cluster``, ``Overview``,
        ``IMDB_Rating`` and ``Poster_Link``. Row *i* of ``df`` is assumed to
        correspond to row *i* of ``overview_tfidf`` (positional alignment).
    num_recommendations : int, default 6
        Maximum number of movies to return; values <= 0 yield an empty frame.

    Returns
    -------
    pandas.DataFrame or str
        Up to ``num_recommendations`` rows (most similar first) with columns
        ``Series_Title``, ``Overview``, ``IMDB_Rating`` and ``Poster_Link``, or
        the string ``"Movie not found in dataset."`` when ``title`` is absent.

    Notes
    -----
    If ``Cluster`` was produced by DBSCAN, the label -1 denotes noise; such
    movies are still grouped together here like any other label.
    """
    output_columns = ['Series_Title', 'Overview', 'IMDB_Rating', 'Poster_Link']

    # Work with row *positions* so the lookup into the TF-IDF matrix stays
    # correct even when df does not carry a default 0..n-1 index.
    title_positions = np.flatnonzero((df['Series_Title'] == title).to_numpy())
    if title_positions.size == 0:
        return "Movie not found in dataset."
    query_pos = int(title_positions[0])

    cluster_labels = df['Cluster'].to_numpy()
    candidate_pos = np.flatnonzero(cluster_labels == cluster_labels[query_pos])
    # Exclude the query movie explicitly rather than assuming it is always the
    # single top-ranked hit (ties or an all-zero vector would break that).
    candidate_pos = candidate_pos[candidate_pos != query_pos]

    top_n = max(int(num_recommendations), 0)
    if candidate_pos.size == 0 or top_n == 0:
        return df.iloc[candidate_pos[:0]][output_columns].reset_index(drop=True)

    similarities = cosine_similarity(
        overview_tfidf[query_pos], overview_tfidf[candidate_pos]
    ).flatten()

    # Descending similarity; the stable sort keeps dataset order among ties.
    ranked = np.argsort(-similarities, kind='stable')[:top_n]

    recommendations = df.iloc[candidate_pos[ranked]][output_columns]
    return recommendations.reset_index(drop=True)

# Demo: recommendations for a well-known title
recommend_movies(title="The Shawshank Redemption", df=df)

In [None]:

# Demo: recommendations for a second title
recommend_movies(title="Network", df=df)

In [None]:
# Persist the clustered table (including the Cluster column) as CSV
df.to_csv("clustered_df.csv", index=False)

# Persist the TF-IDF matrix. The context manager guarantees the file handle
# is flushed and closed even if pickling fails part-way through.
with open("overview_tfidf.pkl", "wb") as tfidf_file:
    pickle.dump(overview_tfidf, tfidf_file)