In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.cluster import DBSCAN
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Load dataset
df = pd.read_csv("imdb_top_1000.csv")
df.head()

Unnamed: 0,Poster_Link,Series_Title,Released_Year,Certificate,Runtime,Genre,IMDB_Rating,Overview,Meta_score,Director,Star1,Star2,Star3,Star4,No_of_Votes,Gross
0,https://m.media-amazon.com/images/M/MV5BMDFkYT...,The Shawshank Redemption,1994,A,142 min,Drama,9.3,Two imprisoned men bond over a number of years...,80.0,Frank Darabont,Tim Robbins,Morgan Freeman,Bob Gunton,William Sadler,2343110,28341469
1,https://m.media-amazon.com/images/M/MV5BM2MyNj...,The Godfather,1972,A,175 min,"Crime, Drama",9.2,An organized crime dynasty's aging patriarch t...,100.0,Francis Ford Coppola,Marlon Brando,Al Pacino,James Caan,Diane Keaton,1620367,134966411
2,https://m.media-amazon.com/images/M/MV5BMTMxNT...,The Dark Knight,2008,UA,152 min,"Action, Crime, Drama",9.0,When the menace known as the Joker wreaks havo...,84.0,Christopher Nolan,Christian Bale,Heath Ledger,Aaron Eckhart,Michael Caine,2303232,534858444
3,https://m.media-amazon.com/images/M/MV5BMWMwMG...,The Godfather: Part II,1974,A,202 min,"Crime, Drama",9.0,The early life and career of Vito Corleone in ...,90.0,Francis Ford Coppola,Al Pacino,Robert De Niro,Robert Duvall,Diane Keaton,1129952,57300000
4,https://m.media-amazon.com/images/M/MV5BMWU4N2...,12 Angry Men,1957,U,96 min,"Crime, Drama",9.0,A jury holdout attempts to prevent a miscarria...,96.0,Sidney Lumet,Henry Fonda,Lee J. Cobb,Martin Balsam,John Fiedler,689845,4360000


In [3]:
df['content'] = df['Genre'] + " " + df["Overview"] + " " + df['Director'] + " " + df['Star1'] +  " " + df['Star2'] +  " " + df['Star2'] + " " + df['Star4']
 
df['content'] = df['content'].str.lower()

df.head()

Unnamed: 0,Poster_Link,Series_Title,Released_Year,Certificate,Runtime,Genre,IMDB_Rating,Overview,Meta_score,Director,Star1,Star2,Star3,Star4,No_of_Votes,Gross,content
0,https://m.media-amazon.com/images/M/MV5BMDFkYT...,The Shawshank Redemption,1994,A,142 min,Drama,9.3,Two imprisoned men bond over a number of years...,80.0,Frank Darabont,Tim Robbins,Morgan Freeman,Bob Gunton,William Sadler,2343110,28341469,drama two imprisoned men bond over a number of...
1,https://m.media-amazon.com/images/M/MV5BM2MyNj...,The Godfather,1972,A,175 min,"Crime, Drama",9.2,An organized crime dynasty's aging patriarch t...,100.0,Francis Ford Coppola,Marlon Brando,Al Pacino,James Caan,Diane Keaton,1620367,134966411,"crime, drama an organized crime dynasty's agin..."
2,https://m.media-amazon.com/images/M/MV5BMTMxNT...,The Dark Knight,2008,UA,152 min,"Action, Crime, Drama",9.0,When the menace known as the Joker wreaks havo...,84.0,Christopher Nolan,Christian Bale,Heath Ledger,Aaron Eckhart,Michael Caine,2303232,534858444,"action, crime, drama when the menace known as ..."
3,https://m.media-amazon.com/images/M/MV5BMWMwMG...,The Godfather: Part II,1974,A,202 min,"Crime, Drama",9.0,The early life and career of Vito Corleone in ...,90.0,Francis Ford Coppola,Al Pacino,Robert De Niro,Robert Duvall,Diane Keaton,1129952,57300000,"crime, drama the early life and career of vito..."
4,https://m.media-amazon.com/images/M/MV5BMWU4N2...,12 Angry Men,1957,U,96 min,"Crime, Drama",9.0,A jury holdout attempts to prevent a miscarria...,96.0,Sidney Lumet,Henry Fonda,Lee J. Cobb,Martin Balsam,John Fiedler,689845,4360000,"crime, drama a jury holdout attempts to preven..."


In [4]:
# TF-IDF for Overview text
tfidf = TfidfVectorizer(stop_words='english', max_features=300)
overview_tfidf = tfidf.fit_transform(df['content'])

# Combine numerical and text features
features = np.hstack([
    df[['IMDB_Rating']].values,
    overview_tfidf.toarray()
    
])


features

array([[9.3       , 0.        , 0.        , ..., 0.35435667, 0.        ,
        0.        ],
       [9.2       , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [9.        , 0.        , 0.21353192, ..., 0.        , 0.        ,
        0.        ],
       ...,
       [7.6       , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [7.6       , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [7.6       , 0.26828371, 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [5]:
# Scale data
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features)

# Dimensionality reduction
pca = PCA(n_components=50)
features_reduced = pca.fit_transform(features_scaled)

In [6]:
# DBSCAN clustering
dbscan = DBSCAN(eps=10, min_samples=10, metric='euclidean')
df['Cluster'] = dbscan.fit_predict(features_reduced)

df['Cluster'].value_counts()

Cluster
 0    899
-1     83
 2     10
 1      8
Name: count, dtype: int64

In [7]:
def recommend_movies(title, df, num_recommendations=5):
    if title not in df['Series_Title'].values:
        return "Movie not found in dataset."

    cluster_label = df[df['Series_Title'] == title]['Cluster'].values[0]


    cluster_movies = df[df['Cluster'] == cluster_label]
    movie_vector = overview_tfidf[df[df['Series_Title'] == title].index[0]]
    similarities = cosine_similarity(movie_vector, overview_tfidf[cluster_movies.index]).flatten()
    
    similar_indices = similarities.argsort()[-(num_recommendations + 1):-1][::-1]
    
    # Return selected columns in a DataFrame
    recommendations = cluster_movies.iloc[similar_indices][['Series_Title', 'Overview', 'IMDB_Rating', 'Poster_Link']]

    return recommendations.reset_index(drop=True)

# Example usage
recommend_movies("The Shawshank Redemption", df)

Unnamed: 0,Series_Title,Overview,IMDB_Rating,Poster_Link
0,Mystic River,The lives of three men who were childhood frie...,7.9,https://m.media-amazon.com/images/M/MV5BMTIzND...
1,Network,A television network cynically exploits a dera...,8.1,https://m.media-amazon.com/images/M/MV5BZGNjYj...
2,Fa yeung nin wah,"Two neighbors, a woman and a man, form a stron...",8.1,https://m.media-amazon.com/images/M/MV5BYjZjOD...
3,Reservoir Dogs,When a simple jewelry heist goes horribly wron...,8.3,https://m.media-amazon.com/images/M/MV5BZmExNm...
4,Funny Games,"Two violent young men take a mother, father, a...",7.6,https://m.media-amazon.com/images/M/MV5BMTkyNT...


In [9]:
recommend_movies("Judgment at Nuremberg",df)

Unnamed: 0,Series_Title,Overview,IMDB_Rating,Poster_Link
0,Full Metal Jacket,A pragmatic U.S. Marine observes the dehumaniz...,8.3,https://m.media-amazon.com/images/M/MV5BNzkxOD...
1,Underground,A group of Serbian socialists prepares for the...,8.1,https://m.media-amazon.com/images/M/MV5BNzI4YT...
2,Beasts of No Nation,"A drama based on the experiences of Agu, a chi...",7.7,https://m.media-amazon.com/images/M/MV5BMTYwMz...
3,Paths of Glory,"After refusing to attack an enemy position, a ...",8.4,https://m.media-amazon.com/images/M/MV5BNjViMm...
4,Guess Who's Coming to Dinner,A couple's attitudes are challenged when their...,7.8,https://m.media-amazon.com/images/M/MV5BZTVmMT...


In [10]:
df.to_csv("clustered_df.csv",index=False)


import pickle
pickle.dump(overview_tfidf,open("overview_tfidf.pkl",'wb'))