In [None]:
# Mount Google Drive at /content/drive (Colab only); the CSV files read in the
# cells below are addressed through this mount point.

from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the preprocessed IMDb movie table from the mounted Drive folder into `df`,
# the frame every later cell reads from and adds columns to.
df = pd.read_csv('/content/drive/MyDrive/Personalized Movie Recommender Files/Preprocessed_Movies_IMDb.csv')

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4999 entries, 0 to 4998
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   ID            4999 non-null   int64  
 1   Movie Name    4999 non-null   object 
 2   Rating        4999 non-null   float64
 3   Runtime       4999 non-null   float64
 4   Genre         4999 non-null   object 
 5   Metascore     3786 non-null   float64
 6   Plot          4999 non-null   object 
 7   Directors     4999 non-null   object 
 8   Stars         4999 non-null   object 
 9   Votes         4999 non-null   float64
 10  Gross         4999 non-null   float64
 11  Link          4999 non-null   object 
 12  Cleaned_Plot  4999 non-null   object 
dtypes: float64(5), int64(1), object(7)
memory usage: 507.8+ KB


In [None]:
df.head()

Unnamed: 0,ID,Movie Name,Rating,Runtime,Genre,Metascore,Plot,Directors,Stars,Votes,Gross,Link,Cleaned_Plot
0,1,The Shawshank Redemption,1.0,142.0,Drama,82.0,"Over the course of several years, two convicts...","['Frank Darabont', 'Tim Robbins', 'Morgan Free...","['Tim Robbins', 'Morgan Freeman', 'Bob Gunton'...",1.0,0.030258,https://www.imdb.com/title/tt0111161/,course several year two convict form friendshi...
1,2,The Godfather,0.973684,175.0,"Crime, Drama",100.0,"Don Vito Corleone, head of a mafia family, dec...","['Francis Ford Coppola', 'Marlon Brando', 'Al ...","['Marlon Brando', 'Al Pacino', 'James Caan', '...",0.694551,0.144093,https://www.imdb.com/title/tt0068646/,vito corleone head mafia family decides hand e...
2,3,Ramayana: The Legend of Prince Rama,0.973684,135.0,"Animation, Action, Adventure",,An anime adaptation of the Hindu epic the Rama...,"['Ram Mohan', 'Yûgô Sakô', 'Koichi Saski', 'Ar...","['Yûgô Sakô', 'Koichi Saski', 'Arun Govil', 'N...",3.7e-05,1.1e-05,https://www.imdb.com/title/tt0259534/,anime adaptation hindu epic ramayana lord ram ...
3,4,The Chaos Class,0.973684,87.0,"Comedy, Drama",,"Lazy, uneducated students share a very close b...","['Ertem Egilmez', 'Kemal Sunal', 'Münir Özkul'...","['Kemal Sunal', 'Münir Özkul', 'Halit Akçatepe...",0.011588,4.5e-05,https://www.imdb.com/title/tt0252487/,lazy uneducated student share close bond live ...
4,5,Daman,0.947368,121.0,"Adventure, Drama",,"The film is set in 2015. Sid, is a young docto...","['Lenka Debiprasad', 'Vishal Mourya', 'Karan K...","['Vishal Mourya', 'Karan Kandhapan', 'Babushan...",0.001202,1.4e-05,https://www.imdb.com/title/tt17592606/,film set sid young doctor completed mbbs poste...


In [None]:
df.dtypes

Unnamed: 0,0
ID,int64
Movie Name,object
Rating,float64
Runtime,float64
Genre,object
Metascore,float64
Plot,object
Directors,object
Stars,object
Votes,float64


In [None]:
df.isnull().sum()

Unnamed: 0,0
ID,0
Movie Name,0
Rating,0
Runtime,0
Genre,0
Metascore,1213
Plot,0
Directors,0
Stars,0
Votes,0


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Recommender system: content-based similarity over the Genre text.
# Replace missing genres with '' so the vectorizer only ever receives strings.
df['Genre'] = df['Genre'].fillna('')

# TF-IDF encode each movie's genre string (one row of `matrix` per row of `df`).
tfidf = TfidfVectorizer(stop_words='english')
matrix = tfidf.fit_transform(df['Genre'])

# Pairwise cosine similarity between all movies' genre vectors.
cosine_sim = cosine_similarity(matrix, matrix)

# Title -> row label lookup used by the recommend_* functions.
# Series.drop_duplicates() compares *values* (the row labels, which are already
# unique), so it never removed repeated titles. Filter on the index instead so
# every title maps to exactly one row: its first occurrence in `df`.
movie_indx = pd.Series(df.index, index=df['Movie Name'])
movie_indx = movie_indx[~movie_indx.index.duplicated(keep='first')]

def recommend_movies(movie_name, num_recs=5):
    """Return the titles most similar to ``movie_name`` by genre cosine similarity.

    Reads the module-level ``df``, ``cosine_sim`` and ``movie_indx``.

    Args:
        movie_name: Exact title to look up in ``movie_indx``.
        num_recs: Maximum number of titles to return; values <= 0 give ``[]``.

    Returns:
        A list of up to ``num_recs`` titles ordered by decreasing similarity
        (ties keep dataset order), or a "not found" message string when the
        title is not in ``movie_indx``.
    """
    if movie_name not in movie_indx:
        return f"Movie '{movie_name}' not found in the dataset."

    indx = movie_indx[movie_name]
    # A title that occurs more than once makes the lookup return a Series;
    # fall back to its first row so the similarity row below stays 1-D.
    if isinstance(indx, pd.Series):
        indx = indx.iloc[0]

    scores = np.asarray(cosine_sim[indx])

    # Stable sort on the negated scores = descending order with ties kept in
    # dataset order (the same ordering a stable sorted(..., reverse=True) gives).
    order = np.argsort(-scores, kind='stable')

    # Drop the query movie by row, not by assuming it sorts first: movies with
    # an identical genre string tie with it at similarity 1.0, so an earlier
    # row can occupy position 0 and the query would otherwise recommend itself.
    candidates = order[order != indx]
    top_movies_indx = candidates[:max(num_recs, 0)]

    return df['Movie Name'].iloc[top_movies_indx].tolist()

# Smoke test: print up to five genre-similarity recommendations for one title.
# If the title is missing, `recs` is the not-found message string, not a list.
searched_movie = 'Mirror Game'
recs = recommend_movies(searched_movie, num_recs=5)
print(f"Recommendations for '{searched_movie}': {recs}")


Recommendations for 'Mirror Game': ['M', 'Andhadhun', 'The Testament of Dr. Mabuse', 'The 39 Steps', 'Following']


In [None]:
from sklearn.cluster import KMeans

# Step 1: encode every movie's Genre string as a TF-IDF row vector
tfidf = TfidfVectorizer(stop_words='english')
matrix = tfidf.fit_transform(df['Genre'])

# Step 2: fit KMeans on those vectors and record each movie's cluster label
num_clusters = 10
kmeans = KMeans(random_state=42, n_clusters=num_clusters).fit(matrix)
df['Cluster'] = kmeans.labels_

# Step 3: Define a function to recommend movies from the same cluster
def recommend_movies_clustering(movie_name, num_recs=5):
    """Return titles that share ``movie_name``'s KMeans cluster.

    Reads the module-level ``df`` (its 'Cluster' and 'Movie Name' columns) and
    ``movie_indx``. Results are the first ``num_recs`` cluster members in
    ``df`` row order; they are not ranked by similarity to the query.

    Args:
        movie_name: Exact title to look up in ``movie_indx``.
        num_recs: Number of titles to take from the head of the cluster.

    Returns:
        A list of titles (every row carrying the query title is excluded), or
        a "not found" message string when the title is not in ``movie_indx``.
    """
    if movie_name not in movie_indx:
        return f"Movie '{movie_name}' not found in the dataset."

    indx = movie_indx[movie_name]
    # A repeated title makes the lookup return a Series; without collapsing it
    # to one row, `cluster_label` would be a Series and the equality test
    # against df['Cluster'] below would raise on mismatched labels.
    if isinstance(indx, pd.Series):
        indx = indx.iloc[0]

    cluster_label = df.loc[indx, 'Cluster']
    cluster_movies = df.loc[df['Cluster'] == cluster_label, 'Movie Name']

    return cluster_movies[cluster_movies != movie_name].head(num_recs).tolist()

# Example: up to five same-cluster titles for one movie, using the 10-cluster fit above.
searched_movie = 'Mirror Game'
recs = recommend_movies_clustering(searched_movie, num_recs=5)
print(f"Recommendations for '{searched_movie}': {recs}")


Recommendations for 'Mirror Game': ['Kill Bill: The Whole Bloody Affair', 'The Silence of the Lambs', 'The Departed', 'Parasite', 'Woman in the Dunes']


In [None]:
# Evaluation with a different number of clusters.
# Reuses `matrix` from the earlier cell and overwrites df['Cluster'].

num_clusters = 20  # change this value to compare other cluster counts
kmeans = KMeans(random_state=42, n_clusters=num_clusters).fit(matrix)
df['Cluster'] = kmeans.labels_

def recommend_movies_clustering(movie_name, num_recs=5):
    """Return titles that share ``movie_name``'s KMeans cluster.

    Reads the module-level ``df`` (its 'Cluster' and 'Movie Name' columns) and
    ``movie_indx``, so it always reflects whichever clustering was written to
    df['Cluster'] most recently. Results are the first ``num_recs`` cluster
    members in ``df`` row order; they are not ranked by similarity.

    Args:
        movie_name: Exact title to look up in ``movie_indx``.
        num_recs: Number of titles to take from the head of the cluster.

    Returns:
        A list of titles (every row carrying the query title is excluded), or
        a "not found" message string when the title is not in ``movie_indx``.
    """
    if movie_name not in movie_indx:
        return f"Movie '{movie_name}' not found in the dataset."

    indx = movie_indx[movie_name]
    # A repeated title makes the lookup return a Series; collapse it to the
    # first row so `cluster_label` is a scalar and the comparison below works.
    if isinstance(indx, pd.Series):
        indx = indx.iloc[0]

    cluster_label = df.loc[indx, 'Cluster']
    cluster_movies = df.loc[df['Cluster'] == cluster_label, 'Movie Name']

    return cluster_movies[cluster_movies != movie_name].head(num_recs).tolist()

# Same query title as the earlier cells, now against the 20-cluster labelling.
searched_movie = 'Mirror Game'
recs = recommend_movies_clustering(searched_movie, num_recs=5)
print(f"Recommendations for '{searched_movie}': {recs}")

Recommendations for 'Mirror Game': ['Harakiri', 'Sita Ramam', 'Once Upon a Time in the West', 'Rear Window', 'Oldboy']


In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import numpy as np

# Step 1: Process Text Features
# Vectorize the 'Genre' column into a dense TF-IDF array.
# Note: this rebinds the module-level `tfidf`; `matrix` from the earlier cells is not recomputed.
tfidf = TfidfVectorizer(stop_words='english', max_features=100)  # Limit features to manage dimensionality
genre_vectors = tfidf.fit_transform(df['Genre']).toarray()

# Step 2: Process Other Features
# Example: Label encode 'Year' (or other categorical columns if applicable)
# NOTE(review): the df.info() output earlier in this notebook lists no 'Year'
# column, so this branch is presumably skipped for this dataset -- confirm.
if 'Year' in df.columns:
    df['Year'] = df['Year'].fillna(-1)  # Replace NaN with -1 for missing values
    le = LabelEncoder()
    df['Year_Encoded'] = le.fit_transform(df['Year'])

# Step 3: Combine Features
# X currently holds only the genre vectors (hstack over a one-element list).
X = np.hstack([genre_vectors])  # Add other features if required, e.g., df['Year_Encoded'].values[:, None]
# Target: each movie's mean cosine similarity to all movies.
# NOTE(review): y is computed from the same genre_vectors that make up X, so the
# model is fitted to a function of its own inputs -- confirm this target is intended.
y = cosine_similarity(genre_vectors).mean(axis=1)  # Example target: Mean similarity scores

# 80/20 split with a fixed seed.
# NOTE(review): X_test / y_test are not referenced again in the cells visible here.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 4: Train the Model
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)

# Step 5: Recommend Movies
def recommend_movies_regression(movie_name, num_recs=5):
    """Return the titles with the highest model-predicted score.

    Reads the module-level ``df``, ``X``, ``model`` and ``movie_indx``.

    NOTE(review): the ranking comes from ``model.predict(X)`` over every row
    and does not depend on ``movie_name``; the argument is only used for the
    existence check and to drop rows with that title from the result. Confirm
    whether a query-dependent score was intended.

    Side effect: (re)writes the ``'Predicted_Score'`` column of ``df`` on every
    call that passes the existence check.

    Args:
        movie_name: Exact title to look up in ``movie_indx``.
        num_recs: Number of titles to take from the top of the ranking.

    Returns:
        A list of titles sorted by descending predicted score, or a
        "not found" message string when the title is not in ``movie_indx``.
    """
    if movie_name not in movie_indx:
        return f"Movie '{movie_name}' not found in the dataset."

    # The original also built `indx` / `movie_vector` for the query movie but
    # never used them; they are dropped here without changing the result.
    df['Predicted_Score'] = model.predict(X)

    # Rank every movie by predicted score, then remove the query title.
    ranked_titles = df.sort_values(by='Predicted_Score', ascending=False)['Movie Name']
    return ranked_titles[ranked_titles != movie_name].head(num_recs).tolist()

# Example usage: print the five highest-scored titles, excluding the searched title.
searched_movie = 'Mirror Game'
recs = recommend_movies_regression(searched_movie, num_recs=5)
print(f"Recommendations for '{searched_movie}': {recs}")


Recommendations for 'Mirror Game': ['Monica, O My Darling', 'Suicide Kings', "Adam's Apples", 'Man Bites Dog', 'Blindspotting']


In [None]:
from sklearn.cluster import KMeans

# Step 1: encode every movie's Genre string as a TF-IDF row vector
# (this rebinds `tfidf` and `matrix`, which the regression cell had changed/left stale)
tfidf = TfidfVectorizer(stop_words='english')
matrix = tfidf.fit_transform(df['Genre'])

# Step 2: refit KMeans with 10 clusters, overwriting the 20-cluster labels in df['Cluster']
num_clusters = 10
kmeans = KMeans(random_state=42, n_clusters=num_clusters).fit(matrix)
df['Cluster'] = kmeans.labels_

# Step 3: Define a function to recommend movies from the same cluster
def recommend_movies_clustering(movie_name, num_recs=5):
    """Return titles that share ``movie_name``'s KMeans cluster.

    Reads the module-level ``df`` (its 'Cluster' and 'Movie Name' columns) and
    ``movie_indx``. Results are the first ``num_recs`` cluster members in
    ``df`` row order; they are not ranked by similarity to the query.

    Args:
        movie_name: Exact title to look up in ``movie_indx``.
        num_recs: Number of titles to take from the head of the cluster.

    Returns:
        A list of titles (every row carrying the query title is excluded), or
        a "not found" message string when the title is not in ``movie_indx``.
    """
    if movie_name not in movie_indx:
        return f"Movie '{movie_name}' not found in the dataset."

    indx = movie_indx[movie_name]
    # A repeated title makes the lookup return a Series; collapse it to the
    # first row so `cluster_label` is a scalar and the comparison below works.
    if isinstance(indx, pd.Series):
        indx = indx.iloc[0]

    cluster_label = df.loc[indx, 'Cluster']
    cluster_movies = df.loc[df['Cluster'] == cluster_label, 'Movie Name']

    return cluster_movies[cluster_movies != movie_name].head(num_recs).tolist()

# Example: a different query title, against the 10-cluster labelling fitted in this cell.
searched_movie = 'Inception'
recs = recommend_movies_clustering(searched_movie, num_recs=5)
print(f"Recommendations for '{searched_movie}': {recs}")


Recommendations for 'Inception': ['Interstellar', 'The Matrix', 'Terminator 2: Judgment Day', 'Back to the Future', 'The Prestige']


In [None]:
# Load a second CSV into `df_dirty` (presumably the un-preprocessed source data,
# going by the variable name -- confirm).
# NOTE(review): `df_dirty` is not referenced again in the cells visible here.
df_dirty = pd.read_csv('/content/drive/MyDrive/Personalized Movie Recommender Files/Top_5000_Movies_IMDb.csv')

In [None]:
# Summarise `df` after the modelling cells have added their columns.
# NOTE(review): this inspects `df`, not the `df_dirty` frame loaded in the
# previous cell -- confirm which one was meant.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4999 entries, 0 to 4998
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   ID               4999 non-null   int64  
 1   Movie Name       4999 non-null   object 
 2   Rating           4999 non-null   float64
 3   Runtime          4999 non-null   float64
 4   Genre            4999 non-null   object 
 5   Metascore        3786 non-null   float64
 6   Plot             4999 non-null   object 
 7   Directors        4999 non-null   object 
 8   Stars            4999 non-null   object 
 9   Votes            4999 non-null   float64
 10  Gross            4999 non-null   float64
 11  Link             4999 non-null   object 
 12  Cleaned_Plot     4999 non-null   object 
 13  Cluster          4999 non-null   int32  
 14  Predicted_Score  4999 non-null   float64
dtypes: float64(6), int32(1), int64(1), object(7)
memory usage: 566.4+ KB


In [None]:
# Per-column count of missing values in `df`.
# NOTE(review): `df_dirty` was loaded just above but is not the frame checked here -- confirm.
df.isnull().sum()

Unnamed: 0,0
ID,0
Movie Name,0
Rating,0
Runtime,0
Genre,0
Metascore,1213
Plot,0
Directors,0
Stars,0
Votes,0
