# Rule-Based Movie Recommendation System

This notebook implements a **simple rule-based recommender** that scores movies based on:
- Genre similarity
- Director match
- Shared lead actors
- Similar release years

The user "likes" a small set of movies, and the system scores all other movies based on how similar they are to the liked ones.


In [33]:
import pandas as pd

In [34]:
# Load IMDb top 1000 dataset
movies = pd.read_csv("../Data/imdb_top_1000.csv")
movies.head()


Unnamed: 0,Poster_Link,Title,Year,Certificate,Runtime,Genre,IMDB_Rating,Overview,Meta_score,Director,Star1,Star2,Star3,Star4,No_of_Votes,Gross
0,https://m.media-amazon.com/images/M/MV5BMDFkYT...,The Shawshank Redemption,1994,A,142 min,Drama,9.3,Two imprisoned men bond over a number of years...,80.0,Frank Darabont,Tim Robbins,Morgan Freeman,Bob Gunton,William Sadler,2343110,28341469
1,https://m.media-amazon.com/images/M/MV5BM2MyNj...,The Godfather,1972,A,175 min,"Crime, Drama",9.2,An organized crime dynasty's aging patriarch t...,100.0,Francis Ford Coppola,Marlon Brando,Al Pacino,James Caan,Diane Keaton,1620367,134966411
2,https://m.media-amazon.com/images/M/MV5BMTMxNT...,The Dark Knight,2008,UA,152 min,"Action, Crime, Drama",9.0,When the menace known as the Joker wreaks havo...,84.0,Christopher Nolan,Christian Bale,Heath Ledger,Aaron Eckhart,Michael Caine,2303232,534858444
3,https://m.media-amazon.com/images/M/MV5BMWMwMG...,The Godfather: Part II,1974,A,202 min,"Crime, Drama",9.0,The early life and career of Vito Corleone in ...,90.0,Francis Ford Coppola,Al Pacino,Robert De Niro,Robert Duvall,Diane Keaton,1129952,57300000
4,https://m.media-amazon.com/images/M/MV5BMWU4N2...,12 Angry Men,1957,U,96 min,"Crime, Drama",9.0,A jury holdout attempts to prevent a miscarria...,96.0,Sidney Lumet,Henry Fonda,Lee J. Cobb,Martin Balsam,John Fiedler,689845,4360000


In [35]:
# Convert year and split genre
movies['Year'] = movies['Year'].astype(int)
genre_split = movies['Genre'].str.split(', ', expand=True)
genre_split.columns = ['Genre1', 'Genre2', 'Genre3']
movies = pd.concat([movies, genre_split], axis=1)

# Drop unused columns
movies.drop(['Poster_Link', 'IMDB_Rating', 'Certificate', 'Overview', 'Meta_score', 'No_of_Votes', 'Gross', 'Genre'], axis=1, inplace=True)

# Rename columns
movies.columns = ['Title', 'Year', 'Length', 'Director', 'Star1', 'Star2', 'Star3', 'Star4', 'Genre1', 'Genre2', 'Genre3']
movies.head()

Unnamed: 0,Title,Year,Length,Director,Star1,Star2,Star3,Star4,Genre1,Genre2,Genre3
0,The Shawshank Redemption,1994,142 min,Frank Darabont,Tim Robbins,Morgan Freeman,Bob Gunton,William Sadler,Drama,,
1,The Godfather,1972,175 min,Francis Ford Coppola,Marlon Brando,Al Pacino,James Caan,Diane Keaton,Crime,Drama,
2,The Dark Knight,2008,152 min,Christopher Nolan,Christian Bale,Heath Ledger,Aaron Eckhart,Michael Caine,Action,Crime,Drama
3,The Godfather: Part II,1974,202 min,Francis Ford Coppola,Al Pacino,Robert De Niro,Robert Duvall,Diane Keaton,Crime,Drama,
4,12 Angry Men,1957,96 min,Sidney Lumet,Henry Fonda,Lee J. Cobb,Martin Balsam,John Fiedler,Crime,Drama,


In [36]:
# Creating a dataframe of top 100 rated movies which will be used to select the users liked movies
movies_top_100 = movies.iloc[0:200, :]
movies_top_100 = movies_top_100.sample(frac=1).reset_index(drop=True)
movie_names = movies_top_100['Title'].tolist()

choices1 = movie_names[0:10]
choices2 = movie_names[10:20]
choices3 = movie_names[20:30]
choices4 = movie_names[30:40]
choices5 = movie_names[40:50]
choices6 = movie_names[50:60]
choices7 = movie_names[60:70]
choices8 = movie_names[70:80]
choices9 = movie_names[80:90]
choices10 = movie_names[90:100]


In [None]:
# Here we simulate 10 liked movies from top 100. Normally we'd get these via some form of input
liked_movies = movies.iloc[0:10]
movies = movies[~movies['Title'].isin(liked_movies['Title'])]

In [38]:
def compare_years(y1, y2):
    if abs(y1 - y2) < 5:
        return 4
    elif abs(y1 - y2) < 10:
        return 2
    return 0

def compare_directors(d1, d2):
    return 6 if d1 == d2 else 0

def compare_genres(g1, g2, g3, liked_g1, liked_g2, liked_g3):
    return sum(g in [liked_g1, liked_g2, liked_g3] for g in [g1, g2, g3]) * 3

def compare_actors(a1, a2, a3, a4, liked_actors):
    return sum(a in liked_actors for a in [a1, a2, a3, a4]) * 2


In [39]:
movies['Score'] = 0

for idx, row in movies.iterrows():
    year_score = liked_movies['Year'].apply(lambda y: compare_years(row['Year'], y)).sum()

    director_score = liked_movies['Director'].apply(lambda d: compare_directors(row['Director'], d)).sum()

    genre_score = liked_movies.apply(
        lambda r: compare_genres(row['Genre1'], row['Genre2'], row['Genre3'], r['Genre1'], r['Genre2'], r['Genre3']),
        axis=1
    ).sum()

    liked_actors = liked_movies[['Star1', 'Star2', 'Star3', 'Star4']].values.flatten()

    actor_score = compare_actors(row['Star1'], row['Star2'], row['Star3'], row['Star4'], liked_actors)

    total_score = year_score + director_score + genre_score + actor_score
    
    movies.at[idx, 'Score'] = total_score


In [40]:
# Show top 10 recommended movies
top_recommendations = movies.sort_values(by='Score', ascending=False).head(10)
top_recommendations[['Title', 'Score']]


Unnamed: 0,Title,Score
973,The Godfather: Part III,90
941,25th Hour,83
40,American History X,81
656,Breaking the Waves,81
517,Boogie Nights,81
946,Y tu mamá también,81
785,The Magdalene Sisters,81
787,In America,81
788,I Am Sam,81
812,Hamlet,81
