In [1]:
import pandas as pd

movie_csv = "tmdb_5000_movies.csv"
credits_csv = "tmdb_5000_credits.csv"
movies = pd.read_csv(movie_csv)
credits = pd.read_csv(credits_csv)

#Loaded and read datasets


FileNotFoundError: [Errno 2] No such file or directory: 'tmdb_5000_movies.csv'

In [None]:
#Merge the datasets
movies = movies.merge(credits, left_on='id', right_on='movie_id')

print(movies.columns)

Index(['budget', 'genres', 'homepage', 'id', 'keywords', 'original_language',
       'original_title', 'overview', 'popularity', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title_x', 'vote_average',
       'vote_count', 'movie_id_x', 'title_y', 'cast_x', 'crew_x', 'movie_id_y',
       'title', 'cast_y', 'crew_y'],
      dtype='object')


In [None]:
movies = movies[['id', 'title_x', 'overview', 'genres', 'keywords', 'cast_y', 'crew_y']]
movies.rename(columns={
    'title_x': 'title',
    'cast_y': 'cast',
    'crew_y': 'crew',
    'id': 'movie_id'
}, inplace=True)

movies.head()


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies.rename(columns={


Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...","[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...","[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,206647,Spectre,A cryptic message from Bond’s past sends him o...,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...","[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...","[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...","[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,49529,John Carter,"John Carter is a war-weary, former military ca...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 818, ""name"": ""based on novel""}, {""id"":...","[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


In [None]:
import ast  # Abstract Syntax Trees – used to safely evaluate stringified Python objects (like list of dicts)

# Function to extract the 'name' field from stringified list of dictionaries (e.g., genres, keywords, cast)
def convert(obj):
    try:
        L = []
        for i in ast.literal_eval(obj):  # Convert string to list of dicts
            L.append(i['name'])          # Extract only the 'name' value
        return L
    except:
        return []  # Return empty list if there's an error (e.g., malformed JSON)

# Function to extract the director's name from the crew list
def get_director(obj):
    try:
        L = []
        for i in ast.literal_eval(obj):  # Again, convert string to list of dicts
            if i['job'] == 'Director':   # Check if the job is Director
                L.append(i['name'])
        return L
    except:
        return []

# Apply the conversion to 'genres' column
movies['genres'] = movies['genres'].apply(convert)

# Apply the conversion to 'keywords' column
movies['keywords'] = movies['keywords'].apply(convert)

# Apply conversion to 'cast' and limit to top 3 actors
movies['cast'] = movies['cast'].apply(lambda x: convert(x)[:3])  # Keeps top 3 important cast members

# Apply the function to extract director from crew
movies['crew'] = movies['crew'].apply(get_director)


In [None]:
#Create tags column
# Combine genres, keywords, cast, crew, and overview into a single string
def collapse(lst):
    return " ".join(lst)

# Fill missing overviews with empty strings
movies['overview'] = movies['overview'].fillna("")

# Convert the lists into space-separated strings
movies['genres'] = movies['genres'].apply(collapse)
movies['keywords'] = movies['keywords'].apply(collapse)
movies['cast'] = movies['cast'].apply(collapse)
movies['crew'] = movies['crew'].apply(collapse)

# Combine everything into one column: 'tags'
movies['tags'] = movies['overview'] + " " + movies['genres'] + " " + movies['keywords'] + " " + movies['cast'] + " " + movies['crew']

# Keep only relevant columns for modeling
new_df = movies[['movie_id', 'title', 'tags']]
new_df.head()


Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha..."
2,206647,Spectre,A cryptic message from Bond’s past sends him o...
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...
4,49529,John Carter,"John Carter is a war-weary, former military ca..."


In [None]:
#Preprocessing and Vectorization
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Convert all tags to lowercase and remove spaces in multi-word strings
new_df['tags'] = new_df['tags'].apply(lambda x: x.lower().replace(" ", ""))

# Initialize CountVectorizer to keep top 5000 words and remove English stop words
cv = CountVectorizer(max_features=5000, stop_words='english')

# Fit and transform the 'tags' column
vectors = cv.fit_transform(new_df['tags']).toarray()

# Compute cosine similarity between all movie vectors
similarity = cosine_similarity(vectors)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = new_df['tags'].apply(lambda x: x.lower().replace(" ", ""))


In [None]:
# Function to recommend similar movies
def recommend(movie):
    movie = movie.lower()

    # Check if the movie is in our dataset
    if movie not in new_df['title'].str.lower().values:
        return " Movie not found in the dataset."

    # Get index of the input movie
    index = new_df[new_df['title'].str.lower() == movie].index[0]

    # Get similarity scores for all movies
    distances = sorted(list(enumerate(similarity[index])), reverse=True, key=lambda x: x[1])

    # Print top 5 most similar movies (excluding the original one)
    print(f" Top 5 recommendations for '{new_df.iloc[index].title}':")
    for i in distances[1:6]:
        print(new_df.iloc[i[0]].title)


In [None]:
recommend("Inception")


🎬 Top 5 recommendations for 'Inception':
Avatar
Pirates of the Caribbean: At World's End
Spectre
The Dark Knight Rises
John Carter


In [None]:
#Week 3-4 Demographic filtering

# Check actual column names in case they changed
print(movies.columns)


Index(['movie_id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew',
       'tags'],
      dtype='object')


In [None]:
# Load original movies CSV to get vote data
votes = pd.read_csv("tmdb_5000_movies.csv")

# Merge vote info back into your current 'movies' DataFrame using 'movie_id'
movies = movies.merge(votes[['id', 'vote_count', 'vote_average']], left_on='movie_id', right_on='id')


In [None]:
# Calculate C (mean rating) and m (vote count threshold)
C = movies['vote_average'].mean()
m = movies['vote_count'].quantile(0.90)

# Filter movies with enough votes
qualified = movies[movies['vote_count'] >= m].copy()

# Weighted rating formula
def weighted_rating(x, m=m, C=C):
    v = x['vote_count']
    R = x['vote_average']
    return (v / (v + m) * R) + (m / (m + v) * C)

# Apply the scoring function
qualified['score'] = qualified.apply(weighted_rating, axis=1)

# Top 10 most popular movies
qualified.sort_values('score', ascending=False)[['title', 'vote_count', 'vote_average', 'score']].head(10)


Unnamed: 0,title,vote_count,vote_average,score
1881,The Shawshank Redemption,8205,8.5,8.059258
662,Fight Club,9413,8.3,7.939256
65,The Dark Knight,12002,8.2,7.92002
3232,Pulp Fiction,8428,8.3,7.904645
96,Inception,13752,8.1,7.863239
3337,The Godfather,5893,8.4,7.851236
95,Interstellar,10867,8.1,7.809479
809,Forrest Gump,7927,8.2,7.803188
329,The Lord of the Rings: The Return of the King,8064,8.1,7.727243
1990,The Empire Strikes Back,5879,8.2,7.697884
