<a href="https://colab.research.google.com/github/santiagob/popcorn-predictors/blob/data_analysis_sb/DataAnalysis_santiago_bernheim.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Popcorn predictors

## Dataset analysis

This notebook is a first look at the [TMDB Movie Dataset](https://www.kaggle.com/datasets/tmdb/tmdb-movie-metadata). The goal is to understand the structure of the dataset and gather some insights before building an ML model that predicts which movies a user will like based on the content they have previously watched.

In [47]:
import json

import pandas as pd

# Both TMDB exports (tmdb_5000_movies.csv and tmdb_5000_credits.csv) are expected
# to sit directly in the workspace, so plain relative file names are used.
movie_csv, credits_csv = "tmdb_5000_movies.csv", "tmdb_5000_credits.csv"
movies, credits = (pd.read_csv(csv_path) for csv_path in (movie_csv, credits_csv))

In [48]:
# Look at the structure of both dataframes: column names first, then (rows, columns).
print(movies.columns, credits.columns, sep="\n")

print(movies.shape, credits.shape, sep="\n")

Index(['budget', 'genres', 'homepage', 'id', 'keywords', 'original_language',
       'original_title', 'overview', 'popularity', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'vote_average',
       'vote_count'],
      dtype='object')
Index(['movie_id', 'title', 'cast', 'crew'], dtype='object')
(4803, 20)
(4803, 4)


In [49]:
# Merge the two datasets so that we only have to work with one dataframe.
# Both dataframes are expected to carry the same movie id (and title), so the
# credits are joined on movies.id == credits.movie_id; clashing column names
# coming from credits get the '_on_credits' suffix.
rows_before_merge = movies.shape[0]
movies = movies.merge(credits, left_on='id', right_on='movie_id', suffixes=('', '_on_credits'))
print(movies.columns)
print(movies.shape)

# Use `and`, not `&`: the bitwise operator binds tighter than `==`, which turned the
# original check into the chained comparison `a == (b & a) == c`. That passes for
# mismatching row counts such as a=4, b=5, c=4 because 5 & 4 == 4.
if movies.shape[0] == credits.shape[0] and movies.shape[0] == rows_before_merge:
  print("Merged correctly, no movies dropped")
else:
  # Make a lossy or duplicating merge visible instead of silently printing nothing.
  print(f"Row count changed during merge: movies before={rows_before_merge}, "
        f"credits={credits.shape[0]}, merged={movies.shape[0]}")

Index(['budget', 'genres', 'homepage', 'id', 'keywords', 'original_language',
       'original_title', 'overview', 'popularity', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'vote_average',
       'vote_count', 'movie_id', 'title_on_credits', 'cast', 'crew'],
      dtype='object')
(4803, 24)
Merged correctly, no movies dropped


In [55]:
# Let's drop the duplicated columns so that only the relevant ones remain.
# Each pair is (column to keep, candidate duplicate); the candidate is removed
# only when it holds exactly the same values as the column we keep.
duplicate_candidates = [
    ('id', 'movie_id'),
    ('title', 'title_on_credits'),
    ('title', 'original_title'),
]

for kept, candidate in duplicate_candidates:
    # Check for the columns explicitly instead of hiding every possible error behind
    # a bare `except: pass`. A missing column simply means it was already dropped
    # (e.g. this cell ran before), so there is nothing left to do for that pair.
    if kept in movies.columns and candidate in movies.columns:
        if movies[kept].equals(movies[candidate]):
            movies.drop(columns=candidate, inplace=True)




In [56]:
# Preview the first five rows of the merged dataframe.
movies.head(n=5)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,...,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,cast,crew
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...",...,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...",...,961000000,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,245000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.sonypictures.com/movies/spectre/,206647,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...",...,880674609,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,250000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...",http://www.thedarkknightrises.com/,49026,"[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...",en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,"[{""name"": ""Legendary Pictures"", ""id"": 923}, {""...",...,1084939099,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106,"[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,260000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://movies.disney.com/john-carter,49529,"[{""id"": 818, ""name"": ""based on novel""}, {""id"":...",en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}]",...,284139100,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124,"[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


In [None]:
# Now let's drop the columns that do not seem relevant to whether a person likes a
# movie: homepage, status, tagline and overview.
# errors='ignore' keeps this cell re-runnable: once the columns are gone, running it
# again is a no-op instead of raising a KeyError.
movies.drop(columns=['homepage', 'status', 'tagline', 'overview'], errors='ignore', inplace=True)


In [None]:
import ast  # Abstract Syntax Trees – used to safely evaluate stringified Python objects (like list of dicts)

# Function to extract the 'name' field from a stringified list of dictionaries (e.g., genres, keywords, cast)
def convert(obj):
    """Return the 'name' value of every entry in a serialized list of dicts.

    The value is parsed as JSON first, so JSON-only tokens such as ``null``,
    ``true`` and ``false`` are accepted; ``ast.literal_eval`` is kept as a
    fallback for text written as a Python literal (e.g. single-quoted strings).

    Args:
        obj: Serialized list of dicts, each expected to carry a 'name' key.

    Returns:
        list: The names in their original order, or an empty list when the
        value cannot be parsed or an entry has no 'name' key.
    """
    try:
        try:
            entries = json.loads(obj)
        except (TypeError, ValueError):
            # Not a string, or not valid JSON: try the Python-literal syntax instead.
            entries = ast.literal_eval(obj)
        return [entry['name'] for entry in entries]
    except (ValueError, SyntaxError, TypeError, KeyError, RecursionError, MemoryError):
        # Best effort: unparsable or unexpectedly shaped input yields no names.
        # Unlike a bare `except:`, this lets KeyboardInterrupt/SystemExit through.
        return []

# Function to extract the director's name from the crew list
def get_director(obj):
    """Return the names of all crew entries whose 'job' is 'Director'.

    The value is parsed as JSON first, so JSON-only tokens such as ``null``,
    ``true`` and ``false`` are accepted; ``ast.literal_eval`` is kept as a
    fallback for text written as a Python literal (e.g. single-quoted strings).

    Args:
        obj: Serialized list of dicts, each expected to carry 'job' and
            'name' keys.

    Returns:
        list: Director names in their original order, or an empty list when
        the value cannot be parsed or an entry lacks a required key.
    """
    try:
        try:
            crew = json.loads(obj)
        except (TypeError, ValueError):
            # Not a string, or not valid JSON: try the Python-literal syntax instead.
            crew = ast.literal_eval(obj)
        return [member['name'] for member in crew if member['job'] == 'Director']
    except (ValueError, SyntaxError, TypeError, KeyError, RecursionError, MemoryError):
        # Best effort: unparsable or unexpectedly shaped input yields no directors.
        # Unlike a bare `except:`, this lets KeyboardInterrupt/SystemExit through.
        return []

# Turn the serialized columns into plain Python lists of names.
# The parsers return [] for anything they cannot parse, so feeding them values that
# were already converted (i.e. re-running this cell) would wipe the columns.
# Values that are already lists are therefore passed through untouched.
def _parse_once(value, parser):
    """Apply `parser` to a raw value; return an already-parsed list unchanged."""
    return value if isinstance(value, list) else parser(value)

# Apply the conversion to 'genres' column
movies['genres'] = movies['genres'].apply(lambda raw: _parse_once(raw, convert))

# Apply the conversion to 'keywords' column
movies['keywords'] = movies['keywords'].apply(lambda raw: _parse_once(raw, convert))

# Apply conversion to 'cast' and keep only the first 3 listed cast members
movies['cast'] = movies['cast'].apply(lambda raw: _parse_once(raw, convert)[:3])

# Apply the function to extract the director(s) from crew
movies['crew'] = movies['crew'].apply(lambda raw: _parse_once(raw, get_director))
