In [3]:
import numpy as np 
import os
import pandas as pd
from ast import literal_eval

def get_csv(load_filename, data_dir=None):
    if data_dir is None:
        data_dir = os.getcwd()
    for dirname, _, filenames in os.walk(data_dir):
        filepath = None
        if load_filename in filenames:
            filepath = os.path.join(dirname, load_filename)
            return pd.read_csv(filepath)
    print(f"No such file {load_filename}")
    return None

In [4]:
os.mkdir("movies_data")

In [25]:
movies = get_csv("movies_metadata.csv")
movies = movies[movies.id.apply(
    lambda x: x.isnumeric()
)] # keeps only numeric ids
movies.dropna(subset=["original_language",
                      "original_title",
                      "overview",
                      "release_date",
                      "revenue",
                      "runtime",
                      "title",
                      "vote_average",
                      "vote_count"], 
              inplace=True) # drop na values from features with only a few nas (<0.01%)
# remove odd few duplicate ids
movies.drop_duplicates(subset=["id"], inplace=True)
movies.drop(["adult", 
             "belongs_to_collection", 
             "homepage", 
             "imdb_id", 
             "popularity",  
             "poster_path",  
             "status", 
             "production_companies", 
             "production_countries",  
             "spoken_languages",  
             "video",
             "vote_average",
             "vote_count"], 
            axis=1, 
            inplace=True) # drop boring/nonsense features that complicate process
features = ['id', 'title', 'original_title', 'runtime', 'budget', 
            'genres', 'original_language',  'overview', 'release_date', 
            'revenue', 'tagline']
movies[features].to_csv("movies_data/movies.csv", index=False)

In [21]:
movies[features].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 44395 entries, 0 to 45465
Data columns (total 13 columns):
id                   44395 non-null object
title                44395 non-null object
original_title       44395 non-null object
runtime              44395 non-null float64
budget               44395 non-null object
genres               44395 non-null object
original_language    44395 non-null object
overview             44395 non-null object
release_date         44395 non-null object
revenue              44395 non-null float64
tagline              20379 non-null object
vote_average         44395 non-null float64
vote_count           44395 non-null float64
dtypes: float64(4), object(9)
memory usage: 4.7+ MB


In [10]:
movies.columns

Index(['budget', 'genres', 'id', 'original_language', 'original_title',
       'overview', 'release_date', 'revenue', 'runtime', 'tagline', 'title',
       'vote_average', 'vote_count'],
      dtype='object')

In [16]:
ratings = get_csv("ratings.csv", "3405_6663_bundle_archive")       
links = get_csv("links.csv", "3405_6663_bundle_archive")

# ratings is only table to use movieId - all other tables use tmbdid / imdb
# liks table required to match movieId to tmdb / imdb ids
# merge tmdbid into ratings table so links can be dropped

ratings = ratings.merge(links[["movieId","tmdbId"]],
                        on = "movieId",
                        how = "left")
ratings.dropna(subset=["tmdbId"], inplace=True)
ratings = (ratings.drop("movieId", axis=1)
           .rename(columns={"tmdbId":"movie_id", 
                            "userId":"user_id"})
)
movie_ids = movies.id.values
ratings = ratings[ratings.movie_id.isin(movie_ids)]
# 125 users submit > 1 reviews for same movie - get rid of these
ratings.drop_duplicates(subset=["user_id", "movie_id"],
                        inplace=True)
ratings.to_csv("movies_data/ratings.csv", index=False)

In [22]:
ratings.rating.value_counts()

4.0    6987136
3.0    5250214
5.0    3802850
3.5    3110389
4.5    2164785
2.0    1760011
2.5    1253195
1.0     841787
0.5     403511
1.5     402876
Name: rating, dtype: int64

In [18]:
credits = get_csv("credits.csv")
credits = (credits[["id", "cast"]]
           .rename(columns={"id":"movie_id", 
                            "cast":"credits"}))
credits.to_csv("movies_data/credits.csv", index=False)