Project 3 Part 1 (Core)


Business Problem
For this project, you have been hired to produce a MySQL database on Movies from a subset of IMDB's publicly available dataset. Ultimately, you will use this database to analyze what makes a movie successful and will provide recommendations to the stakeholder on how to make a successful movie.

In [30]:
# IMPORT THE PACKAGE

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
!pip install tmdbsimple
import os, json, math, time
from tqdm.notebook import tqdm_notebook



In [31]:
# Download the IMDB title-basics table (tab-separated, gzip-compressed)
basics = pd.read_csv(
    "https://datasets.imdbws.com/title.basics.tsv.gz",
    low_memory=False,
    sep="\t",
)

In [32]:
# Download the IMDB title-ratings table (tab-separated, gzip-compressed)
ratings = pd.read_csv(
    "https://datasets.imdbws.com/title.ratings.tsv.gz",
    low_memory=False,
    sep="\t",
)

In [33]:
# Download the IMDB title-akas table (tab-separated, gzip-compressed)
akas = pd.read_csv(
    "https://datasets.imdbws.com/title.akas.tsv.gz",
    low_memory=False,
    sep="\t",
)

In [34]:
# The raw files use the literal string \N as a placeholder; swap it for NaN
# in all three tables so pandas treats those cells as missing.
basics = basics.replace('\\N', np.nan)
ratings = ratings.replace('\\N', np.nan)
akas = akas.replace('\\N', np.nan)

In [35]:
# Keep only the rows whose titleType is exactly 'movie'
basics = basics[basics['titleType'].eq('movie')]

In [36]:
# Drop every row that is missing a runtime, a genre list, or a start year
basics = basics.dropna(
    how='any',
    subset=['runtimeMinutes', 'genres', 'startYear'],
)

In [37]:
# Cast the year to int, then keep the requested window: 2000 through 2022,
# both endpoints included.
basics['startYear'] = basics['startYear'].astype(int)
basics = basics[basics['startYear'].between(2000, 2022)]

In [38]:
# Flag rows whose genres string mentions "documentary" (case-insensitive)
# and keep everything that is NOT flagged.
is_documentary = basics['genres'].str.contains(
    'documentary',
    case=False,
)
basics = basics.loc[~is_documentary]

In [39]:
# Inspect the distinct genre combinations that remain after filtering
pd.unique(basics['genres'])

array(['Comedy,Fantasy,Romance', 'Drama', 'Drama,War',
       'Comedy,Horror,Sci-Fi', 'Comedy', 'Comedy,Drama,Fantasy',
       'Drama,Romance', 'Comedy,Mystery', 'Drama,Fantasy', 'Adventure',
       'Musical,Romance', 'Action,Adventure,Drama', 'Action',
       'Crime,Thriller', 'Comedy,Fantasy', 'Action,Crime,Drama',
       'Action,Thriller', 'Comedy,Drama,Romance', 'Drama,Music,Romance',
       'Comedy,Horror,Mystery', 'Crime,Drama,Thriller', 'Comedy,Drama',
       'Action,Adventure,Animation', 'Comedy,Romance', 'Drama,Thriller',
       'Comedy,Drama,Sci-Fi', 'Adventure,Family,Fantasy', 'Drama,History',
       'Drama,History,War', 'Adventure,Animation,Comedy',
       'Action,Adventure,Fantasy', 'Action,Drama,Sci-Fi',
       'Biography,Drama,Romance', 'Horror,Mystery,Thriller',
       'Comedy,Drama,Thriller', 'Animation,Family,Musical',
       'Drama,Mystery,Thriller', 'Action,Adventure,Thriller',
       'Action,Horror,Sci-Fi', 'Action,Adventure,Sci-Fi',
       'Action,Adventure,Comedy

In [40]:
# Keep only the alternate-title rows whose region is the United States
akas = akas[akas['region'].eq('US')]

In [41]:
# Boolean mask: True where a title's tconst appears among the US akas title ids
keepers = basics.tconst.isin(akas.titleId)

In [42]:
# Apply the mask so basics only holds titles that have a US entry in akas
basics = basics.loc[keepers]

In [43]:
# Make sure the output folder exists (no error if it is already there),
# then list its contents to confirm.
import os
os.makedirs("movies/", exist_ok=True)
os.listdir('movies/')

['.ipynb_checkpoints',
 'akas.csv.gz',
 'basics.csv.gz',
 'final2001.csv.gz',
 'final_tmdb_data_2000.csv.gz',
 'final_tmdb_data_2001.csv.gz',
 'ratings.csv.gz',
 'tmdb_api_results_2000.json',
 'tmdb_api_results_2001.json',
 'tmdb_results_revenue.csv.gz']

In [44]:
# Persist the three filtered tables as gzip-compressed CSVs, without the index
basics.to_csv("movies/basics.csv.gz", index=False, compression="gzip")
ratings.to_csv("movies/ratings.csv.gz", index=False, compression="gzip")
akas.to_csv("movies/akas.csv.gz", index=False, compression="gzip")

In [45]:
# Read the saved basics file back in and preview the first five rows
basics = pd.read_csv('movies/basics.csv.gz', low_memory=False)
basics.head(5)

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,,118,"Comedy,Fantasy,Romance"
1,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El Tango del Viudo y Su Espejo Deformante,0,2020,,70,Drama
2,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,,122,Drama
3,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005,,100,"Comedy,Horror,Sci-Fi"
4,tt0094859,movie,Chief Zabu,Chief Zabu,0,2016,,74,Comedy


In [46]:
# Load the TMDB credentials from a JSON file stored in the user's home
# directory (~/.secret/tmbd_api.json). Building the path from the home
# directory avoids baking one person's user name into the notebook, so the
# same cell works on any machine that keeps the file in that location.
import json
import os
secret_path = os.path.join(os.path.expanduser('~'), '.secret', 'tmbd_api.json')
with open(secret_path) as f:
    login = json.load(f)
# Display only the key names so the secret value never appears in the output
login.keys()

dict_keys(['api-key'])

In [47]:
# Hand the stored API key to tmdbsimple so later requests are authenticated
import tmdbsimple as tmbd
tmbd.API_KEY = login["api-key"]

In [48]:
# Sample request: build the movie object for TMDB id 601, fetch its info
# dictionary, and display it.
movie = tmbd.Movies(601)
info = movie.info()
info

{'adult': False,
 'backdrop_path': '/mXLVA0YL6tcXi6SJSuAh9ONXFj5.jpg',
 'belongs_to_collection': None,
 'budget': 10500000,
 'genres': [{'id': 878, 'name': 'Science Fiction'},
  {'id': 12, 'name': 'Adventure'},
  {'id': 10751, 'name': 'Family'},
  {'id': 14, 'name': 'Fantasy'}],
 'homepage': 'http://www.et20.com/',
 'id': 601,
 'imdb_id': 'tt0083866',
 'original_language': 'en',
 'original_title': 'E.T. the Extra-Terrestrial',
 'overview': 'After a gentle alien becomes stranded on Earth, the being is discovered and befriended by a young boy named Elliott. Bringing the extraterrestrial into his suburban California house, Elliott introduces E.T., as the alien is dubbed, to his brother and his little sister, Gertie, and the children decide to keep its existence a secret. Soon, however, E.T. falls ill, resulting in government intervention and a dire situation for both Elliott and the alien.',
 'popularity': 42.595,
 'poster_path': '/an0nD6uq6byfxXCfk6lQBzdL2J1.jpg',
 'production_companies'

In [49]:
# Folder that holds every file written by this notebook; create it when
# missing and list what is already inside.
import os
folder = "movies/"
os.makedirs(folder, exist_ok=True)
os.listdir(folder)

['.ipynb_checkpoints',
 'akas.csv.gz',
 'basics.csv.gz',
 'final2001.csv.gz',
 'final_tmdb_data_2000.csv.gz',
 'final_tmdb_data_2001.csv.gz',
 'ratings.csv.gz',
 'tmdb_api_results_2000.json',
 'tmdb_api_results_2001.json',
 'tmdb_results_revenue.csv.gz']

In [50]:
# Years whose movies will be requested from the API: 2000 and 2001
YEARS_TO_GET = list(range(2000, 2002))
YEARS_TO_GET


[2000, 2001]

In [51]:
def funciton_movie(movie_id):
    """Fetch a movie's info dict and attach its US certification.

    Parameters
    ----------
    movie_id
        Identifier passed straight to ``tmbd.Movies``.

    Returns
    -------
    dict
        The dictionary returned by ``movie.info()``. When the release data
        contains at least one entry whose ``iso_3166_1`` is ``'US'``, the
        dict also carries a ``'certification'`` key; otherwise that key is
        not added.
    """
    # Get the movie object for the current id
    movie = tmbd.Movies(movie_id)
    # Save the .info and .releases dictionaries
    info = movie.info()
    releases = movie.releases()
    # Loop through countries in releases
    for c in releases['countries']:
        # Only US entries matter here
        if c['iso_3166_1'] == 'US':
            # The list can hold several US entries. A blank certification must
            # not wipe out a real rating found on an earlier entry, so a blank
            # value is stored only when nothing has been recorded yet.
            if c['certification'] or 'certification' not in info:
                info['certification'] = c['certification']
    return info


In [52]:
# Try the helper on a single id (601) and display the dictionary it returns
funciton_movie(601)

{'adult': False,
 'backdrop_path': '/mXLVA0YL6tcXi6SJSuAh9ONXFj5.jpg',
 'belongs_to_collection': None,
 'budget': 10500000,
 'genres': [{'id': 878, 'name': 'Science Fiction'},
  {'id': 12, 'name': 'Adventure'},
  {'id': 10751, 'name': 'Family'},
  {'id': 14, 'name': 'Fantasy'}],
 'homepage': 'http://www.et20.com/',
 'id': 601,
 'imdb_id': 'tt0083866',
 'original_language': 'en',
 'original_title': 'E.T. the Extra-Terrestrial',
 'overview': 'After a gentle alien becomes stranded on Earth, the being is discovered and befriended by a young boy named Elliott. Bringing the extraterrestrial into his suburban California house, Elliott introduces E.T., as the alien is dubbed, to his brother and his little sister, Gertie, and the children decide to keep its existence a secret. Soon, however, E.T. falls ill, resulting in government intervention and a dire situation for both Elliott and the alien.',
 'popularity': 42.595,
 'poster_path': '/an0nD6uq6byfxXCfk6lQBzdL2J1.jpg',
 'production_companies'

In [53]:
# Use a function to append new results to the existing JSON file

def write_json(new_data, filename):
    """Append new results to an existing JSON file and save it in place.

    The file is opened for reading and writing, its content is parsed, the
    new data is merged in, and the merged result is written back from the
    start of the file.

    Parameters
    ----------
    new_data
        A list of records (each one is added individually) or a single
        record (added as one element).
    filename
        Path to an existing JSON file. The file is expected to hold a JSON
        list; other top-level values have no ``append`` method and will fail.
    """
    with open(filename, "r+") as file:
        # First load the existing data
        file_data = json.load(file)
        # A list of new records is merged element by element; anything else
        # is added as a single element.
        if isinstance(new_data, list) and isinstance(file_data, list):
            file_data.extend(new_data)
        else:
            file_data.append(new_data)
        # Rewind so the merged content overwrites the old content
        file.seek(0)
        # Convert back to JSON
        json.dump(file_data, file)
        # Cut off anything left beyond the freshly written content so stale
        # bytes can never trail the JSON document.
        file.truncate()

In [54]:
# Reload the basics table saved in project part 1 and preview its first rows
basics = pd.read_csv('movies/basics.csv.gz')
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,,118,"Comedy,Fantasy,Romance"
1,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El Tango del Viudo y Su Espejo Deformante,0,2020,,70,Drama
2,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,,122,Drama
3,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005,,100,"Comedy,Horror,Sci-Fi"
4,tt0094859,movie,Chief Zabu,Chief Zabu,0,2016,,74,Comedy


In [62]:
# Failed lookups are collected here as [movie_id, exception] pairs so they can
# be inspected afterwards instead of disappearing silently.
errors = []

# OUTER loop: one pass per year in YEARS_TO_GET
for YEAR in tqdm_notebook(YEARS_TO_GET, desc='YEARS', position=0):
    # Define the JSON file to store results for the current year
    JSON_FILE = f'{folder}tmdb_api_results_{YEAR}.json'
    # Check if the file exists
    file_exists = os.path.isfile(JSON_FILE)
    # If it does not exist: create it
    if not file_exists:
        # Seed the new file with a one-element list holding a placeholder
        # "imdb_id" so it can be read back as a frame with that column.
        # NOTE: the placeholder row (imdb_id == 0) stays in the file and in the
        # exported CSV below.
        with open(JSON_FILE, 'w') as f:
            json.dump([{'imdb_id': 0}], f)

    # Filter the IMDB title_basics table on the current year
    df = basics.loc[basics['startYear'] == YEAR].copy()

    # Save the movie ids of that year
    movie_ids = df['tconst'].copy()

    # Create a dataframe from the JSON file
    previous_df = pd.read_json(JSON_FILE)

    # Filter out movie ids that already exist in the JSON file, so a re-run
    # only requests what is still missing
    movie_ids_to_get = movie_ids[~movie_ids.isin(previous_df['imdb_id'])]

    # INNER loop: one request per movie id that is still missing
    for movie_id in tqdm_notebook(movie_ids_to_get,
                                  desc=f'Movies from {YEAR}',
                                  position=1,
                                  leave=True):
        # Attempt to retrieve the data for the movie id
        try:
            temp = funciton_movie(movie_id)
            # Append/extend results to the JSON file
            write_json(temp, JSON_FILE)
            # Pause 20 ms to prevent overwhelming the server
            time.sleep(0.02)

        # If it fails, remember which id failed and why, then move on to the
        # next id. Nothing is written to the JSON file for a failed id.
        except Exception as e:
            errors.append([movie_id, e])

    # Save the year's results as a csv.gz file
    final_year_df = pd.read_json(JSON_FILE)
    final_year_df.to_csv(f"{folder}final_tmdb_data_{YEAR}.csv.gz",
                         compression="gzip", index=False)

# Report how many requests failed across all years
print(f"- Total errors: {len(errors)}")

YEARS:   0%|          | 0/2 [00:00<?, ?it/s]

Movies from 2000:   0%|          | 0/205 [00:00<?, ?it/s]

Movies from 2001:   0%|          | 0/241 [00:00<?, ?it/s]