In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
#for dirname, _, filenames in os.walk('/kaggle/input'):
#    for filename in filenames:
#        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
#main media name dataset
tbasics_file = "/kaggle/input/imdb-basic-dataset/title.basics.tsv/data.tsv"
# TV show link table
eposode_file = "/kaggle/input/imdb-basic-dataset/title.episode.tsv/data.tsv"
# Big file, a linking dataset for film workers/acters to videos.  also lists jobs, category
principals_file = "/kaggle/input/imdb-basic-dataset/title.principals.tsv/data.tsv"
# ratings
ratings_file = "/kaggle/input/imdb-basic-dataset/title.ratings.tsv/data.tsv"
# Actors, Workers, people in credits
nbasics_file = "/kaggle/input/imdb-basic-dataset/name.basics.tsv/data.tsv"
# very big file; alternative names for movies. 
#  some Japanese or Chinese moves have different titles in different regions
#  ie - Shogun Assassian (US) and Lone Wolf and Cub (JP)
akas_file = "/kaggle/input/imdb-basic-dataset/title.akas.tsv/data.tsv"
# link dataset for media to directors and writers
crew_file = "/kaggle/input/imdb-basic-dataset/title.crew.tsv/data.tsv"

# Load the main Media data set
this Dataset will contain all the videos that can be found on IMDB.  It is not limited to Movies and TV shows, you can also find fan films and podcasts too.  I enabled ```low_memory=False``` to suppress errors when loading the data.

In [3]:
basics = pd.read_csv(tbasics_file, sep='\t', low_memory=False)


# Load ratings dataset
First I load the ratings dataset and join it with the Media dataset (basics).  Now every Media that is listed has a rating and votes listed in each row.

In [4]:
ratings = pd.read_csv(ratings_file, sep='\t')

tbasics = basics.join(ratings.set_index('tconst'), on="tconst")

# Lets view the updated Dataframe
The data now has the average rating and number of votes

In [5]:
tbasics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,averageRating,numVotes
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short",5.7,1980.0
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short",5.8,265.0
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance",6.5,1835.0
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short",5.6,179.0
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short",6.2,2624.0


# What is the break down of Media Type
as you will see bellow, their are a few different enteries with tv as a prefix. tvPilot only has 1.

In [6]:
tbasics.groupby(['titleType']).titleType.count()

titleType
movie            648082
short            935682
tvEpisode       7546405
tvMiniSeries      48929
tvMovie          142038
tvPilot               1
tvSeries         244950
tvShort            9968
tvSpecial         41988
video            275322
videoGame         34883
Name: titleType, dtype: int64

# Prepair the Data for TV series

I use pandas to shape and transform my data for TV Shows into something a little more helpful. 2 new dataframes will need to be created, one for tvSeries and one for tvEpisode. 

- the episode dataset will link the 2 together
- column names of both sets will need to be renamed
- irrelivent columns will need to be dropped

In [7]:
tvSeries = tbasics.loc[(tbasics['titleType'] == "tvSeries")].rename(columns={"tconst":"parentTconst", "primaryTitle":"TVShow"})

In [8]:
tvEpisode = tbasics.loc[(tbasics['titleType'] == "tvEpisode")].rename(columns={"primaryTitle": "eposodeTitle", "originalTitle": "originalEposode", "startYear": "year", "runtimeMinutes":"minutes"})

In [9]:
eposodes = pd.read_csv(eposode_file, sep='\t').rename(columns={"seasonNumber": "S", "episodeNumber": "E"})

# Join the 3 DataFrames
I'm using a one liner to make one dataframe with all the relevant data for TV shows

In [10]:
TV = eposodes.join(tvSeries.drop([
        'isAdult','runtimeMinutes','genres','titleType','averageRating', 'numVotes'
    ], axis=1).set_index('parentTconst'), on="parentTconst").join(tvEpisode.drop(['titleType',"endYear"], axis=1).set_index('tconst'), on="tconst").drop(['parentTconst'],axis=1)

In [11]:
TV.head()

Unnamed: 0,tconst,S,E,TVShow,originalTitle,startYear,endYear,eposodeTitle,originalEposode,isAdult,year,minutes,genres,averageRating,numVotes
0,tt0041951,1,9,The Lone Ranger,The Lone Ranger,1949,1957,The Tenderfeet,The Tenderfeet,0,1949,30,Western,7.5,81.0
1,tt0042816,1,17,BBC Sunday-Night Theatre,BBC Sunday-Night Theatre,1950,1959,Othello,Othello,0,1950,135,Drama,,
2,tt0042889,\N,\N,BBC Sunday-Night Theatre,BBC Sunday-Night Theatre,1950,1959,The Tragedy of King Richard II/II,The Tragedy of King Richard II/II,0,1950,145,Drama,,
3,tt0043426,3,42,Studio One,Studio One,1948,1958,Coriolanus,Coriolanus,0,1951,60,Drama,,
4,tt0043631,2,16,BBC Sunday-Night Theatre,BBC Sunday-Night Theatre,1950,1959,The Life of King Henry V,The Life of King Henry V,0,1951,133,Drama,6.8,11.0


# Media with out TV searies

This will have make a new DataFrame without "tvEpisode"
- rename columns to be shorter
- drop columns that are not needed

In [12]:
Videos = tbasics.loc[tbasics.titleType != "tvEpisode"].rename(columns={"startYear": "year", "runtimeMinutes":"minutes", "averageRating":"rating", "numVotes":"votes"}).drop(["endYear", "isAdult"], axis=1)

In [13]:
Videos.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,year,minutes,genres,rating,votes
0,tt0000001,short,Carmencita,Carmencita,1894,1,"Documentary,Short",5.7,1980.0
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,1892,5,"Animation,Short",5.8,265.0
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,1892,4,"Animation,Comedy,Romance",6.5,1835.0
3,tt0000004,short,Un bon bock,Un bon bock,1892,12,"Animation,Short",5.6,179.0
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,1893,1,"Comedy,Short",6.2,2624.0


# Now I'm going to match actors with movies/tv

we need to read the principals and the nbasics

In [14]:
principals = pd.read_csv(principals_file, sep='\t')
nbasics = pd.read_csv(nbasics_file, sep='\t')

# make DataFrame for Actors
List actors first, then the movie

In [15]:
Actors = principals.loc[(principals['category'] == "actor")].drop(['characters','ordering', 'category', 'job'], axis=1).join(nbasics.drop(['primaryProfession', 'knownForTitles'], axis=1).set_index('nconst'), on="nconst" ).sort_values(by=['primaryName']).dropna().rename(columns={"primaryName":"Actor"}).drop(['nconst'], axis=1)
Actors = Actors.join(Videos.set_index('tconst'), on="tconst").dropna()


# Basic Actors table for Videos

In [16]:
Actors.head()

Unnamed: 0,tconst,Actor,birthYear,deathYear,titleType,primaryTitle,originalTitle,year,minutes,genres,rating,votes
32512571,tt21845446,&Audition,\N,\N,tvSeries,&Audition - The Howling,&Audition - The Howling,2022,\N,Reality-TV,6.3,9.0
42817138,tt4572696,'Army' Armstrong,\N,\N,videoGame,Monster Truck Madness,Monster Truck Madness,1996,\N,Sport,6.2,26.0
40193670,tt3440432,'Bazzar Angel Garcia,\N,\N,tvSeries,Tattoos After Dark,Tattoos After Dark,2014,\N,Reality-TV,6.4,117.0
35494263,tt2483220,'Big' Bob Hart,\N,\N,tvSeries,Crazy Hobo,Crazy Hobo,2008,5,Comedy,6.7,22.0
2759252,tt0383929,'Big' LeRoy Mobley,1973,\N,movie,Black August,Black August,2007,116,Drama,6.3,299.0


# Acters list in IMDB for Star Wars

In [17]:
Actors.loc[(Actors['primaryTitle'].str.contains("Star Wars"))]

Unnamed: 0,tconst,Actor,birthYear,deathYear,titleType,primaryTitle,originalTitle,year,minutes,genres,rating,votes
30901432,tt20784210,'Weird Al' Yankovic,1959,\N,tvSpecial,Lego Star Wars Summer Vacation,LEGO Star Wars Summer Vacation,2022,45,"Action,Adventure,Animation",5.6,1505.0
23151884,tt15164924,A.J. LoCascio,1987,\N,tvMovie,Lego Star Wars Terrifying Tales,Lego Star Wars Terrifying Tales,2021,44,"Action,Adventure,Animation",6.1,2516.0
16780547,tt12885438,A.J. LoCascio,1987,\N,tvMovie,The Lego Star Wars Holiday Special,The Lego Star Wars Holiday Special,2020,44,"Action,Adventure,Animation",6.4,7368.0
9038412,tt10461542,A.J. LoCascio,1987,\N,videoGame,Lego Star Wars: The Skywalker Saga,Lego Star Wars: The Skywalker Saga,2022,\N,"Action,Adventure,Comedy",8.8,2301.0
49650884,tt7439508,Aaron Liburd,\N,\N,short,Eternal: A Star Wars Fan Film,Eternal: A Star Wars Fan Film,2017,24,"Sci-Fi,Short",5.4,16.0
...,...,...,...,...,...,...,...,...,...,...,...,...
41189680,tt3853868,Zak Koonce,\N,\N,video,Jedi Party: Star Wars Ep1 - Auralnauts,STAR WARS EP 1: Jedi Party,2013,14,"Comedy,Short",9.2,31.0
49194938,tt7267338,Zander Martin,\N,\N,short,Birth of a Monster: A Star Wars Story,"Star Wars: Tales of the Twin Suns, Episode One...",2019,21,"Sci-Fi,Short",7.8,86.0
30571562,tt20449150,Zebulun Kovach,\N,\N,tvSeries,Star Wars: Gray Trials,Star Wars: Gray Trials,2022,8,Adventure,8.0,8.0
27705144,tt1789918,Ángel Beltrán,\N,\N,short,Star Wars: Extintion,Star Wars: Extinction,2006,11,"Action,Fantasy,Sci-Fi",6.4,48.0


# Several tables for selected Actors

In [18]:
Actors.loc[(Actors['Actor'].str.contains("Samuel L. Jackson"))]

Unnamed: 0,tconst,Actor,birthYear,deathYear,titleType,primaryTitle,originalTitle,year,minutes,genres,rating,votes
923505,tt0107659,Samuel L. Jackson,1948,\N,movie,Loaded Weapon 1,Loaded Weapon 1,1993,84,"Action,Comedy,Crime",6.2,49824.0
45621416,tt5804038,Samuel L. Jackson,1948,\N,movie,I Am Not Your Negro,I Am Not Your Negro,2016,93,"Documentary,History",7.9,22726.0
3448997,tt0486141,Samuel L. Jackson,1948,\N,video,Mr. Incredible and Pals,Mr. Incredible and Pals,2005,4,"Animation,Comedy,Family",6.5,925.0
18889061,tt1371159,Samuel L. Jackson,1948,\N,videoGame,Iron Man 2,Iron Man 2,2010,\N,"Action,Adventure,Crime",6.1,1023.0
1928340,tt0257076,Samuel L. Jackson,1948,\N,movie,S.W.A.T.,S.W.A.T.,2003,117,"Action,Adventure,Crime",6.0,151048.0
...,...,...,...,...,...,...,...,...,...,...,...,...
1706128,tt0221300,Samuel L. Jackson,1948,\N,tvMiniSeries,Jazz,Jazz,2001,114,"Documentary,History,Music",8.6,2380.0
28652668,tt1860353,Samuel L. Jackson,1948,\N,movie,Turbo,Turbo,2013,96,"Adventure,Animation,Comedy",6.4,103890.0
46319286,tt6079772,Samuel L. Jackson,1948,\N,movie,The Protégé,The Protégé,2021,109,"Action,Thriller",6.1,35019.0
3266225,tt0458339,Samuel L. Jackson,1948,\N,movie,Captain America: The First Avenger,Captain America: The First Avenger,2011,124,"Action,Adventure,Sci-Fi",6.9,866013.0


In [19]:
Actors.loc[(Actors['Actor'].str.contains("Jackie Chan"))]

Unnamed: 0,tconst,Actor,birthYear,deathYear,titleType,primaryTitle,originalTitle,year,minutes,genres,rating,votes
684264,tt0080436,Jackie Chan,1954,\N,movie,Battle Creek Brawl,Battle Creek Brawl,1980,95,"Action,Comedy,Crime",5.7,5601.0
781678,tt0091427,Jackie Chan,1954,\N,movie,Heart of Dragon,Long de xin,1985,98,"Action,Comedy,Crime",6.3,4989.0
2771323,tt0386005,Jackie Chan,1954,\N,movie,New Police Story,San ging chaat goo si,2004,123,"Action,Crime,Thriller",6.9,28074.0
781718,tt0091431,Jackie Chan,1954,\N,movie,Armour of God,Lung hing foo dai,1986,88,"Action,Adventure,Comedy",6.9,20986.0
25444266,tt1615160,Jackie Chan,1954,\N,movie,The Foreigner,The Foreigner,2017,113,"Action,Thriller",7.0,118580.0
...,...,...,...,...,...,...,...,...,...,...,...,...
3447967,tt0485976,Jackie Chan,1954,\N,movie,Rob-B-Hood,Bo bui gai wak,2006,136,"Action,Adventure,Comedy",6.6,15175.0
6444858,tt0865556,Jackie Chan,1954,\N,movie,The Forbidden Kingdom,The Forbidden Kingdom,2008,104,"Action,Adventure,Fantasy",6.5,107650.0
675961,tt0079484,Jackie Chan,1954,\N,movie,Dragon Fist,Long quan,1979,97,"Action,Drama",6.1,2523.0
1490582,tt0188708,Jackie Chang,\N,\N,movie,Jackie vs. Bruce to the Rescue,Shuang bei,1982,90,Action,4.5,27.0


In [20]:
Actors.loc[(Actors['Actor'].str.contains("Jean-Claude Van Damme"))]

Unnamed: 0,tconst,Actor,birthYear,deathYear,titleType,primaryTitle,originalTitle,year,minutes,genres,rating,votes
27216779,tt1735862,Jean-Claude Van Damme,1960,\N,movie,Dragon Eyes,Dragon Eyes,2012,91,"Action,Crime,Drama",4.4,6190.0
981781,tt0114576,Jean-Claude Van Damme,1960,\N,movie,Sudden Death,Sudden Death,1995,111,"Action,Crime,Thriller",5.8,35870.0
37583551,tt2739502,Jean-Claude Van Damme,1960,\N,tvMovie,The Making of 'Hard Target',The Making of 'Hard Target',1993,\N,Documentary,8.2,11.0
857825,tt0100029,Jean-Claude Van Damme,1960,\N,movie,Lionheart,Lionheart,1990,108,"Action,Crime,Drama",6.2,37988.0
3293598,tt0462329,Jean-Claude Van Damme,1960,\N,video,The Hard Corps,The Hard Corps,2006,110,"Action,Thriller",5.2,6385.0
837048,tt0097659,Jean-Claude Van Damme,1960,\N,movie,Kickboxer,Kickboxer,1989,97,"Action,Sport,Thriller",6.4,58085.0
1402332,tt0176269,Jean-Claude Van Damme,1960,\N,movie,Universal Soldier: The Return,Universal Soldier: The Return,1999,83,"Action,Sci-Fi",4.2,30091.0
2650165,tt0367478,Jean-Claude Van Damme,1960,\N,movie,Wake of Death,Wake of Death,2004,91,"Action,Adventure,Drama",5.5,9809.0
45535897,tt5767628,Jean-Claude Van Damme,1960,\N,movie,Kill 'Em All,Kill 'Em All,2017,100,"Action,Crime,Mystery",4.3,2783.0
50872400,tt7903530,Jean-Claude Van Damme,1960,\N,movie,We Die Young,We Die Young,2019,92,"Action,Crime,Drama",5.2,4796.0


# Make a DF of Cinematographers


In [21]:
DOP = principals.loc[(principals['category'] == "cinematographer")].drop(['characters','ordering', 'category', 'job'], axis=1).join(nbasics.drop(['primaryProfession', 'knownForTitles'], axis=1).set_index('nconst'), on="nconst" ).sort_values(by=['primaryName']).dropna().rename(columns={"primaryName":"Crew"}).drop(['nconst'], axis=1)
DOP = DOP.join(Videos.set_index('tconst'), on="tconst").dropna()


# This someone I work with in IATSE
- this is a list of Movies one of my co-workers has worked on.

In [22]:
DOP.loc[(DOP['Crew'].str.contains("John Holosko"))]

Unnamed: 0,tconst,Crew,birthYear,deathYear,titleType,primaryTitle,originalTitle,year,minutes,genres,rating,votes
1348940,tt0167959,John Holosko,\N,\N,tvMovie,Giving Up the Ghost,Giving Up the Ghost,1998,96,"Comedy,Crime,Drama",5.9,79.0
2144903,tt0289758,John Holosko,\N,\N,movie,Phase IV,Phase IV,2002,103,"Action,Drama,Thriller",4.8,478.0
2249068,tt0305168,John Holosko,\N,\N,short,6ix,6ix,1999,11,"Comedy,Horror,Short",6.6,112.0
6784755,tt0910873,John Holosko,\N,\N,video,An Evening with Kevin Smith 2: Evening Harder,An Evening with Kevin Smith 2: Evening Harder,2006,239,"Comedy,Documentary",7.7,3131.0
12917839,tt1160750,John Holosko,\N,\N,tvSpecial,One x One Gala,One x One Gala,2007,93,Music,5.3,11.0
45470464,tt5738768,John Holosko,\N,\N,movie,Nursery Rhyme of a Madman,Nursery Rhyme of a Madman,2017,98,"Comedy,Crime,Drama",6.8,17.0
3025604,tt0423559,John Holosko,\N,\N,movie,"Ydessa, the Bears and etc...","Ydessa, les ours et etc...",2004,44,Documentary,7.1,407.0
2172173,tt0293664,John Holosko,\N,\N,movie,Treed Murray,Treed Murray,2001,90,"Drama,Thriller",7.2,1567.0
1019829,tt0119093,John Holosko,\N,\N,tvMovie,Face Down,Face Down,1997,97,"Crime,Drama,Mystery",5.5,224.0
1799212,tt0236823,John Holosko,\N,\N,movie,Undertaker,Undertaker,1996,107,"Action,Thriller",5.0,8.0
