# Basic Exploration Of Data And Potential Uses

In this document we will:
- Look at each CSV file.
- Comment on its strengths and weaknesses.
- Make cursory notes for potential research uses.

In [3]:
import numpy as np
import pandas as pd
from glob import glob
import matplotlib.pyplot as plt
%matplotlib inline

In [71]:
# Load every gzipped CSV under zippedData/ into one list of DataFrames.
# glob() returns paths in arbitrary, OS-dependent order, while the cells
# below address the frames by position (csv_dfs[0] ... csv_dfs[8]), so
# the paths are sorted to keep those indices reproducible.

csv_files = sorted(glob('zippedData/*.csv.gz'))
csv_dfs = [pd.read_csv(f) for f in csv_files]
len(csv_dfs)

9

In [32]:
# BOM Grosses
# Very, very useful, but also read the notes for TN Budgets. The studio
# info is useful too. We could say "Horror made by studio X does well
# while horror made by studio Y does poorly. Invest in studio X for
# horror films." Or something like that.

print(csv_dfs[0].shape[0])
csv_dfs[0].iloc[:10]

3387


Unnamed: 0,title,studio,domestic_gross,foreign_gross,year
0,Toy Story 3,BV,415000000.0,652000000,2010
1,Alice in Wonderland (2010),BV,334200000.0,691300000,2010
2,Harry Potter and the Deathly Hallows Part 1,WB,296000000.0,664300000,2010
3,Inception,WB,292600000.0,535700000,2010
4,Shrek Forever After,P/DW,238700000.0,513900000,2010
5,The Twilight Saga: Eclipse,Sum.,300500000.0,398000000,2010
6,Iron Man 2,Par.,312400000.0,311500000,2010
7,Tangled,BV,200800000.0,391000000,2010
8,Despicable Me,Uni.,251500000.0,291600000,2010
9,How to Train Your Dragon,P/DW,217600000.0,277300000,2010


In [31]:
# IMDB Names
# Gender data is only marked through the actor/actress labels. But some
# actors also direct, write, etc., so this could be used to draw out
# more gendered conclusions.

# So many people are lacking birth and death years that those columns
# feel almost useless.

csv_dfs[1].iloc[:10]

Unnamed: 0,nconst,primary_name,birth_year,death_year,primary_profession,known_for_titles
0,nm0061671,Mary Ellen Bauder,,,"miscellaneous,production_manager,producer","tt0837562,tt2398241,tt0844471,tt0118553"
1,nm0061865,Joseph Bauer,,,"composer,music_department,sound_department","tt0896534,tt6791238,tt0287072,tt1682940"
2,nm0062070,Bruce Baum,,,"miscellaneous,actor,writer","tt1470654,tt0363631,tt0104030,tt0102898"
3,nm0062195,Axel Baumann,,,"camera_department,cinematographer,art_department","tt0114371,tt2004304,tt1618448,tt1224387"
4,nm0062798,Pete Baxter,,,"production_designer,art_department,set_decorator","tt0452644,tt0452692,tt3458030,tt2178256"
5,nm0062879,Ruel S. Bayani,,,"director,production_manager,miscellaneous","tt2590280,tt0352080,tt0216559,tt2057445"
6,nm0063198,Bayou,,,actor,"tt6579724,tt0093116"
7,nm0063432,Stevie Be-Zet,,,"composer,soundtrack","tt3106212,tt0478239,tt0264917,tt1626606"
8,nm0063618,Jeff Beal,1963.0,,"composer,music_department,soundtrack","tt0183659,tt2545118,tt0384766,tt1856010"
9,nm0063750,Lindsay Beamish,,,"actress,miscellaneous","tt0404826,tt0111756,tt0367027,tt1492842"


In [37]:
# IMDB AKAs
# Basically just what the movies are called in other countries. I don't
# see much use in this one.

print(csv_dfs[2].shape[0])
csv_dfs[2].iloc[:20]

331703


Unnamed: 0,title_id,ordering,title,region,language,types,attributes,is_original_title
0,tt0369610,10,Джурасик свят,BG,bg,,,0.0
1,tt0369610,11,Jurashikku warudo,JP,,imdbDisplay,,0.0
2,tt0369610,12,Jurassic World: O Mundo dos Dinossauros,BR,,imdbDisplay,,0.0
3,tt0369610,13,O Mundo dos Dinossauros,BR,,,short title,0.0
4,tt0369610,14,Jurassic World,FR,,imdbDisplay,,0.0
5,tt0369610,15,Jurassic World,GR,,imdbDisplay,,0.0
6,tt0369610,16,Jurassic World,IT,,imdbDisplay,,0.0
7,tt0369610,17,Jurski svijet,HR,,imdbDisplay,,0.0
8,tt0369610,18,Olam ha'Yura,IL,he,imdbDisplay,,0.0
9,tt0369610,19,Jurassic World: Mundo Jurásico,MX,,imdbDisplay,,0.0


In [34]:
# IMDB Basics
# Good for genres, runtimes, and release year. Could be used to track
# which genres are on the rise, and then either invest in those films
# or actively avoid them (because the market is oversaturated).

print(csv_dfs[3].shape[0])
csv_dfs[3].iloc[:10]

146144


Unnamed: 0,tconst,primary_title,original_title,start_year,runtime_minutes,genres
0,tt0063540,Sunghursh,Sunghursh,2013,175.0,"Action,Crime,Drama"
1,tt0066787,One Day Before the Rainy Season,Ashad Ka Ek Din,2019,114.0,"Biography,Drama"
2,tt0069049,The Other Side of the Wind,The Other Side of the Wind,2018,122.0,Drama
3,tt0069204,Sabse Bada Sukh,Sabse Bada Sukh,2018,,"Comedy,Drama"
4,tt0100275,The Wandering Soap Opera,La Telenovela Errante,2017,80.0,"Comedy,Drama,Fantasy"
5,tt0111414,A Thin Life,A Thin Life,2018,75.0,Comedy
6,tt0112502,Bigfoot,Bigfoot,2017,,"Horror,Thriller"
7,tt0137204,Joe Finds Grace,Joe Finds Grace,2017,83.0,"Adventure,Animation,Comedy"
8,tt0139613,O Silêncio,O Silêncio,2012,,"Documentary,History"
9,tt0144449,Nema aviona za Zagreb,Nema aviona za Zagreb,2012,82.0,Biography


In [33]:
# IMDB Crew
# Titles with their directors and writers. The same links also exist in
# the IMDB Principals data; here they simply sit together on one line.

print(csv_dfs[4].shape[0])
csv_dfs[4].iloc[:10]

146144


Unnamed: 0,tconst,directors,writers
0,tt0285252,nm0899854,nm0899854
1,tt0438973,,"nm0175726,nm1802864"
2,tt0462036,nm1940585,nm1940585
3,tt0835418,nm0151540,"nm0310087,nm0841532"
4,tt0878654,"nm0089502,nm2291498,nm2292011",nm0284943
5,tt0879859,nm2416460,
6,tt0996958,nm2286991,"nm2286991,nm2651190"
7,tt0999913,nm0527109,"nm0527109,nm0329051,nm0001603,nm0930684"
8,tt10003792,nm10539228,nm10539228
9,tt10005130,nm10540239,"nm5482263,nm10540239"


In [36]:
# IMDB Principals
# Gendered data here via the actor/actress categories.

# Not sure exactly what "ordering" refers to. Amount of pay?
# Appearance order in the credits? Either way, each unique movie
# starts from 1 and counts up. "tconst" is the unique ID for titles.

print(csv_dfs[5].shape[0])
csv_dfs[5].iloc[:20]

1028186


Unnamed: 0,tconst,ordering,nconst,category,job,characters
0,tt0111414,1,nm0246005,actor,,"[""The Man""]"
1,tt0111414,2,nm0398271,director,,
2,tt0111414,3,nm3739909,producer,producer,
3,tt0323808,10,nm0059247,editor,,
4,tt0323808,1,nm3579312,actress,,"[""Beth Boothby""]"
5,tt0323808,2,nm2694680,actor,,"[""Steve Thomson""]"
6,tt0323808,3,nm0574615,actor,,"[""Sir Lachlan Morrison""]"
7,tt0323808,4,nm0502652,actress,,"[""Lady Delia Morrison""]"
8,tt0323808,5,nm0362736,director,,
9,tt0323808,6,nm0811056,producer,producer,


In [16]:
# IMDB Ratings
# Averaged ratings. Lots of rows have very low vote counts.

print(csv_dfs[6].shape[0])
csv_dfs[6].iloc[:5]

73856


Unnamed: 0,tconst,averagerating,numvotes
0,tt10356526,8.3,31
1,tt10384606,8.9,559
2,tt1042974,6.4,20
3,tt1043726,4.2,50352
4,tt1060240,6.5,21


In [39]:
# TMDB Movies
# Good for grabbing genres together with the popularity of a film.

# If we do anything with votes, we should drop all rows with fewer
# than 300 votes. This is the default setting on the site's
# search-by-top-rated page. Seems like a good arbitrary number.

print(csv_dfs[7].shape[0])
csv_dfs[7].iloc[:5]

26517


Unnamed: 0.1,Unnamed: 0,genre_ids,id,original_language,original_title,popularity,release_date,title,vote_average,vote_count
0,0,"[12, 14, 10751]",12444,en,Harry Potter and the Deathly Hallows: Part 1,33.533,2010-11-19,Harry Potter and the Deathly Hallows: Part 1,7.7,10788
1,1,"[14, 12, 16, 10751]",10191,en,How to Train Your Dragon,28.734,2010-03-26,How to Train Your Dragon,7.7,7610
2,2,"[12, 28, 878]",10138,en,Iron Man 2,28.515,2010-05-07,Iron Man 2,6.8,12368
3,3,"[16, 35, 10751]",862,en,Toy Story,28.005,1995-11-22,Toy Story,7.9,10174
4,4,"[28, 878, 12]",27205,en,Inception,27.92,2010-07-16,Inception,8.3,22186


In [68]:
# TN Movie Budgets
# Better than BOM insofar as it has budget and worldwide gross. Worse
# in that it lacks studios. We could merge the DataFrames, but if there
# are minor spelling inconsistencies in the strings we could end up
# with lots of duplicate or near-duplicate rows. Not ideal.

# There are also almost twice as many films here as in BOM. I propose
# we left join TN-BOM and go from there.

print(csv_dfs[8].shape[0])
csv_dfs[8].iloc[:5]

5782


Unnamed: 0,id,release_date,movie,production_budget,domestic_gross,worldwide_gross
0,1,"Dec 18, 2009",Avatar,"$425,000,000","$760,507,625","$2,776,345,279"
1,2,"May 20, 2011",Pirates of the Caribbean: On Stranger Tides,"$410,600,000","$241,063,875","$1,045,663,875"
2,3,"Jun 7, 2019",Dark Phoenix,"$350,000,000","$42,762,350","$149,762,350"
3,4,"May 1, 2015",Avengers: Age of Ultron,"$330,600,000","$459,005,868","$1,403,013,963"
4,5,"Dec 15, 2017",Star Wars Ep. VIII: The Last Jedi,"$317,000,000","$620,181,382","$1,316,721,747"


In [65]:
# Count the films whose worldwide gross is recorded as "$0".
# I believe this means not that they were unprofitable,
# but that they never screened abroad. These rows should be dropped.
# NOTE(review): in the rows shown by head() for this table,
# worldwide_gross is larger than domestic_gross, so worldwide may include
# domestic -- in which case "$0" would mean no recorded gross at all
# rather than "no foreign release". Confirm before dropping.

x = csv_dfs[8]
# The loop this replaces compared each value against the string '$0';
# the same comparison is done here as one vectorized boolean mask, and
# summing the mask counts the matches. int() keeps wwgn a plain Python
# int so the printed percentage is formatted exactly as before.
wwgn = int((x['worldwide_gross'] == '$0').sum())
print(wwgn, "with $0 ----", 100*(wwgn/len(x)), "% of total")

367 with $0 ---- 6.347284676582497 % of total
