Example EDA for IMDB TV shows:
- https://towardsdatascience.com/imdb-tv-show-data-analysis-4961ef39d174
- https://towardsdatascience.com/imdb-television-show-data-analysis-part-2-39ebf47977ff

In [1]:
# Stretch the notebook container to the full browser width.
# `display` and `HTML` are imported from the public IPython.display module;
# importing them from IPython.core.display has been deprecated since IPython 7.14.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

# Silence every warning for the rest of the session.
# NOTE(review): this also hides pandas deprecation/chained-assignment warnings.
import warnings
warnings.filterwarnings('ignore')

In [2]:
# import dependencies and libraries
import pandas as pd
import numpy as np

In [3]:
# Notebook-wide pandas settings: never truncate displayed rows or columns,
# and switch off the chained-assignment check.
pd.options.display.max_rows = None
pd.options.display.max_columns = None
pd.set_option('mode.chained_assignment', None)
# Disabled: fixed three-decimal float formatting.
#pd.set_option('display.float_format', lambda x: '%.3f' % x)

In [4]:
# Source: https://datasets.imdbws.com/

# Every IMDb dump is a gzip-compressed, tab-separated file with a header row,
# so the reader options are shared across all of the loads below.
imdb_tsv_options = dict(compression='gzip', header=0, sep='\t')

# titles and their ratings
basics = pd.read_csv('data/title.basics.tsv.gz', **imdb_tsv_options)
ratings = pd.read_csv('data/title.ratings.tsv.gz', **imdb_tsv_options)
akas = pd.read_csv('data/title.akas.tsv.gz', **imdb_tsv_options)
episodes = pd.read_csv('data/title.episode.tsv.gz', **imdb_tsv_options)

# people attached to titles
principals = pd.read_csv('data/title.principals.tsv.gz', **imdb_tsv_options)
crews = pd.read_csv('data/title.crew.tsv.gz', **imdb_tsv_options)
names = pd.read_csv('data/name.basics.tsv.gz', **imdb_tsv_options)

In [5]:
# Keep only the IMDb title types we are interested in and collapse them to the
# two media types used for matching: FILM and TV.
dictionary_for_mapping_mediaTypes = {'movie': "FILM", 'tvMiniSeries': "TV", 'tvSeries': "TV"}

# .copy() makes basics_interested an independent frame, so the column
# assignments below are not writes into a filtered slice of `basics`.
basics_interested = basics[basics["titleType"].isin(list(dictionary_for_mapping_mediaTypes))].copy()

# Assign the replaced column back explicitly. Calling replace(inplace=True) on
# a column selection is chained assignment: it only updates a temporary Series
# when pandas Copy-on-Write is active, leaving the frame unchanged.
basics_interested["titleType"] = basics_interested["titleType"].replace(dictionary_for_mapping_mediaTypes)

# Lower-case titles so they can be compared case-insensitively.
basics_interested["primaryTitle"] = basics_interested["primaryTitle"].str.lower()

In [6]:
# Look up the IMDb id of a FILM/TV entry by searching for a keyword in its title.
# A start-year condition such as (basics_interested.startYear=='') can be
# and-ed onto the mask to narrow the hits further.
title_has_keyword = basics_interested["primaryTitle"].str.contains('without glory')
is_film = basics_interested["titleType"] == 'FILM'
basics_interested[title_has_keyword & is_film]

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
170810,tt0176753,FILM,heroes without glory,I giardini del diavolo,0,1971,\N,92,"Drama,War"
770005,tt0794282,FILM,fama: heroism without glory,Fama: Heroism Without Glory,0,2004,\N,52,"Biography,Documentary,History"


In [7]:
def dequote(s):
    """
    Strip one pair of matching quotes from around a string.

    If `s` starts and ends with the same quote character (single or double),
    return `s` without that outer pair. In every other case `s` is returned
    unchanged, including:

    - strings with no quotes or with mismatched quotes,
    - the empty string and a lone quote character (there is no pair to remove),
    - non-string values such as NaN for a missing title.
    """
    if not isinstance(s, str):
        # e.g. a float NaN coming from a missing cell; nothing to dequote
        return s
    # Require at least two characters so the opening and closing quote are
    # distinct characters, and so indexing never hits an empty string.
    if len(s) >= 2 and s[0] == s[-1] and s[0] in ("'", '"'):
        return s[1:-1]
    return s

In [8]:
# Main dataset: the military-Hollywood collaboration database from Kaggle
# (https://www.kaggle.com/ksb357/military-hollywood-collaboration-database)
military_hollywood = pd.read_csv('data/military-hollywood-full.csv', sep=',')

In [9]:
# Normalise titles for matching: strip a surrounding quote pair, then lower-case.
military_hollywood['Title'] = military_hollywood['Title'].apply(dequote).str.lower()

In [10]:
# (rows, columns) of the collaboration table after title normalisation
military_hollywood.shape

(865, 6)

In [11]:
# Exploratory left join on (title, media type). It is not usable as-is:
# titles are not unique in IMDb, so one collaboration row can fan out to several
# candidate tconst values, and titles without an exact match come back empty.
imdb_lookup_columns = ['tconst', 'titleType', 'primaryTitle', 'originalTitle', 'startYear', 'genres']
military_hollywood.merge(
    basics_interested[imdb_lookup_columns],
    how='left',
    left_on=['Title', 'Media Type'],
    right_on=['primaryTitle', 'titleType'],
)

Unnamed: 0,Title,Subtitle,Status,Media Type,Year,Remarks,tconst,titleType,primaryTitle,originalTitle,startYear,genres
0,1968,,OTH,FILM,,THE FILM STARTED OUT VERY NEGATIVE FOR THE ARM...,tt0166470,FILM,1968,1968,1989,Documentary
1,1968,,OTH,FILM,,THE FILM STARTED OUT VERY NEGATIVE FOR THE ARM...,tt7675404,FILM,1968,1968,2018,"History,Sport"
2,"1,000 men and a baby",,APP,TV,1997.0,VERY POSITIVE DEPICTION OF NAVY IN THIS KOREAN...,,,,,,
3,1st force,,OTH,FILM,,INITIALLY DOD AND USMC WERE INCLINED TO SUPPOR...,,,,,,
4,24,22,APP,TV,2004.0,APPROVED FILMING FOR ONE DAY WITH TWO MARINE C...,tt0285331,TV,24,24,2001,"Action,Crime,Drama"
5,3rd degree,,APP,TV,1989.0,PERSONNEL APPEARED ON THIS GAME SHOW AT THE EX...,,,,,,
6,50/50,,DEN,FILM,,NEVER WAS OFFICIALLY SUBMITTED TO DOD. IT WAS ...,tt0076122,FILM,50/50,Halbe-Halbe,1977,\N
7,50/50,,DEN,FILM,,NEVER WAS OFFICIALLY SUBMITTED TO DOD. IT WAS ...,tt10650688,FILM,50/50,Aalukku Paathi 50/50,2019,"Comedy,Drama,Horror"
8,50/50,,DEN,FILM,,NEVER WAS OFFICIALLY SUBMITTED TO DOD. IT WAS ...,tt1306980,FILM,50/50,50/50,2011,"Comedy,Drama,Romance"
9,50/50,,DEN,FILM,,NEVER WAS OFFICIALLY SUBMITTED TO DOD. IT WAS ...,tt1734060,FILM,50/50,50/50,2010,Documentary


In [12]:
# Non-null value counts per column for every titleType in the unfiltered basics table
basics.groupby('titleType').count()

Unnamed: 0_level_0,tconst,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
titleType,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
audiobook,1,1,1,1,1,1,1,1
episode,1,1,1,1,1,1,1,1
movie,570842,570842,570842,570842,570842,570842,570842,570842
radioSeries,1,1,1,1,1,1,1,1
short,799224,799224,799224,799224,799224,799224,799224,799224
tvEpisode,5593217,5593209,5593209,5593217,5593217,5593217,5593217,5593207
tvMiniSeries,36291,36291,36291,36291,36291,36291,36291,36291
tvMovie,130437,130437,130437,130437,130437,130437,130437,130437
tvSeries,203265,203265,203265,203265,203265,203265,203265,203265
tvShort,9611,9611,9611,9611,9611,9611,9611,9611


In [13]:
# Load the copy of the military-Hollywood table that carries an added IMDB_ID column,
# then replace missing years with empty strings.
# NOTE(review): this path has no data/ prefix, unlike the other inputs - confirm the file location.
military_hollywood_imdbid = pd.read_csv('military-hollywood-full_imdbidAdded.csv', sep=',')
military_hollywood_imdbid['Year'] = military_hollywood_imdbid['Year'].fillna('')

In [14]:
# TODO:
#
# Fill in the blank IMDB_ID values in the military-hollywood-full_imdbidAdded.csv file.
military_hollywood_imdbid

Unnamed: 0,Title,IMDB_ID,Subtitle,Status,Media Type,Year,Remarks
0,"""1968""",,,OTH,FILM,,THE FILM STARTED OUT VERY NEGATIVE FOR THE ARM...
1,"1,000 MEN AND A BABY",,,APP,TV,1997.0,VERY POSITIVE DEPICTION OF NAVY IN THIS KOREAN...
2,1ST FORCE,,,OTH,FILM,,INITIALLY DOD AND USMC WERE INCLINED TO SUPPOR...
3,24,,22,APP,TV,2004.0,APPROVED FILMING FOR ONE DAY WITH TWO MARINE C...
4,3RD DEGREE,,,APP,TV,1989.0,PERSONNEL APPEARED ON THIS GAME SHOW AT THE EX...
5,50/50,tt1306980,,DEN,FILM,,NEVER WAS OFFICIALLY SUBMITTED TO DOD. IT WAS ...
6,55 DAYS AT PEKING,tt0056800,,OTH,FILM,1963.0,PERIOD PIECE ABOUT MARINES IN CHINA DURING THE...
7,84 CHARLIE MOPIC,tt0096744,,OTH,FILM,1989.0,VIETNAM MOVIE ABOUT MOTION PICTURE CORRESPONDE...
8,A FEW GOOD MEN,tt0104257,,LIM,FILM,1992.0,"INACCURATE, NEGATIVE PORTRAVALS OF MARINES. PR..."
9,A MIDNIGHT CLEAR,tt0102443,,DEN,FILM,1992.0,DECLINED ASSISTANCE (REQUEST FOR WW II FACILIT...


Source: https://www.kaggle.com/stephanerappeneau/350-000-movies-from-themoviedborg

In [38]:
# Semicolon-delimited movie details export; release_date is parsed as dates on load.
movies = pd.read_csv(
    'data/AllMoviesDetailsCleaned.csv',
    sep=';',
    encoding='utf-8-sig',
    engine='python',
    parse_dates=['release_date'],
)

In [33]:
# Preview the first rows of the movie details table.
# NOTE(review): this cell's execution count is lower than the load cell above - re-run the notebook top to bottom to confirm the output is current.
movies.head()

Unnamed: 0,id,budget,genres,imdb_id,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,production_companies_number,production_countries_number,spoken_languages_number
0,2,0,Drama|Crime,tt0094675,fi,Ariel,Taisto Kasurinen is a Finnish coal miner whose...,0.823904,Villealfa Filmproduction Oy,Finland,1988-10-21,0,69.0,suomi,Released,,Ariel,7.1,40,2,1,2
1,3,0,Drama|Comedy,tt0092149,fi,Varjoja paratiisissa,"An episode in the life of Nikander, a garbage ...",0.47445,Villealfa Filmproduction Oy,Finland,1986-10-16,0,76.0,English,Released,,Shadows in Paradise,7.0,32,1,1,3
2,5,4000000,Crime|Comedy,tt0113101,en,Four Rooms,It's Ted the Bellhop's first night on the job....,1.698,Miramax Films,United States of America,1995-12-25,4300000,98.0,English,Released,Twelve outrageous guests. Four scandalous requ...,Four Rooms,6.5,485,2,1,1
3,6,0,Action|Thriller|Crime,tt0107286,en,Judgment Night,"While racing to a boxing match, Frank, Mike, J...",1.32287,Universal Pictures,Japan,1993-10-15,12136938,110.0,English,Released,Don't move. Don't whisper. Don't even breathe.,Judgment Night,6.5,69,3,2,1
4,8,42000,Documentary,tt0825671,en,Life in Loops (A Megacities RMX),Timo Novotny labels his new project an experim...,0.054716,inLoops,Austria,2006-01-01,0,80.0,English,Released,A Megacities remix.,Life in Loops (A Megacities RMX),6.4,4,1,1,5
