Example EDA for IMDB TV shows:
- https://towardsdatascience.com/imdb-tv-show-data-analysis-4961ef39d174
- https://towardsdatascience.com/imdb-television-show-data-analysis-part-2-39ebf47977ff

In [1]:
# set the width to full
# `display` and `HTML` come from the public `IPython.display` module;
# importing them from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

# ignore warnings
# NOTE(review): this silences every warning category for the whole session,
# including deprecation notices from pandas/numpy.
import warnings
warnings.filterwarnings('ignore')

In [2]:
# import dependencies and libraries
import pandas as pd
import numpy as np

In [3]:
# Display options: render every row and every column when a DataFrame is shown.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
# Setting this option to None switches off pandas' chained-assignment (SettingWithCopy) check.
pd.options.mode.chained_assignment = None
# Disabled: would format floats with three digits after the decimal point.
#pd.set_option('display.float_format', lambda x: '%.3f' % x)

In [4]:
# Source: https://datasets.imdbws.com/

# read the datasets
# Each file is read as a gzip-compressed, tab-separated table whose first row is the header.
# Paths are relative to the working directory (expects a local `data/` folder).
# basics  -> title type, start year and genres per `tconst`
# ratings -> average rating per `tconst`
basics = pd.read_csv('data/title.basics.tsv.gz', compression='gzip', header=0, sep='\t')
ratings = pd.read_csv('data/title.ratings.tsv.gz', compression='gzip', header=0, sep='\t')
# Currently disabled: principals and episode tables.
#principals = pd.read_csv('data/title.principals.tsv.gz', compression='gzip', header=0, sep='\t')
#episodes = pd.read_csv('data/title.episode.tsv.gz', compression='gzip', header=0, sep='\t')
# names -> `nconst` to `primaryName` lookup; crews -> director ids per `tconst`
names = pd.read_csv('data/name.basics.tsv.gz', compression='gzip', header=0, sep='\t')
crews = pd.read_csv('data/title.crew.tsv.gz', compression='gzip', header=0, sep='\t')
# Currently disabled: akas table.
#akas = pd.read_csv('data/title.akas.tsv.gz', compression='gzip', header=0, sep='\t')

In [5]:
basics.groupby('titleType').count()

Unnamed: 0_level_0,tconst,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
titleType,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
audiobook,1,1,1,1,1,1,1,1
episode,1,1,1,1,1,1,1,1
movie,570842,570842,570842,570842,570842,570842,570842,570842
radioSeries,1,1,1,1,1,1,1,1
short,799224,799224,799224,799224,799224,799224,799224,799224
tvEpisode,5593217,5593209,5593209,5593217,5593217,5593217,5593217,5593207
tvMiniSeries,36291,36291,36291,36291,36291,36291,36291,36291
tvMovie,130437,130437,130437,130437,130437,130437,130437,130437
tvSeries,203265,203265,203265,203265,203265,203265,203265,203265
tvShort,9611,9611,9611,9611,9611,9611,9611,9611


In [6]:
# keep only the title types that matter for this analysis
title_types_of_interest = ['movie', 'tvMiniSeries', 'tvSeries', 'tvMovie', 'video', 'tvEpisode', 'tvSpecial', 'short']
is_type_of_interest = basics['titleType'].isin(title_types_of_interest)
basics_interested = basics[is_type_of_interest]

In [7]:
# read the main data [source](https://www.kaggle.com/ksb357/military-hollywood-collaboration-database)
# Only the six columns named in `usecols` are loaded; `IMDB_ID` is the key used by the merges further down.
military_hollywood = pd.read_csv('military-hollywood-full_imdbidAdded.csv', delimiter=',', usecols=['Title', 'IMDB_ID', 'Status', 'Media Type', 'Year','Remarks'])

In [8]:
military_hollywood.shape

(858, 6)

In [9]:
military_hollywood.head()

Unnamed: 0,Title,IMDB_ID,Status,Media Type,Year,Remarks
0,"""1968""",,OTH,FILM,,THE FILM STARTED OUT VERY NEGATIVE FOR THE ARM...
1,"1,000 MEN AND A BABY",tt0133231,APP,TV,1997.0,VERY POSITIVE DEPICTION OF NAVY IN THIS KOREAN...
2,1ST FORCE,,OTH,FILM,,INITIALLY DOD AND USMC WERE INCLINED TO SUPPOR...
3,24,tt0285331,APP,TV,2004.0,APPROVED FILMING FOR ONE DAY WITH TWO MARINE C...
4,3RD DEGREE,tt0098469,APP,TV,1989.0,PERSONNEL APPEARED ON THIS GAME SHOW AT THE EX...


In [10]:
# enrich the main dataset with release year, genres and title type from title.basics
basics_columns = basics_interested[['tconst', 'startYear', 'genres', 'titleType']]
aggregated_data = military_hollywood.merge(
    basics_columns,
    how='left',
    left_on='IMDB_ID',
    right_on='tconst',
)

In [11]:
# check if any rows with imdb_id not null but tconst is null
aggregated_data[(aggregated_data.IMDB_ID.notna())&(aggregated_data.tconst.isna())]

Unnamed: 0,Title,IMDB_ID,Status,Media Type,Year,Remarks,tconst,startYear,genres,titleType


In [12]:
aggregated_data[['Media Type', 'titleType']].drop_duplicates()#[(aggregated_data.IMDB_ID.notna())&(aggregated_data.tconst.isna())]

Unnamed: 0,Media Type,titleType
0,FILM,
1,TV,tvMovie
3,TV,tvSeries
5,FILM,movie
15,OTH,video
23,TV,tvEpisode
26,TV,
30,TV,tvSpecial
36,TV,short
43,FILM,tvSeries


In [13]:
# left-join the crew table onto the merged dataset to bring in the director ids
crew_columns = crews[['tconst', 'directors']]
aggregated_data1 = aggregated_data.merge(
    crew_columns,
    how='left',
    left_on='IMDB_ID',
    right_on='tconst',
)

In [14]:
aggregated_data1.shape

(858, 12)

In [16]:
# left-join the ratings table onto aggregated_data1 to bring in the average rating
rating_columns = ratings[['tconst', 'averageRating']]
aggregated_data3 = aggregated_data1.merge(
    rating_columns,
    how='left',
    left_on='IMDB_ID',
    right_on='tconst',
)

In [17]:
# split aggregated_data3 in two: rows whose IMDB_ID is missing or empty, and all other rows
# (plot and keywords information can only be requested for rows that carry an id)
imdb_id_missing = aggregated_data3.IMDB_ID.isna() | (aggregated_data3.IMDB_ID == '')
aggregated_data3_imdbid_isna = aggregated_data3.loc[imdb_id_missing]
rows_without_id = aggregated_data3_imdbid_isna.index
aggregated_data3_imdbid_notna = aggregated_data3[~aggregated_data3.index.isin(rows_without_id)]

In [18]:
aggregated_data3_imdbid_isna.shape

(73, 14)

In [19]:
aggregated_data3_imdbid_notna.shape

(785, 14)

In [20]:
aggregated_data3_imdbid_isna.head()

Unnamed: 0,Title,IMDB_ID,Status,Media Type,Year,Remarks,tconst_x,startYear,genres,titleType,tconst_y,directors,tconst,averageRating
0,"""1968""",,OTH,FILM,,THE FILM STARTED OUT VERY NEGATIVE FOR THE ARM...,,,,,,,,
2,1ST FORCE,,OTH,FILM,,INITIALLY DOD AND USMC WERE INCLINED TO SUPPOR...,,,,,,,,
10,A MORAL ISSUE,,DEN,FILM,,THERE ARE NO RECORDS OF DOD EVER APPROVING THE...,,,,,,,,
26,"AIRPOWER VIETNAM, THE REAP TOP GUN",,APP,TV,,THE REQUEST FOR FILMING ON LUKE AFB WAS GRANTE...,,,,,,,,
28,ALL THE UNSUNG HEROES,,APP,TV,1991.0,DOD APPROVED FOR STOCK FOOTAGE. THIS VIDEO WAS...,,,,,,,,


In [21]:
# Add the four lookup columns, filled with NaN, to the rows that have no IMDb id.
# `.assign` returns a new frame instead of writing into a slice of `aggregated_data3`,
# and `np.nan` replaces the `np.NaN` alias, which was removed in NumPy 2.0.
aggregated_data3_imdbid_isna = aggregated_data3_imdbid_isna.assign(
    plot=np.nan,
    released_dates=np.nan,
    directors_name=np.nan,
    awards=np.nan,
)

In [22]:
# Run only once to request the plot and released dates information

#. import requests
#. 
#. params = {'plot':'full'}
#. apiKey = 'YOUR_OMDB_API_KEY' # your api key here (never commit a real key)
#. 
#. plots = []
#. releasedDates = []
#. keys = []
#. directors = []
#. genres = []
#. awards = []
#. posters = []
#. ratings = []  # NOTE(review): if this cell is re-enabled, this rebinds the `ratings` DataFrame loaded earlier
#. movieType = []
#. 
#. for index, row in aggregated_data3_imdbid_notna.iterrows():
#.     data_URL = 'http://www.omdbapi.com/?i='+row['IMDB_ID']+'&apikey='+apiKey
#.     response = requests.get(data_URL,params=params).json()
#.     if response['Response']=='True':
#.         keys.append(row['IMDB_ID'])
#.         plots.append(response['Plot'])
#.         releasedDates.append(response['Released'])    
#.         directors.append(response['Director'])   
#.         genres.append(response['Genre'])    
#.         awards.append(response['Awards'])
#.         posters.append(response['Poster'])
#.         ratings.append(response['imdbRating'])
#.         movieType.append(response['Type'])
#.         
#. requested = pd.DataFrame({'ids' : keys, 'plot': plots, 'released_dates' : releasedDates, 'directors' : directors, 'genres' : genres, 'awards' : awards,
#.                           'posters' : posters, 'ratings' : ratings, 'movieType' : movieType})
#. import pickle 
#. requested.to_pickle('requested.pkl')

In [23]:
# Load the cached responses that the (commented-out) request cell writes to 'requested.pkl'.
# NOTE: unpickling can execute arbitrary code, so only load pickle files you created yourself.
read_requested = pd.read_pickle('requested.pkl')

In [24]:
# ids that were sent to the API but are absent from the cached responses
ids_without_plot = set(aggregated_data3_imdbid_notna.IMDB_ID) - set(read_requested.ids)
print('There are {} ({}) imdb ids have no plot information out of aggregated_data3_imdbid_notna dataset.'.format(len(ids_without_plot), ids_without_plot))

There are 4 ({'tt1829487', 'tt1829483', 'tt13891322', 'tt6349394'}) imdb ids have no plot information out of aggregated_data3_imdbid_notna dataset.


In [25]:
read_requested.shape

(781, 9)

In [26]:
# the requested 'directors' column holds names, so call it 'directors_name'
read_requested = read_requested.rename(columns={'directors': 'directors_name'})

In [27]:
# attach the requested plot / release date / director name / awards columns by IMDb id
requested_columns = read_requested[['ids', 'plot', 'released_dates', 'directors_name', 'awards']]
aggregated_data3_imdbid_notna = (
    aggregated_data3_imdbid_notna
    .merge(requested_columns, how='left', left_on='IMDB_ID', right_on='ids')
    .drop_duplicates()
)

In [28]:
# combine the divided data back together
# `DataFrame.append` was removed in pandas 2.0; `pd.concat` is the supported equivalent
# and keeps the same row order (rows with an id first) and a fresh 0..n-1 index.
aggregated_final = pd.concat(
    [aggregated_data3_imdbid_notna, aggregated_data3_imdbid_isna],
    ignore_index=True,
)

In [29]:
# the join-key columns left behind by the merges are no longer needed
merge_key_columns = ['tconst_x', 'tconst_y', 'tconst', 'ids']
aggregated_final = aggregated_final.drop(columns=merge_key_columns)

In [30]:
# replace \N as nan
# `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
directors_dict_clean = {'\\N': np.nan}
# Assign the result back to the column: an inplace `replace` on `frame.column`
# acts on an intermediate Series and does not update the frame under pandas Copy-on-Write.
aggregated_final['directors'] = aggregated_final['directors'].replace(directors_dict_clean)
aggregated_final['startYear'] = aggregated_final['startYear'].replace(directors_dict_clean)

In [31]:
# replace the 'N/A' placeholder in the requested director names with nan
# (`np.nan` instead of the `np.NaN` alias, which was removed in NumPy 2.0)
directors_dict_clean_2 = {'N/A': np.nan}
# Assign the result back to the column: an inplace `replace` on `frame.column`
# acts on an intermediate Series and does not update the frame under pandas Copy-on-Write.
aggregated_final['directors_name'] = aggregated_final['directors_name'].replace(directors_dict_clean_2)

In [33]:
aggregated_final.shape

(858, 15)

In [34]:
# build a lookup from person id (nconst) to primary name
df_director = names.loc[:, ['nconst', 'primaryName']].set_index('nconst')
dict_director = df_director['primaryName'].to_dict()

In [35]:
# split the data in two so the id -> name dictionary is applied only where needed:
# rows that have director ids but no director name yet, and every other row
needs_name_lookup = aggregated_final.directors_name.isna() & aggregated_final.directors.notna()
aggregated_final_directors_notna_name_isna = aggregated_final.loc[needs_name_lookup]
rows_needing_lookup = aggregated_final_directors_notna_name_isna.index
aggregated_final_directors_rest = aggregated_final[~aggregated_final.index.isin(rows_needing_lookup)]

In [61]:
#demo = aggregated_final_directors_notna_name_isna[['directors', 'directors_name']].reset_index(drop=True)

In [36]:
def replace_directorIDs_with_names(list_, dictionary):
    """
    Replace every id in ``list_`` that is a key of ``dictionary`` with its name.

    Each element is looked up as a whole key. The earlier implementation scanned
    the entire dictionary for every element and matched by substring, so an id
    such as 'nm1000000' also matched inside 'nm10000001' and the result depended
    on the dictionary's iteration order.

    Parameters
    ----------
    list_ : list of str
        Ids to translate. The list is updated in place.
    dictionary : dict
        Mapping from id to name.

    Returns
    -------
    list of str
        The same ``list_`` object. Elements with no entry in ``dictionary``, or
        whose entry is not a string (e.g. a missing name stored as NaN), are
        left unchanged.
    """
    for index, director_id in enumerate(list_):
        name = dictionary.get(director_id)
        # only substitute real names; keep the id when the name is missing
        if isinstance(name, str):
            list_[index] = name
    return list_

In [37]:
# one list of director ids per row (the ids are stored as a comma-separated string)
director_list = [
    id_string.split(',')
    for id_string in aggregated_final_directors_notna_name_isna.directors.to_list()
]

# translate every list of ids into the matching list of names
directors_names = [
    replace_directorIDs_with_names(id_group, dict_director)
    for id_group in director_list
]

In [38]:
# store the looked-up names as one comma-separated string per row
joined_director_names = [', '.join(name_group) for name_group in directors_names]
aggregated_final_directors_notna_name_isna['directors_name'] = joined_director_names

In [42]:
aggregated_final_directors_notna_name_isna.shape

(27, 15)

In [43]:
aggregated_final_directors_rest.shape

(831, 15)

In [44]:
# combine the divided data back together
# `DataFrame.append` was removed in pandas 2.0; `pd.concat` is the supported equivalent
# and keeps the same row order (looked-up rows first) and a fresh 0..n-1 index.
aggregated_ = pd.concat(
    [aggregated_final_directors_notna_name_isna, aggregated_final_directors_rest],
    ignore_index=True,
)

In [45]:
aggregated_.shape

(858, 15)

In [46]:
# Persist the combined frame as a pickle file in the working directory.
aggregated_.to_pickle('movies_final_aggregated.pkl')

Source: https://www.kaggle.com/stephanerappeneau/350-000-movies-from-themoviedborg

In [38]:
#movies = pd.read_csv('data/AllMoviesDetailsCleaned.csv', encoding='utf-8-sig', sep=';', engine='python', parse_dates=['release_date'])

In [30]:
#movies.head()