Imports and Downloading Data

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

In [4]:
## Making "Data" Folder to save IMDB files
os.makedirs("Data/",exist_ok=True)
os.listdir("Data/")

['title_basics_cleaned.csv.gz',
 'title_ratings_cleaned.csv.gz',
 'tmdb_api_results_2000.json',
 'final_tmdb_data_2000.csv.gz',
 'tmdb_api_results_2001.json',
 'final_tmdb_data_2001.csv.gz',
 'title_akas_cleaned.csv.gz']

In [5]:
## title basics 
url_title_basics = 'https://datasets.imdbws.com/title.basics.tsv.gz'
basics = pd.read_csv(url_title_basics, sep='\t',low_memory=True)
basics

  basics = pd.read_csv(url_title_basics, sep='\t',low_memory=True)


Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"
...,...,...,...,...,...,...,...,...,...
9455644,tt9916848,tvEpisode,Episode #3.17,Episode #3.17,0,2010,\N,\N,"Action,Drama,Family"
9455645,tt9916850,tvEpisode,Episode #3.19,Episode #3.19,0,2010,\N,\N,"Action,Drama,Family"
9455646,tt9916852,tvEpisode,Episode #3.20,Episode #3.20,0,2010,\N,\N,"Action,Drama,Family"
9455647,tt9916856,short,The Wind,The Wind,0,2015,\N,27,Short


In [6]:
basics.replace({'\\N':np.nan},inplace=True)

In [7]:
basics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9455649 entries, 0 to 9455648
Data columns (total 9 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   tconst          object
 1   titleType       object
 2   primaryTitle    object
 3   originalTitle   object
 4   isAdult         object
 5   startYear       object
 6   endYear         object
 7   runtimeMinutes  object
 8   genres          object
dtypes: object(9)
memory usage: 649.3+ MB


In [8]:
## title ratings
url_title_ratings ="https://datasets.imdbws.com/title.ratings.tsv.gz"
ratings = pd.read_csv(url_title_ratings,sep='\t',low_memory=True)
ratings

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1928
1,tt0000002,5.8,261
2,tt0000003,6.5,1744
3,tt0000004,5.6,176
4,tt0000005,6.2,2556
...,...,...,...
1258351,tt9916690,7.4,6
1258352,tt9916720,5.4,287
1258353,tt9916730,8.0,8
1258354,tt9916766,6.7,21


In [9]:
## title AKAs
url_title_akas ="https://datasets.imdbws.com/title.akas.tsv.gz"
akas = pd.read_csv(url_title_akas,sep='\t',low_memory=True)
akas

  akas = pd.read_csv(url_title_akas,sep='\t',low_memory=True)


Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,1,Карменсіта,UA,\N,imdbDisplay,\N,0
1,tt0000001,2,Carmencita,DE,\N,\N,literal title,0
2,tt0000001,3,Carmencita - spanyol tánc,HU,\N,imdbDisplay,\N,0
3,tt0000001,4,Καρμενσίτα,GR,\N,imdbDisplay,\N,0
4,tt0000001,5,Карменсита,RU,\N,imdbDisplay,\N,0
...,...,...,...,...,...,...,...,...
34227293,tt9916852,5,Episódio #3.20,PT,pt,\N,\N,0
34227294,tt9916852,6,Episodio #3.20,IT,it,\N,\N,0
34227295,tt9916852,7,एपिसोड #3.20,IN,hi,\N,\N,0
34227296,tt9916856,1,The Wind,DE,\N,imdbDisplay,\N,0


In [10]:
## check nulls
basics.isna().sum()

tconst                  0
titleType               0
primaryTitle           11
originalTitle          11
isAdult                 1
startYear         1266376
endYear           9355644
runtimeMinutes    6752370
genres             431724
dtype: int64

In [11]:
## Replace "\N" with np.nan
basics.replace({'\\N':np.nan},inplace=True)
basics.isna().sum()

tconst                  0
titleType               0
primaryTitle           11
originalTitle          11
isAdult                 1
startYear         1266376
endYear           9355644
runtimeMinutes    6752370
genres             431724
dtype: int64

In [12]:
# Replace "\N" with np.nan (if any)
ratings.replace({'\\N':np.nan},inplace=True)
ratings.isna().sum()

tconst           0
averageRating    0
numVotes         0
dtype: int64

In [13]:
# Replace "\N" with np.nan (if any)
akas.replace({'\\N':np.nan},inplace=True)
akas.isna().sum()

titleId                   0
ordering                  0
title                     5
region              1872941
language            6431023
types              28809481
attributes         33975793
isOriginalTitle        2109
dtype: int64

In [14]:
## Eliminate movies that are null for runtimeMinute, genres
basics = basics.dropna(subset=['runtimeMinutes','genres'])
basics.isna().sum()

tconst                  0
titleType               0
primaryTitle            1
originalTitle           1
isAdult                 0
startYear          116899
endYear           2581232
runtimeMinutes          0
genres                  0
dtype: int64

In [15]:
basics.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2628341 entries, 0 to 9455648
Data columns (total 9 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   tconst          object
 1   titleType       object
 2   primaryTitle    object
 3   originalTitle   object
 4   isAdult         object
 5   startYear       object
 6   endYear         object
 7   runtimeMinutes  object
 8   genres          object
dtypes: object(9)
memory usage: 200.5+ MB


In [16]:
## keep only titleType==Movie
basics = basics.loc[ basics['titleType']=='movie']
basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
8,tt0000009,movie,Miss Jerry,Miss Jerry,0,1894,,45,Romance
144,tt0000147,movie,The Corbett-Fitzsimmons Fight,The Corbett-Fitzsimmons Fight,0,1897,,100,"Documentary,News,Sport"
570,tt0000574,movie,The Story of the Kelly Gang,The Story of the Kelly Gang,0,1906,,70,"Action,Adventure,Biography"
587,tt0000591,movie,The Prodigal Son,L'enfant prodigue,0,1907,,90,Drama
672,tt0000679,movie,The Fairylogue and Radio-Plays,The Fairylogue and Radio-Plays,0,1908,,120,"Adventure,Fantasy"
...,...,...,...,...,...,...,...,...,...
9455414,tt9916362,movie,Coven,Akelarre,0,2020,,92,"Drama,History"
9455498,tt9916538,movie,Kuambil Lagi Hatiku,Kuambil Lagi Hatiku,0,2019,,123,Drama
9455539,tt9916622,movie,Rodolpho Teóphilo - O Legado de um Pioneiro,Rodolpho Teóphilo - O Legado de um Pioneiro,0,2015,,57,Documentary
9455566,tt9916680,movie,De la ilusión al desconcierto: cine colombiano...,De la ilusión al desconcierto: cine colombiano...,0,2007,,100,Documentary


In [17]:
basics.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 373848 entries, 8 to 9455599
Data columns (total 9 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   tconst          373848 non-null  object
 1   titleType       373848 non-null  object
 2   primaryTitle    373848 non-null  object
 3   originalTitle   373848 non-null  object
 4   isAdult         373848 non-null  object
 5   startYear       367849 non-null  object
 6   endYear         0 non-null       object
 7   runtimeMinutes  373848 non-null  object
 8   genres          373848 non-null  object
dtypes: object(9)
memory usage: 28.5+ MB


In [18]:
### Convert startyear to numeric for slicing
## convert numeric features
basics['startYear'] = basics['startYear'].astype(float)

In [19]:
## keep startYear 2000-2022
basics = basics[(basics['startYear']>=2000)&(basics['startYear']<2022)]
basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
13082,tt0013274,movie,Istoriya grazhdanskoy voyny,Istoriya grazhdanskoy voyny,0,2021.0,,133,Documentary
34804,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001.0,,118,"Comedy,Fantasy,Romance"
61117,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El Tango del Viudo y Su Espejo Deformante,0,2020.0,,70,Drama
66337,tt0067683,movie,Workers '71: Nothing About Us Without Us,Robotnicy 1971 - Nic o nas bez nas,0,2006.0,,47,Documentary
67670,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018.0,,122,Drama
...,...,...,...,...,...,...,...,...,...
9455414,tt9916362,movie,Coven,Akelarre,0,2020.0,,92,"Drama,History"
9455498,tt9916538,movie,Kuambil Lagi Hatiku,Kuambil Lagi Hatiku,0,2019.0,,123,Drama
9455539,tt9916622,movie,Rodolpho Teóphilo - O Legado de um Pioneiro,Rodolpho Teóphilo - O Legado de um Pioneiro,0,2015.0,,57,Documentary
9455566,tt9916680,movie,De la ilusión al desconcierto: cine colombiano...,De la ilusión al desconcierto: cine colombiano...,0,2007.0,,100,Documentary


In [20]:
## Eliminate movies that include  "Documentary" in genre 
is_documentary = basics['genres'].str.contains('documentary',case=False)
basics = basics[~is_documentary]
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
34804,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001.0,,118,"Comedy,Fantasy,Romance"
61117,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El Tango del Viudo y Su Espejo Deformante,0,2020.0,,70,Drama
67670,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018.0,,122,Drama
77965,tt0079644,movie,November 1828,November 1828,0,2001.0,,140,"Drama,War"
86802,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005.0,,100,"Comedy,Horror,Sci-Fi"


In [21]:
#Removing Non-US From Title Basics
keepers = basics['tconst'].isin(akas['titleId'])
basics = basics[keepers]
basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
34804,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001.0,,118,"Comedy,Fantasy,Romance"
61117,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El Tango del Viudo y Su Espejo Deformante,0,2020.0,,70,Drama
67670,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018.0,,122,Drama
77965,tt0079644,movie,November 1828,November 1828,0,2001.0,,140,"Drama,War"
86802,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005.0,,100,"Comedy,Horror,Sci-Fi"
...,...,...,...,...,...,...,...,...,...
9455321,tt9916170,movie,The Rehearsal,O Ensaio,0,2019.0,,51,Drama
9455330,tt9916190,movie,Safeguard,Safeguard,0,2020.0,,95,"Action,Adventure,Thriller"
9455369,tt9916270,movie,Il talento del calabrone,Il talento del calabrone,0,2020.0,,84,Thriller
9455414,tt9916362,movie,Coven,Akelarre,0,2020.0,,92,"Drama,History"


In [22]:
## The AKAs file has the information on country where it released and language
akas = akas[(akas['region'] == 'US')]
akas

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
5,tt0000001,6,Carmencita,US,,imdbDisplay,,0
14,tt0000002,7,The Clown and His Dogs,US,,,literal English title,0
33,tt0000005,10,Blacksmith Scene,US,,imdbDisplay,,0
36,tt0000005,1,Blacksmithing Scene,US,,alternative,,0
41,tt0000005,6,Blacksmith Scene #1,US,,alternative,,0
...,...,...,...,...,...,...,...,...
34226969,tt9916702,1,Loving London: The Playground,US,,,,0
34227006,tt9916720,10,The Demonic Nun,US,,tv,,0
34227008,tt9916720,12,The Nun 2,US,,imdbDisplay,,0
34227026,tt9916756,1,Pretty Pretty Black Girl,US,,imdbDisplay,,0


In [23]:
keepers = ratings['tconst'].isin(akas['titleId'])
ratings = ratings[keepers]
ratings

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1928
1,tt0000002,5.8,261
4,tt0000005,6.2,2556
5,tt0000006,5.1,175
6,tt0000007,5.4,797
...,...,...,...
1258330,tt9916204,8.2,251
1258336,tt9916348,8.5,17
1258337,tt9916362,6.4,5062
1258341,tt9916428,3.8,14


Saving Final Files

In [24]:
## Saving and immediately loading (to verify)
basics.to_csv('Data/title_basics_cleaned.csv.gz',compression='gzip',index=False)
basics = pd.read_csv('Data/title_basics_cleaned.csv.gz')
basics.info()
basics.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 136812 entries, 0 to 136811
Data columns (total 9 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   tconst          136812 non-null  object 
 1   titleType       136812 non-null  object 
 2   primaryTitle    136812 non-null  object 
 3   originalTitle   136812 non-null  object 
 4   isAdult         136812 non-null  int64  
 5   startYear       136812 non-null  float64
 6   endYear         0 non-null       float64
 7   runtimeMinutes  136812 non-null  int64  
 8   genres          136812 non-null  object 
dtypes: float64(2), int64(2), object(5)
memory usage: 9.4+ MB


Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001.0,,118,"Comedy,Fantasy,Romance"
1,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El Tango del Viudo y Su Espejo Deformante,0,2020.0,,70,Drama
2,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018.0,,122,Drama
3,tt0079644,movie,November 1828,November 1828,0,2001.0,,140,"Drama,War"
4,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005.0,,100,"Comedy,Horror,Sci-Fi"


In [25]:
ratings.to_csv('Data/title_ratings_cleaned.csv.gz',compression='gzip',index=False)
ratings = pd.read_csv('Data/title_ratings_cleaned.csv.gz')
ratings.info()
ratings.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 479153 entries, 0 to 479152
Data columns (total 3 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   tconst         479153 non-null  object 
 1   averageRating  479153 non-null  float64
 2   numVotes       479153 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 11.0+ MB


Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1928
1,tt0000002,5.8,261
2,tt0000005,6.2,2556
3,tt0000006,5.1,175
4,tt0000007,5.4,797


In [26]:
akas.to_csv('Data/title_akas_cleaned.csv.gz',compression='gzip',index=False)
akas = pd.read_csv('Data/title_akas_cleaned.csv.gz')
akas.info()
akas.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1394680 entries, 0 to 1394679
Data columns (total 8 columns):
 #   Column           Non-Null Count    Dtype  
---  ------           --------------    -----  
 0   titleId          1394680 non-null  object 
 1   ordering         1394680 non-null  int64  
 2   title            1394680 non-null  object 
 3   region           1394680 non-null  object 
 4   language         3744 non-null     object 
 5   types            967716 non-null   object 
 6   attributes       45543 non-null    object 
 7   isOriginalTitle  1393335 non-null  float64
dtypes: float64(1), int64(1), object(6)
memory usage: 85.1+ MB


Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,6,Carmencita,US,,imdbDisplay,,0.0
1,tt0000002,7,The Clown and His Dogs,US,,,literal English title,0.0
2,tt0000005,10,Blacksmith Scene,US,,imdbDisplay,,0.0
3,tt0000005,1,Blacksmithing Scene,US,,alternative,,0.0
4,tt0000005,6,Blacksmith Scene #1,US,,alternative,,0.0


In [27]:
# optional 
os.listdir("Data/")

['title_basics_cleaned.csv.gz',
 'title_ratings_cleaned.csv.gz',
 'tmdb_api_results_2000.json',
 'final_tmdb_data_2000.csv.gz',
 'tmdb_api_results_2001.json',
 'final_tmdb_data_2001.csv.gz',
 'title_akas_cleaned.csv.gz']