In [1]:
import pandas as pd
import numpy as np
basics_url = 'https://datasets.imdbws.com/title.basics.tsv.gz'
title_akas_url = 'https://datasets.imdbws.com/title.akas.tsv.gz'
title_ratings_url = 'https://datasets.imdbws.com/title.ratings.tsv.gz'

basics = pd.read_csv(basics_url, sep='\t', low_memory=False)
title_aka = pd.read_csv(title_akas_url, sep='\t', low_memory=False)
title_ratings = pd.read_csv(title_ratings_url, sep='\t', low_memory=False)

In [2]:
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"


In [3]:
basics.shape

(9823612, 9)

In [4]:
title_aka.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,1,Карменсіта,UA,\N,imdbDisplay,\N,0
1,tt0000001,2,Carmencita,DE,\N,\N,literal title,0
2,tt0000001,3,Carmencita - spanyol tánc,HU,\N,imdbDisplay,\N,0
3,tt0000001,4,Καρμενσίτα,GR,\N,imdbDisplay,\N,0
4,tt0000001,5,Карменсита,RU,\N,imdbDisplay,\N,0


In [5]:
title_aka.shape

(35771906, 8)

In [6]:
title_ratings.shape

(1307213, 3)

In [7]:
basics = basics.replace({'\\N':np.nan})
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,,1,"Comedy,Short"


In [8]:
title_aka = title_aka.replace({'\\N':np.nan})
title_aka.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,1,Карменсіта,UA,,imdbDisplay,,0
1,tt0000001,2,Carmencita,DE,,,literal title,0
2,tt0000001,3,Carmencita - spanyol tánc,HU,,imdbDisplay,,0
3,tt0000001,4,Καρμενσίτα,GR,,imdbDisplay,,0
4,tt0000001,5,Карменсита,RU,,imdbDisplay,,0


In [9]:
title_ratings = title_ratings.replace({'\\N':np.nan})
title_ratings.head()

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1966
1,tt0000002,5.8,264
2,tt0000003,6.5,1811
3,tt0000004,5.6,178
4,tt0000005,6.2,2609


## Preprocessing
1. Exclude any movie with missing values for genre or runtime
2. Include only full-length movies (titleType = "movie").
3. Include only fictional movies (not from documentary genre)
4. Include only movies that were released 2000 - 2021 (include 2000 and 2021)
5. Include only movies that were released in the United States

1. Exclude any movie with missing values for genre or runtime

In [10]:
#missing genre
basics.dropna(subset = ['genres'], inplace = True)
basics.shape

(9380732, 9)

In [11]:
#missing runtime
basics.dropna(subset = ['runtimeMinutes'], inplace = True)
basics.shape

(2818840, 9)

2. Include only full-length movies (titleType = "movie").

In [12]:
basics = basics.loc[basics['titleType'] == 'movie']
basics.shape

(381794, 9)

In [13]:
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
8,tt0000009,movie,Miss Jerry,Miss Jerry,0,1894,,45,Romance
144,tt0000147,movie,The Corbett-Fitzsimmons Fight,The Corbett-Fitzsimmons Fight,0,1897,,100,"Documentary,News,Sport"
570,tt0000574,movie,The Story of the Kelly Gang,The Story of the Kelly Gang,0,1906,,70,"Action,Adventure,Biography"
587,tt0000591,movie,The Prodigal Son,L'enfant prodigue,0,1907,,90,Drama
672,tt0000679,movie,The Fairylogue and Radio-Plays,The Fairylogue and Radio-Plays,0,1908,,120,"Adventure,Fantasy"


3. Include only fictional movies (not from documentary genre)

In [14]:
# Exclude movies that are included in the documentary category.
is_documentary = basics['genres'].str.contains('documentary',case=False)
bascs = basics[~is_documentary]
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
8,tt0000009,movie,Miss Jerry,Miss Jerry,0,1894,,45,Romance
144,tt0000147,movie,The Corbett-Fitzsimmons Fight,The Corbett-Fitzsimmons Fight,0,1897,,100,"Documentary,News,Sport"
570,tt0000574,movie,The Story of the Kelly Gang,The Story of the Kelly Gang,0,1906,,70,"Action,Adventure,Biography"
587,tt0000591,movie,The Prodigal Son,L'enfant prodigue,0,1907,,90,Drama
672,tt0000679,movie,The Fairylogue and Radio-Plays,The Fairylogue and Radio-Plays,0,1908,,120,"Adventure,Fantasy"


4. Include only movies that were released 2000 - 2021 (include 2000 and 2021)

In [15]:
basics["startYear"] = pd.to_numeric(basics["startYear"], downcast="float")
print(basics.dtypes)

tconst             object
titleType          object
primaryTitle       object
originalTitle      object
isAdult            object
startYear         float32
endYear            object
runtimeMinutes     object
genres             object
dtype: object


In [16]:
#turn start year to a float before filtering out 
basics = basics[(basics['startYear']>=2000)&(basics['startYear']<2022)]
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
13082,tt0013274,movie,Istoriya grazhdanskoy voyny,Istoriya grazhdanskoy voyny,0,2021.0,,94,Documentary
34803,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001.0,,118,"Comedy,Fantasy,Romance"
61116,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020.0,,70,Drama
67669,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018.0,,122,Drama
77964,tt0079644,movie,November 1828,November 1828,0,2001.0,,140,"Drama,War"


5. Include only movies that were released in the United States

In [17]:
aka_us_filter = title_aka['region']== 'US'
title_aka[aka_us_filter]

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
5,tt0000001,6,Carmencita,US,,imdbDisplay,,0
14,tt0000002,7,The Clown and His Dogs,US,,,literal English title,0
33,tt0000005,10,Blacksmith Scene,US,,imdbDisplay,,0
36,tt0000005,1,Blacksmithing Scene,US,,alternative,,0
41,tt0000005,6,Blacksmith Scene #1,US,,alternative,,0
...,...,...,...,...,...,...,...,...
35771432,tt9916560,1,March of Dimes Presents: Once Upon a Dime,US,,imdbDisplay,,0
35771502,tt9916620,1,The Copeland Case,US,,imdbDisplay,,0
35771591,tt9916702,1,Loving London: The Playground,US,,,,0
35771634,tt9916756,1,Pretty Pretty Black Girl,US,,imdbDisplay,,0


In [18]:
title_aka = title_aka[aka_us_filter]
title_aka.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
5,tt0000001,6,Carmencita,US,,imdbDisplay,,0
14,tt0000002,7,The Clown and His Dogs,US,,,literal English title,0
33,tt0000005,10,Blacksmith Scene,US,,imdbDisplay,,0
36,tt0000005,1,Blacksmithing Scene,US,,alternative,,0
41,tt0000005,6,Blacksmith Scene #1,US,,alternative,,0


In [19]:
basics['tconst'].isin(title_aka['titleId'])

13082      False
34803       True
61116       True
67669       True
77964      False
           ...  
9823378     True
9823462    False
9823503    False
9823530    False
9823562    False
Name: tconst, Length: 210409, dtype: bool

In [20]:
title_ratings['tconst'].isin(title_aka['titleId'])

0           True
1           True
2          False
3          False
4           True
           ...  
1307208    False
1307209    False
1307210    False
1307211    False
1307212    False
Name: tconst, Length: 1307213, dtype: bool

In [21]:
## Save current dataframe to file.
basics.to_csv("Data/title_basics.csv.gz",compression='gzip',index=False)

In [22]:
# Open saved file and preview again
basics = pd.read_csv("Data/title_basics.csv.gz", low_memory = False)
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0013274,movie,Istoriya grazhdanskoy voyny,Istoriya grazhdanskoy voyny,0,2021.0,,94,Documentary
1,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001.0,,118,"Comedy,Fantasy,Romance"
2,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020.0,,70,Drama
3,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018.0,,122,Drama
4,tt0079644,movie,November 1828,November 1828,0,2001.0,,140,"Drama,War"


In [23]:
title_aka.to_csv("Data/title_aka.csv.gz",compression='gzip',index=False)

In [24]:
title_aka = pd.read_csv("Data/title_aka.csv.gz", low_memory = False)
title_aka.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,6,Carmencita,US,,imdbDisplay,,0.0
1,tt0000002,7,The Clown and His Dogs,US,,,literal English title,0.0
2,tt0000005,10,Blacksmith Scene,US,,imdbDisplay,,0.0
3,tt0000005,1,Blacksmithing Scene,US,,alternative,,0.0
4,tt0000005,6,Blacksmith Scene #1,US,,alternative,,0.0


In [25]:
title_ratings.to_csv("Data/title_ratings.csv.gz",compression='gzip',index=False)

In [26]:
title_ratings = pd.read_csv("Data/title_ratings.csv.gz", low_memory = False)
title_ratings.head()

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1966
1,tt0000002,5.8,264
2,tt0000003,6.5,1811
3,tt0000004,5.6,178
4,tt0000005,6.2,2609
