In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# importing the data from 'title.basics.tsv.gz'
basics_url="https://datasets.imdbws.com/title.basics.tsv.gz"
# importing the data from 'title.akas.tsv.gz'
akas_url = 'https://datasets.imdbws.com/title.akas.tsv.gz'
# importing the data from 'title.ratings.tsv.gz'
ratings_url = 'https://datasets.imdbws.com/title.ratings.tsv.gz'

In [3]:
basics = pd.read_csv(basics_url, sep='\t', low_memory=False)

In [4]:
ratings = pd.read_csv(ratings_url, sep='\t', low_memory=False)

In [5]:
akas = pd.read_csv(akas_url, sep='\t', low_memory=False)

## Handling all the \N placeholders into a np.nan

In [6]:
basics.head()
basics.replace({'\\N':np.nan})

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,,1,"Comedy,Short"
...,...,...,...,...,...,...,...,...,...
9842459,tt9916848,tvEpisode,Episode #3.17,Episode #3.17,0,2010,,,"Action,Drama,Family"
9842460,tt9916850,tvEpisode,Episode #3.19,Episode #3.19,0,2010,,,"Action,Drama,Family"
9842461,tt9916852,tvEpisode,Episode #3.20,Episode #3.20,0,2010,,,"Action,Drama,Family"
9842462,tt9916856,short,The Wind,The Wind,0,2015,,27,Short


In [7]:
akas.head()
akas.replace({'\\N':np.nan})

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,1,Карменсіта,UA,,imdbDisplay,,0
1,tt0000001,2,Carmencita,DE,,,literal title,0
2,tt0000001,3,Carmencita - spanyol tánc,HU,,imdbDisplay,,0
3,tt0000001,4,Καρμενσίτα,GR,,imdbDisplay,,0
4,tt0000001,5,Карменсита,RU,,imdbDisplay,,0
...,...,...,...,...,...,...,...,...
35863468,tt9916852,5,Episódio #3.20,PT,pt,,,0
35863469,tt9916852,6,Episodio #3.20,IT,it,,,0
35863470,tt9916852,7,एपिसोड #3.20,IN,hi,,,0
35863471,tt9916856,1,The Wind,DE,,imdbDisplay,,0


In [8]:
ratings.head()
ratings.replace({'\\N':np.nan})

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1971
1,tt0000002,5.8,263
2,tt0000003,6.5,1816
3,tt0000004,5.6,178
4,tt0000005,6.2,2612
...,...,...,...
1309290,tt9916730,8.3,10
1309291,tt9916766,7.0,21
1309292,tt9916778,7.2,36
1309293,tt9916840,8.8,6


## Filtering/Cleaning the datasets

In [15]:
# Drop rows with null values in the 'runtimeMinutes' column
basics = basics.dropna(subset=['runtimeMinutes'])
basics = basics.dropna(subset=['genres'])
basics.isnull().sum()

tconst             0
titleType          0
primaryTitle      11
originalTitle     11
isAdult            0
startYear          0
endYear            0
runtimeMinutes     0
genres             0
dtype: int64

In [45]:
# keeping only full-length movies 
imdb_titles = basics[(basics['titleType'] == 'movie')]
imdb_titles

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
8,tt0000009,movie,Miss Jerry,Miss Jerry,0,1894,\N,45,Romance
144,tt0000147,movie,The Corbett-Fitzsimmons Fight,The Corbett-Fitzsimmons Fight,0,1897,\N,100,"Documentary,News,Sport"
498,tt0000502,movie,Bohemios,Bohemios,0,1905,\N,100,\N
570,tt0000574,movie,The Story of the Kelly Gang,The Story of the Kelly Gang,0,1906,\N,70,"Action,Adventure,Biography"
587,tt0000591,movie,The Prodigal Son,L'enfant prodigue,0,1907,\N,90,Drama
...,...,...,...,...,...,...,...,...,...
9842355,tt9916622,movie,Rodolpho Teóphilo - O Legado de um Pioneiro,Rodolpho Teóphilo - O Legado de um Pioneiro,0,2015,\N,57,Documentary
9842382,tt9916680,movie,De la ilusión al desconcierto: cine colombiano...,De la ilusión al desconcierto: cine colombiano...,0,2007,\N,100,Documentary
9842394,tt9916706,movie,Dankyavar Danka,Dankyavar Danka,0,2013,\N,\N,Comedy
9842404,tt9916730,movie,6 Gunn,6 Gunn,0,2017,\N,116,\N


In [46]:
# checking the data types of the data
imdb_titles.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 644672 entries, 8 to 9842414
Data columns (total 9 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   tconst          644672 non-null  object
 1   titleType       644672 non-null  object
 2   primaryTitle    644672 non-null  object
 3   originalTitle   644672 non-null  object
 4   isAdult         644672 non-null  object
 5   startYear       644672 non-null  object
 6   endYear         644672 non-null  object
 7   runtimeMinutes  644672 non-null  object
 8   genres          644672 non-null  object
dtypes: object(9)
memory usage: 49.2+ MB


In [47]:
imdb_titles = imdb_titles.replace({'\\N':np.nan})

In [43]:
imdb_titles.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
8,tt0000009,movie,Miss Jerry,Miss Jerry,0,1894,,45,Romance
144,tt0000147,movie,The Corbett-Fitzsimmons Fight,The Corbett-Fitzsimmons Fight,0,1897,,100,"Documentary,News,Sport"
498,tt0000502,movie,Bohemios,Bohemios,0,1905,,100,
570,tt0000574,movie,The Story of the Kelly Gang,The Story of the Kelly Gang,0,1906,,70,"Action,Adventure,Biography"
587,tt0000591,movie,The Prodigal Son,L'enfant prodigue,0,1907,,90,Drama


In [48]:
imdb_titles['startYear'] = imdb_titles['startYear'].astype(float)
imdb_titles.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 644672 entries, 8 to 9842414
Data columns (total 9 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   tconst          644672 non-null  object 
 1   titleType       644672 non-null  object 
 2   primaryTitle    644672 non-null  object 
 3   originalTitle   644672 non-null  object 
 4   isAdult         644672 non-null  object 
 5   startYear       554049 non-null  float64
 6   endYear         0 non-null       float64
 7   runtimeMinutes  406477 non-null  object 
 8   genres          572685 non-null  object 
dtypes: float64(2), object(7)
memory usage: 49.2+ MB


In [60]:
imdb_titles[(imdb_titles['startYear'] >= 2000) & (imdb_titles['startYear'] <= 2021)]

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
11636,tt0011801,movie,Tötet nicht mehr,Tötet nicht mehr,0,2019.0,,,"Action,Crime"
13082,tt0013274,movie,Istoriya grazhdanskoy voyny,Istoriya grazhdanskoy voyny,0,2021.0,,94,Documentary
15178,tt0015414,movie,La tierra de los toros,La tierra de los toros,0,2000.0,,60,
34803,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001.0,,118,"Comedy,Fantasy,Romance"
61116,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020.0,,70,Drama
...,...,...,...,...,...,...,...,...,...
9842355,tt9916622,movie,Rodolpho Teóphilo - O Legado de um Pioneiro,Rodolpho Teóphilo - O Legado de um Pioneiro,0,2015.0,,57,Documentary
9842382,tt9916680,movie,De la ilusión al desconcierto: cine colombiano...,De la ilusión al desconcierto: cine colombiano...,0,2007.0,,100,Documentary
9842394,tt9916706,movie,Dankyavar Danka,Dankyavar Danka,0,2013.0,,,Comedy
9842404,tt9916730,movie,6 Gunn,6 Gunn,0,2017.0,,116,


In [61]:
# Exclude movies that are included in the documentary category.
is_documentary = imdb_titles['genres'].str.contains('documentary',case=False, na=False)
imdb_titles = imdb_titles[~is_documentary]

In [62]:
imdb_titles

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
8,tt0000009,movie,Miss Jerry,Miss Jerry,0,1894.0,,45,Romance
498,tt0000502,movie,Bohemios,Bohemios,0,1905.0,,100,
570,tt0000574,movie,The Story of the Kelly Gang,The Story of the Kelly Gang,0,1906.0,,70,"Action,Adventure,Biography"
587,tt0000591,movie,The Prodigal Son,L'enfant prodigue,0,1907.0,,90,Drama
610,tt0000615,movie,Robbery Under Arms,Robbery Under Arms,0,1907.0,,,Drama
...,...,...,...,...,...,...,...,...,...
9842262,tt9916428,movie,The Secret of China,Hong xing zhao yao Zhong guo,0,2019.0,,,"Adventure,History,War"
9842314,tt9916538,movie,Kuambil Lagi Hatiku,Kuambil Lagi Hatiku,0,2019.0,,123,Drama
9842354,tt9916620,movie,The Copeland Case,The Copeland Case,0,,,,Drama
9842394,tt9916706,movie,Dankyavar Danka,Dankyavar Danka,0,2013.0,,,Comedy


In [63]:
# Filter the DataFrame to include only movies that were released in the United States
USA_movies = akas[(akas['region'] == 'US')]
USA_movies

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
5,tt0000001,6,Carmencita,US,\N,imdbDisplay,\N,0
14,tt0000002,7,The Clown and His Dogs,US,\N,\N,literal English title,0
33,tt0000005,10,Blacksmith Scene,US,\N,imdbDisplay,\N,0
36,tt0000005,1,Blacksmithing Scene,US,\N,alternative,\N,0
41,tt0000005,6,Blacksmith Scene #1,US,\N,alternative,\N,0
...,...,...,...,...,...,...,...,...
35862999,tt9916560,1,March of Dimes Presents: Once Upon a Dime,US,\N,imdbDisplay,\N,0
35863069,tt9916620,1,The Copeland Case,US,\N,imdbDisplay,\N,0
35863158,tt9916702,1,Loving London: The Playground,US,\N,\N,\N,0
35863201,tt9916756,1,Pretty Pretty Black Girl,US,\N,imdbDisplay,\N,0


In [64]:
# filtering the basics table to only include US made movies. 
keepers =imdb_titles['tconst'].isin(USA_movies['titleId'])

In [65]:
# new dataframe with the filtered features
imdb_titles = imdb_titles[keepers]
imdb_titles

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
8,tt0000009,movie,Miss Jerry,Miss Jerry,0,1894.0,,45,Romance
570,tt0000574,movie,The Story of the Kelly Gang,The Story of the Kelly Gang,0,1906.0,,70,"Action,Adventure,Biography"
587,tt0000591,movie,The Prodigal Son,L'enfant prodigue,0,1907.0,,90,Drama
625,tt0000630,movie,Hamlet,Amleto,0,1908.0,,,Drama
672,tt0000679,movie,The Fairylogue and Radio-Plays,The Fairylogue and Radio-Plays,0,1908.0,,120,"Adventure,Fantasy"
...,...,...,...,...,...,...,...,...,...
9842145,tt9916188,movie,Minotaur,Minotaur,0,,,,Thriller
9842146,tt9916190,movie,Safeguard,Safeguard,0,2020.0,,95,"Action,Adventure,Thriller"
9842230,tt9916362,movie,Coven,Akelarre,0,2020.0,,92,"Drama,History"
9842262,tt9916428,movie,The Secret of China,Hong xing zhao yao Zhong guo,0,2019.0,,,"Adventure,History,War"


## Filtering/Cleaning the 'title.ratings.tsv.gz' dataset

In [66]:
# filtering the ratings table to only include US movies 
ratings_us =ratings['tconst'].isin(USA_movies['titleId'])
ratings_us

ratings = ratings[ratings_us]
ratings

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1971
1,tt0000002,5.8,263
4,tt0000005,6.2,2612
5,tt0000006,5.1,181
6,tt0000007,5.4,818
...,...,...,...
1309256,tt9916200,8.1,229
1309257,tt9916204,8.1,262
1309264,tt9916348,8.3,18
1309265,tt9916362,6.4,5322


In [67]:
imdb_titles.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 245272 entries, 8 to 9842354
Data columns (total 9 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   tconst          245272 non-null  object 
 1   titleType       245272 non-null  object 
 2   primaryTitle    245272 non-null  object 
 3   originalTitle   245272 non-null  object 
 4   isAdult         245272 non-null  object 
 5   startYear       191466 non-null  float64
 6   endYear         0 non-null       float64
 7   runtimeMinutes  164950 non-null  object 
 8   genres          234626 non-null  object 
dtypes: float64(2), object(7)
memory usage: 18.7+ MB


In [68]:
USA_movies.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1435910 entries, 5 to 35863217
Data columns (total 8 columns):
 #   Column           Non-Null Count    Dtype 
---  ------           --------------    ----- 
 0   titleId          1435910 non-null  object
 1   ordering         1435910 non-null  int64 
 2   title            1435910 non-null  object
 3   region           1435910 non-null  object
 4   language         1435910 non-null  object
 5   types            1435910 non-null  object
 6   attributes       1435910 non-null  object
 7   isOriginalTitle  1435910 non-null  object
dtypes: int64(1), object(7)
memory usage: 98.6+ MB


In [69]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 498013 entries, 0 to 1309270
Data columns (total 3 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   tconst         498013 non-null  object 
 1   averageRating  498013 non-null  float64
 2   numVotes       498013 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 15.2+ MB


## Saving our files

In [None]:
imdb_titles.head()

In [70]:
## Save current dataframe to file.
basics.to_csv("Data/title_basics.csv.gz",compression='gzip',index=False)

# Open saved file and preview again
basics = pd.read_csv("Data/title_basics.csv.gz", low_memory = False)
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"


In [None]:
## Save current dataframe to file.
akas.to_csv("Data/title_akas.csv.gz",compression='gzip',index=False)

# Open saved file and preview again
akas = pd.read_csv("Data/title_akas.csv.gz", low_memory = False)
akas.head()

In [None]:
## Save current dataframe to file.
ratings.to_csv("Data/title_ratings.csv.gz",compression='gzip',index=False)

# Open saved file and preview again
ratings = pd.read_csv("Data/title_ratings.csv.gz", low_memory = False)
ratings.head()