In [1]:
import pandas as pd
import numpy as np
import datetime as dt

In [2]:
title_basic_url = "https://datasets.imdbws.com/title.basics.tsv.gz"
title_aka_url = "https://datasets.imdbws.com/title.akas.tsv.gz"
title_rating_url = "https://datasets.imdbws.com/title.ratings.tsv.gz"

In [3]:
basics = pd.read_csv(title_basic_url, sep='\t', low_memory=False)

In [4]:
basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"
...,...,...,...,...,...,...,...,...,...
8899038,tt9916848,tvEpisode,Episode #3.17,Episode #3.17,0,2010,\N,\N,"Action,Drama,Family"
8899039,tt9916850,tvEpisode,Episode #3.19,Episode #3.19,0,2010,\N,\N,"Action,Drama,Family"
8899040,tt9916852,tvEpisode,Episode #3.20,Episode #3.20,0,2010,\N,\N,"Action,Drama,Family"
8899041,tt9916856,short,The Wind,The Wind,0,2015,\N,27,Short


In [5]:
basics_df = basics.replace({'\\N':np.nan})

In [6]:
basics_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8899043 entries, 0 to 8899042
Data columns (total 9 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   tconst          object
 1   titleType       object
 2   primaryTitle    object
 3   originalTitle   object
 4   isAdult         object
 5   startYear       object
 6   endYear         object
 7   runtimeMinutes  object
 8   genres          object
dtypes: object(9)
memory usage: 611.0+ MB


In [7]:
basics_df = basics_df.dropna(subset = ['runtimeMinutes', 'genres'])

In [8]:
basics_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2330718 entries, 0 to 8899042
Data columns (total 9 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   tconst          object
 1   titleType       object
 2   primaryTitle    object
 3   originalTitle   object
 4   isAdult         object
 5   startYear       object
 6   endYear         object
 7   runtimeMinutes  object
 8   genres          object
dtypes: object(9)
memory usage: 177.8+ MB


In [9]:
movie_filter = basics_df['titleType'] == 'movie'


In [10]:
basics_df = basics_df[movie_filter]

In [11]:
basics_df.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
570,tt0000574,movie,The Story of the Kelly Gang,The Story of the Kelly Gang,0,1906,,70,"Action,Adventure,Biography"
587,tt0000591,movie,The Prodigal Son,L'enfant prodigue,0,1907,,90,Drama
672,tt0000679,movie,The Fairylogue and Radio-Plays,The Fairylogue and Radio-Plays,0,1908,,120,"Adventure,Fantasy"
1172,tt0001184,movie,Don Juan de Serrallonga,Don Juan de Serrallonga,0,1910,,58,"Adventure,Drama"
1273,tt0001285,movie,The Life of Moses,The Life of Moses,0,1909,,50,"Biography,Drama,Family"


In [12]:
import os
os.makedirs('Data/', exist_ok=True)
os.listdir("Data/")

['title_basics.csv.gz']

In [13]:
basics_df.to_csv("Data/title_basics.csv.gz", compression='gzip',index=False)

In [14]:
del basics_df

In [15]:
aka = pd.read_csv(title_aka_url, sep='\t', low_memory=False)

In [16]:
aka

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,1,Карменсіта,UA,\N,imdbDisplay,\N,0
1,tt0000001,2,Carmencita,DE,\N,\N,literal title,0
2,tt0000001,3,Carmencita - spanyol tánc,HU,\N,imdbDisplay,\N,0
3,tt0000001,4,Καρμενσίτα,GR,\N,imdbDisplay,\N,0
4,tt0000001,5,Карменсита,RU,\N,imdbDisplay,\N,0
...,...,...,...,...,...,...,...,...
31866469,tt9916852,5,Episódio #3.20,PT,pt,\N,\N,0
31866470,tt9916852,6,Episodio #3.20,IT,it,\N,\N,0
31866471,tt9916852,7,एपिसोड #3.20,IN,hi,\N,\N,0
31866472,tt9916856,1,The Wind,DE,\N,imdbDisplay,\N,0


In [17]:
aka_df = aka.replace({'\\N':np.nan})

In [18]:
is_US = aka_df['region'] == 'US'

In [19]:
aka_df = aka_df[is_US]

In [20]:
aka_df.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
5,tt0000001,6,Carmencita,US,,imdbDisplay,,0
14,tt0000002,7,The Clown and His Dogs,US,,,literal English title,0
33,tt0000005,10,Blacksmith Scene,US,,imdbDisplay,,0
36,tt0000005,1,Blacksmithing Scene,US,,alternative,,0
41,tt0000005,6,Blacksmith Scene #1,US,,alternative,,0


In [21]:
aka_df.to_csv("Data/aka.csv.gz", compression='gzip',index=False)

In [22]:
del aka_df

In [23]:
rating = pd.read_csv(title_rating_url, sep='\t', low_memory=False)

In [24]:
rating_df = rating.replace({'\\N':np.nan})

In [25]:
rating.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1241511 entries, 0 to 1241510
Data columns (total 3 columns):
 #   Column         Non-Null Count    Dtype  
---  ------         --------------    -----  
 0   tconst         1241511 non-null  object 
 1   averageRating  1241511 non-null  float64
 2   numVotes       1241511 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 28.4+ MB


In [26]:
rating_df.to_csv("Data/rating.csv.gz", compression='gzip',index=False)