### Rose Tovar
### Movie Analysis Project
### RoseATovar@gmail.com

In [1]:
import pandas as pd
import numpy as np

In [2]:
#urls
basics_url = 'https://datasets.imdbws.com/title.basics.tsv.gz'
ratings_url = 'https://datasets.imdbws.com/title.ratings.tsv.gz'
akas_url = 'https://datasets.imdbws.com/title.akas.tsv.gz'

In [3]:
# creating dataframes
basics = pd.read_csv(basics_url, sep='\t', low_memory=False)
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"


In [4]:
basics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9223790 entries, 0 to 9223789
Data columns (total 9 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   tconst          object
 1   titleType       object
 2   primaryTitle    object
 3   originalTitle   object
 4   isAdult         object
 5   startYear       object
 6   endYear         object
 7   runtimeMinutes  object
 8   genres          object
dtypes: object(9)
memory usage: 633.3+ MB


In [5]:
ratings = pd.read_csv(ratings_url, sep='\t', low_memory=False)
ratings.head()

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1910
1,tt0000002,5.8,256
2,tt0000003,6.5,1713
3,tt0000004,5.6,169
4,tt0000005,6.2,2527


In [6]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1260427 entries, 0 to 1260426
Data columns (total 3 columns):
 #   Column         Non-Null Count    Dtype  
---  ------         --------------    -----  
 0   tconst         1260427 non-null  object 
 1   averageRating  1260427 non-null  float64
 2   numVotes       1260427 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 28.8+ MB


In [7]:
akas = pd.read_csv(akas_url, sep='\t', low_memory=False)
akas.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,1,Карменсіта,UA,\N,imdbDisplay,\N,0
1,tt0000001,2,Carmencita,DE,\N,\N,literal title,0
2,tt0000001,3,Carmencita - spanyol tánc,HU,\N,imdbDisplay,\N,0
3,tt0000001,4,Καρμενσίτα,GR,\N,imdbDisplay,\N,0
4,tt0000001,5,Карменсита,RU,\N,imdbDisplay,\N,0


In [8]:
akas.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33183512 entries, 0 to 33183511
Data columns (total 8 columns):
 #   Column           Dtype 
---  ------           ----- 
 0   titleId          object
 1   ordering         int64 
 2   title            object
 3   region           object
 4   language         object
 5   types            object
 6   attributes       object
 7   isOriginalTitle  object
dtypes: int64(1), object(7)
memory usage: 2.0+ GB


### Specifications

In [9]:
# excluding movies with missing values for genre or runtime

basics.isna().sum()

tconst             0
titleType          0
primaryTitle      11
originalTitle     11
isAdult            0
startYear          0
endYear            0
runtimeMinutes     0
genres            10
dtype: int64

In [10]:
# removing rows with missing genres and runtimeminutes
basics = basics.dropna(subset=['genres','runtimeMinutes'])
basics.isna().sum()

tconst             0
titleType          0
primaryTitle      11
originalTitle     11
isAdult            0
startYear          0
endYear            0
runtimeMinutes     0
genres             0
dtype: int64

In [11]:
# including only full movies
basics = basics.loc[basics['titleType'] == 'movie']
basics['titleType'].value_counts()

movie    621118
Name: titleType, dtype: int64

In [12]:
# Removing Documnetary 
basics = basics[basics["genres"].str.contains("Documentary")==False]
basics['genres'].value_counts()

Drama                         115283
\N                             71740
Comedy                         44269
Horror                         15461
Action                         14189
                               ...  
Adult,Drama,Reality-TV             1
Action,Adult,Sci-Fi                1
Fantasy,Reality-TV,Romance         1
Action,Biography,Family            1
News,Sport,Talk-Show               1
Name: genres, Length: 1234, dtype: int64

In [13]:
# removing years before 2000 and after 2021
basics['startYear'].value_counts()

\N      75710
2018    12987
2017    12617
2019    12602
2021    12466
        ...  
1906        8
2028        7
2029        4
1903        2
1894        1
Name: startYear, Length: 128, dtype: int64

In [14]:
# Converting \\N to nan
basics['startYear'].replace('\\N', np.nan, inplace=True)

In [15]:
#dropping nana
basics.dropna(subset=['startYear'],inplace=True)
basics['startYear'].isna().sum()

0

In [16]:
#converting to int
basics['startYear'] = basics['startYear'].astype(int)

In [17]:
#only having movies from 2000 to 2021
basics = basics.loc[(basics['startYear'] >= 2000) & (basics['startYear'] <= 2021)]

In [18]:
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
11636,tt0011801,movie,Tötet nicht mehr,Tötet nicht mehr,0,2019,\N,\N,"Action,Crime"
15174,tt0015414,movie,La tierra de los toros,La tierra de los toros,0,2000,\N,60,\N
34792,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,\N,118,"Comedy,Fantasy,Romance"
61093,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El Tango del Viudo y Su Espejo Deformante,0,2020,\N,70,Drama
67639,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,\N,122,Drama


In [19]:
#Initial Cleaning of Akas
akas = akas.loc[akas['titleId'].isin(basics['tconst']) == True]
akas.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
38960,tt0011801,1,Tötet nicht mehr,DE,\N,imdbDisplay,\N,0
38961,tt0011801,2,Misericordia - Tötet nicht mehr!,DE,\N,\N,censored version,0
38962,tt0011801,3,Tötet nicht mehr,\N,\N,original,\N,1
55871,tt0015414,1,La tierra de los toros,ES,\N,imdbDisplay,\N,0
55872,tt0015414,2,La tierra de los toros,\N,\N,original,\N,1


In [20]:
# Only movies from United States
akas = akas.loc[akas['region'] == 'US']

In [21]:
# Updating other dataframes to contain only the same values

basics = basics.loc[basics['tconst'].isin(akas['titleId']) == True]

In [22]:
ratings = ratings.loc[ratings['tconst'].isin(basics['tconst']) == True]

### Looking at Each Dataframe

In [23]:
ratings.head()

Unnamed: 0,tconst,averageRating,numVotes
17874,tt0035423,6.4,84671
40648,tt0062336,6.4,161
46481,tt0069049,6.7,7358
63439,tt0088751,5.2,325
69746,tt0096056,5.6,821


In [24]:
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
34792,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,\N,118,"Comedy,Fantasy,Romance"
61093,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El Tango del Viudo y Su Espejo Deformante,0,2020,\N,70,Drama
67639,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,\N,122,Drama
86770,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005,\N,100,"Comedy,Horror,Sci-Fi"
93906,tt0096056,movie,Crime and Punishment,Crime and Punishment,0,2002,\N,126,Drama


In [25]:
akas.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
195472,tt0035423,35,Kate and Leopold,US,\N,\N,alternative spelling,0
195474,tt0035423,37,Kate & Leopold,US,\N,imdbDisplay,\N,0
458989,tt0062336,5,The Tango of the Widower and Its Distorting Mi...,US,\N,imdbDisplay,\N,0
527057,tt0069049,3,The Other Side of the Wind,US,\N,imdbDisplay,\N,0
707729,tt0088751,1,Attack of the B-Movie Monster,US,\N,working,\N,0


In [26]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 70871 entries, 17874 to 1260409
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   tconst         70871 non-null  object 
 1   averageRating  70871 non-null  float64
 2   numVotes       70871 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 2.2+ MB


In [27]:
basics.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 94858 entries, 34792 to 9223587
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   tconst          94858 non-null  object
 1   titleType       94858 non-null  object
 2   primaryTitle    94858 non-null  object
 3   originalTitle   94858 non-null  object
 4   isAdult         94858 non-null  object
 5   startYear       94858 non-null  int64 
 6   endYear         94858 non-null  object
 7   runtimeMinutes  94858 non-null  object
 8   genres          94858 non-null  object
dtypes: int64(1), object(8)
memory usage: 7.2+ MB


In [28]:
akas.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 104657 entries, 195472 to 33182848
Data columns (total 8 columns):
 #   Column           Non-Null Count   Dtype 
---  ------           --------------   ----- 
 0   titleId          104657 non-null  object
 1   ordering         104657 non-null  int64 
 2   title            104657 non-null  object
 3   region           104657 non-null  object
 4   language         104657 non-null  object
 5   types            104657 non-null  object
 6   attributes       104657 non-null  object
 7   isOriginalTitle  104657 non-null  object
dtypes: int64(1), object(7)
memory usage: 7.2+ MB


In [29]:
#saving dataframe
basics.to_csv('./Data/basics.csv')
ratings.to_csv('./Data/ratings.csv')
akas.to_csv('./Data/akas.csv')