### Rose Tovar
### Movie Analysis Project
### RoseATovar@gmail.com

In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
#urls
basics_url = 'https://datasets.imdbws.com/title.basics.tsv.gz'
ratings_url = 'https://datasets.imdbws.com/title.ratings.tsv.gz'
akas_url = 'https://datasets.imdbws.com/title.akas.tsv.gz'

In [3]:
# creating dataframes
basics = pd.read_csv(basics_url, sep='\t', low_memory=False)
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"


In [4]:
basics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9223790 entries, 0 to 9223789
Data columns (total 9 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   tconst          object
 1   titleType       object
 2   primaryTitle    object
 3   originalTitle   object
 4   isAdult         object
 5   startYear       object
 6   endYear         object
 7   runtimeMinutes  object
 8   genres          object
dtypes: object(9)
memory usage: 633.3+ MB


In [5]:
ratings = pd.read_csv(ratings_url, sep='\t', low_memory=False)
ratings.head()

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1910
1,tt0000002,5.8,256
2,tt0000003,6.5,1713
3,tt0000004,5.6,169
4,tt0000005,6.2,2527


In [6]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1260427 entries, 0 to 1260426
Data columns (total 3 columns):
 #   Column         Non-Null Count    Dtype  
---  ------         --------------    -----  
 0   tconst         1260427 non-null  object 
 1   averageRating  1260427 non-null  float64
 2   numVotes       1260427 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 28.8+ MB


In [7]:
akas = pd.read_csv(akas_url, sep='\t', low_memory=False)
akas.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,1,Карменсіта,UA,\N,imdbDisplay,\N,0
1,tt0000001,2,Carmencita,DE,\N,\N,literal title,0
2,tt0000001,3,Carmencita - spanyol tánc,HU,\N,imdbDisplay,\N,0
3,tt0000001,4,Καρμενσίτα,GR,\N,imdbDisplay,\N,0
4,tt0000001,5,Карменсита,RU,\N,imdbDisplay,\N,0


In [8]:
akas.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33183512 entries, 0 to 33183511
Data columns (total 8 columns):
 #   Column           Dtype 
---  ------           ----- 
 0   titleId          object
 1   ordering         int64 
 2   title            object
 3   region           object
 4   language         object
 5   types            object
 6   attributes       object
 7   isOriginalTitle  object
dtypes: int64(1), object(7)
memory usage: 2.0+ GB


### Specifications

In [9]:
# excluding movies with missing values for genre or runtime

basics.isna().sum()

tconst             0
titleType          0
primaryTitle      11
originalTitle     11
isAdult            0
startYear          0
endYear            0
runtimeMinutes     0
genres            10
dtype: int64

In [10]:
# removing rows with missing genres and runtimeminutes
basics = basics.dropna(subset=['genres','runtimeMinutes'])
basics.isna().sum()

tconst             0
titleType          0
primaryTitle      11
originalTitle     11
isAdult            0
startYear          0
endYear            0
runtimeMinutes     0
genres             0
dtype: int64

In [11]:
basics.replace('\\N', np.nan, inplace=True)

In [12]:
# including only full movies
basics = basics.loc[basics['titleType'] == 'movie']
basics['titleType'].value_counts()

movie    621118
Name: titleType, dtype: int64

In [13]:
# Removing Documnetary 
basics = basics[basics["genres"].str.contains("Documentary", case=False) == False]
basics['genres'].value_counts()

Drama                         115283
Comedy                         44269
Horror                         15461
Action                         14189
Thriller                       13910
                               ...  
Adult,Drama,Reality-TV             1
Action,Adult,Sci-Fi                1
Fantasy,Reality-TV,Romance         1
Action,Biography,Family            1
News,Sport,Talk-Show               1
Name: genres, Length: 1233, dtype: int64

In [14]:
# removing years before 2000 and after 2021
basics['startYear'].value_counts()

2018    12214
2019    11826
2017    11762
2021    11666
2016    11331
        ...  
1905        3
1906        3
1908        3
1903        2
1894        1
Name: startYear, Length: 127, dtype: int64

In [15]:
#dropping nana
basics.dropna(subset=['startYear'],inplace=True)
basics['startYear'].isna().sum()

0

In [16]:
#converting to int
basics['startYear'] = basics['startYear'].astype(int)

In [17]:
#only having movies from 2000 to 2021
basics = basics.loc[(basics['startYear'] >= 2000) & (basics['startYear'] <= 2021)]

In [18]:
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
11636,tt0011801,movie,Tötet nicht mehr,Tötet nicht mehr,0,2019,,,"Action,Crime"
34792,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,,118.0,"Comedy,Fantasy,Romance"
61093,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El Tango del Viudo y Su Espejo Deformante,0,2020,,70.0,Drama
67639,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,,122.0,Drama
77933,tt0079644,movie,November 1828,November 1828,0,2001,,140.0,"Drama,War"


In [19]:
#replace \N with np.nan in akas and ratings
akas.replace('\\N', np.nan, inplace=True)
ratings.replace('\\N', np.nan, inplace=True)

In [20]:
#Initial Cleaning of Akas
akas = akas.loc[akas['titleId'].isin(basics['tconst']) == True]
akas.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
38960,tt0011801,1,Tötet nicht mehr,DE,,imdbDisplay,,0
38961,tt0011801,2,Misericordia - Tötet nicht mehr!,DE,,,censored version,0
38962,tt0011801,3,Tötet nicht mehr,,,original,,1
195445,tt0035423,10,Kate et Léopold,FR,,imdbDisplay,,0
195446,tt0035423,11,Kate & Leopold,ES,,imdbDisplay,,0


In [21]:
# Only movies from United States
akas = akas.loc[akas['region'] == 'US']

In [22]:
# Updating other dataframes to contain only the same values

basics = basics[basics['tconst'].isin(akas['titleId'])]

In [23]:
ratings = ratings[ratings['tconst'].isin(basics['tconst'])]

### Looking at Each Dataframe

In [24]:
ratings.head()

Unnamed: 0,tconst,averageRating,numVotes
17874,tt0035423,6.4,84671
40648,tt0062336,6.4,161
46481,tt0069049,6.7,7358
63439,tt0088751,5.2,325
69746,tt0096056,5.6,821


In [25]:
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
34792,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,,118,"Comedy,Fantasy,Romance"
61093,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El Tango del Viudo y Su Espejo Deformante,0,2020,,70,Drama
67639,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,,122,Drama
86770,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005,,100,"Comedy,Horror,Sci-Fi"
93906,tt0096056,movie,Crime and Punishment,Crime and Punishment,0,2002,,126,Drama


In [26]:
akas.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
195472,tt0035423,35,Kate and Leopold,US,,,alternative spelling,0
195474,tt0035423,37,Kate & Leopold,US,,imdbDisplay,,0
458989,tt0062336,5,The Tango of the Widower and Its Distorting Mi...,US,,imdbDisplay,,0
527057,tt0069049,3,The Other Side of the Wind,US,,imdbDisplay,,0
707729,tt0088751,1,Attack of the B-Movie Monster,US,,working,,0


In [27]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 69978 entries, 17874 to 1260409
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   tconst         69978 non-null  object 
 1   averageRating  69978 non-null  float64
 2   numVotes       69978 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 2.1+ MB


In [28]:
basics.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 92040 entries, 34792 to 9223587
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   tconst          92040 non-null  object
 1   titleType       92040 non-null  object
 2   primaryTitle    92040 non-null  object
 3   originalTitle   92040 non-null  object
 4   isAdult         92040 non-null  object
 5   startYear       92040 non-null  int64 
 6   endYear         0 non-null      object
 7   runtimeMinutes  79656 non-null  object
 8   genres          92040 non-null  object
dtypes: int64(1), object(8)
memory usage: 7.0+ MB


In [29]:
akas.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 101740 entries, 195472 to 33182848
Data columns (total 8 columns):
 #   Column           Non-Null Count   Dtype 
---  ------           --------------   ----- 
 0   titleId          101740 non-null  object
 1   ordering         101740 non-null  int64 
 2   title            101740 non-null  object
 3   region           101740 non-null  object
 4   language         907 non-null     object
 5   types            94600 non-null   object
 6   attributes       4357 non-null    object
 7   isOriginalTitle  101740 non-null  object
dtypes: int64(1), object(7)
memory usage: 7.0+ MB


In [30]:
os.makedirs('Data/', exist_ok=True)
os.listdir('Data/')

[]

In [31]:
## Save current dataframe to file.
basics.to_csv("Data/title_basics.csv.gz",compression='gzip',index=False)
## Save current dataframe to file.
ratings.to_csv("Data/title_ratings.csv.gz",compression='gzip',index=False)

## Save current dataframe to file.
akas.to_csv("Data/title_akas.csv.gz",compression='gzip',index=False)

In [32]:
# Open saved file and preview again
basics = pd.read_csv("./Data/title_basics.csv.gz", low_memory = False)
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,,118.0,"Comedy,Fantasy,Romance"
1,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El Tango del Viudo y Su Espejo Deformante,0,2020,,70.0,Drama
2,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,,122.0,Drama
3,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005,,100.0,"Comedy,Horror,Sci-Fi"
4,tt0096056,movie,Crime and Punishment,Crime and Punishment,0,2002,,126.0,Drama


In [33]:
# Open saved file and preview again
ratings = pd.read_csv("./Data/title_ratings.csv.gz", low_memory = False)
ratings.head()

Unnamed: 0,tconst,averageRating,numVotes
0,tt0035423,6.4,84671
1,tt0062336,6.4,161
2,tt0069049,6.7,7358
3,tt0088751,5.2,325
4,tt0096056,5.6,821


In [34]:
# Open saved file and preview again
akas = pd.read_csv("./Data/title_akas.csv.gz", low_memory = False)
akas.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0035423,35,Kate and Leopold,US,,,alternative spelling,0
1,tt0035423,37,Kate & Leopold,US,,imdbDisplay,,0
2,tt0062336,5,The Tango of the Widower and Its Distorting Mi...,US,,imdbDisplay,,0
3,tt0069049,3,The Other Side of the Wind,US,,imdbDisplay,,0
4,tt0088751,1,Attack of the B-Movie Monster,US,,working,,0
