In [1]:
# make new folder with os
import os
os.makedirs('Data/',exist_ok=True)

In [2]:
# confirm folder was created and files added successfully
os.listdir("Data/")

['IMDB Movie Dataset Info.docx',
 'title-akas-us-only.csv',
 'title.basics.tsv.gz',
 'title.ratings.tsv.gz']

PREPROCESSING

1) Start by loading the title-akas-us-only.csv file. All non-us movies have been removed from this file.

In [24]:
import pandas as pd
import numpy as np

# Load the title-akas-us-only.csv file
akas = pd.read_csv("Data/title-akas-us-only.csv", low_memory=False)
akas

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,6,Carmencita,US,\N,imdbDisplay,\N,0
1,tt0000002,7,The Clown and His Dogs,US,\N,\N,literal English title,0
2,tt0000005,10,Blacksmith Scene,US,\N,imdbDisplay,\N,0
3,tt0000005,1,Blacksmithing Scene,US,\N,alternative,\N,0
4,tt0000005,6,Blacksmith Scene #1,US,\N,alternative,\N,0
...,...,...,...,...,...,...,...,...
1452559,tt9916560,1,March of Dimes Presents: Once Upon a Dime,US,\N,imdbDisplay,\N,0
1452560,tt9916620,1,The Copeland Case,US,\N,imdbDisplay,\N,0
1452561,tt9916702,1,Loving London: The Playground,US,\N,\N,\N,0
1452562,tt9916756,1,Pretty Pretty Black Girl,US,\N,imdbDisplay,\N,0


2) Load the title basics file.

In [25]:
# Load the title.basics.tsv.gz file
basics = pd.read_csv("Data/title.basics.tsv.gz", sep='\t', low_memory=False)
basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"
...,...,...,...,...,...,...,...,...,...
10017006,tt9916848,tvEpisode,Episode #3.17,Episode #3.17,0,2009,\N,\N,"Action,Drama,Family"
10017007,tt9916850,tvEpisode,Episode #3.19,Episode #3.19,0,2010,\N,\N,"Action,Drama,Family"
10017008,tt9916852,tvEpisode,Episode #3.20,Episode #3.20,0,2010,\N,\N,"Action,Drama,Family"
10017009,tt9916856,short,The Wind,The Wind,0,2015,\N,27,Short


3) Remove non-US movies from title basics

In [35]:
# Filter the basics table down to only include the US by using the filter akas dataframe
filter_us_titles = basics['tconst'].isin(akas['titleId'])
basics_us = basics[filter_us_titles]
basics_us

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"
5,tt0000006,short,Chinese Opium Den,Chinese Opium Den,0,1894,\N,1,Short
6,tt0000007,short,Corbett and Courtney Before the Kinetograph,Corbett and Courtney Before the Kinetograph,0,1894,\N,1,"Short,Sport"
...,...,...,...,...,...,...,...,...,...
10016872,tt9916560,tvMovie,March of Dimes Presents: Once Upon a Dime,March of Dimes Presents: Once Upon a Dime,0,1963,\N,58,Family
10016901,tt9916620,movie,The Copeland Case,The Copeland Case,0,\N,\N,\N,Drama
10016939,tt9916702,short,Loving London: The Playground,Loving London: The Playground,0,\N,\N,\N,"Drama,Short"
10016962,tt9916756,short,Pretty Pretty Black Girl,Pretty Pretty Black Girl,0,2019,\N,\N,Short


4) Perform the remaining preprocessing steps for title basics.

In [38]:
# Replace '\N' with np.nan
basics_us = basics_us.replace({'\\N':np.nan})

In [39]:
# Drop rows with null values in the runtimeMinutes or genres columns.
basics_us.dropna(subset=['runtimeMinutes', 'genres'], inplace=False)

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,,5,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,,1,"Comedy,Short"
5,tt0000006,short,Chinese Opium Den,Chinese Opium Den,0,1894,,1,Short
6,tt0000007,short,Corbett and Courtney Before the Kinetograph,Corbett and Courtney Before the Kinetograph,0,1894,,1,"Short,Sport"
...,...,...,...,...,...,...,...,...,...
10016704,tt9916214,short,Drown the Clown,Drown the Clown,0,2019,,8,"Drama,Short"
10016724,tt9916254,video,Big Tit Cream Pie 32,Big Tit Cream Pie 32,1,2015,,226,Adult
10016770,tt9916348,video,Ancient World Exposed,Ancient World Exposed,0,2019,,67,History
10016777,tt9916362,movie,Coven,Akelarre,0,2020,,92,"Drama,History"


In [40]:
# Filter to keep only full-length movies
basics_us = basics_us[basics_us['titleType'] == 'movie']

In [41]:
# Convert startYear to a float dtype
basics_us['startYear'] = basics_us['startYear'].astype(float)

In [42]:
# Filter to keep movies with startYears that are >=2000 and <=2022
basics_us = basics_us[(basics_us['startYear'] >= 2000) & (basics_us['startYear'] <= 2022)]

In [48]:
# Eliminate movies that include "Documentary" in genre
filter_documentaries = basics_us['genres'].str.contains('Documentary', na=False)
basics_us = basics_us[~filter_documentaries]

5) Display a final preview of your filtered title basics and save to a csv

In [49]:
# Display info and head of the filtered title basics DataFrame
basics_us.info()
basics_us.head()

<class 'pandas.core.frame.DataFrame'>
Index: 104636 entries, 34802 to 10016809
Data columns (total 9 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   tconst          104636 non-null  object 
 1   titleType       104636 non-null  object 
 2   primaryTitle    104634 non-null  object 
 3   originalTitle   104634 non-null  object 
 4   isAdult         104636 non-null  object 
 5   startYear       104636 non-null  float64
 6   endYear         0 non-null       object 
 7   runtimeMinutes  88546 non-null   object 
 8   genres          101501 non-null  object 
dtypes: float64(1), object(8)
memory usage: 8.0+ MB


Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
34802,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001.0,,118,"Comedy,Fantasy,Romance"
61114,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020.0,,70,Drama
67666,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018.0,,122,Drama
86793,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005.0,,100,"Comedy,Horror,Sci-Fi"
93930,tt0096056,movie,Crime and Punishment,Crime and Punishment,0,2002.0,,126,Drama


In [56]:
# Save the data to a CSV file in the Data folder
basics_us.to_csv('Data/title_basics_filtered.csv', index=False)

6) Load and filter the title ratings file

In [57]:
# Load the title ratings data
ratings = pd.read_csv("Data/title.ratings.tsv.gz", sep='\t', low_memory=False)

In [58]:
# Keep only movies that are included in your final title basics dataframe
filter_basics = ratings['tconst'].isin(basics['tconst'])
ratings_filtered = ratings[filter_basics]

In [59]:
# check for '\N' value in averageRating column
ratings_filtered['averageRating'].unique()

array([ 5.7,  5.8,  6.5,  5.5,  6.2,  5.1,  5.4,  5.3,  6.9,  7.4,  7.1,
        5.9,  4.6,  4.8,  4.1,  3.8,  5.6,  5.2,  5. ,  4.9,  4.4,  4.3,
        4.2,  3.3,  4. ,  6.7,  3.9,  3.2,  3.4,  3.1,  4.7,  2.8,  6.3,
        3. ,  3.5,  6.4,  4.5,  3.7,  6. ,  3.6,  2.9,  6.1,  6.6,  2.6,
        7.3,  7.6,  6.8,  8.2,  7.5,  8.5,  7.2,  7.8,  8.4,  8.1,  2.4,
        7. ,  7.7,  8. ,  8.9,  8.6,  9. ,  9.1,  2.7,  8.7,  2.2,  1.4,
        2. ,  7.9,  1. ,  8.3,  1.6,  2.3,  1.1,  2.1,  1.5,  9.2,  1.8,
        1.3,  1.9,  2.5,  1.7,  9.4,  1.2,  9.7,  8.8,  9.3,  9.6,  9.5,
        9.9,  9.8, 10. ])

In [60]:
# check for '\N' value in numVotes column
ratings_filtered['numVotes'].unique()

array([  1988,    265,   1849, ...,  57971,  14210, 103412], dtype=int64)

7) Display a final preview of your filtered title ratings and save to a csv

In [61]:
# Display info and head of the filtered title basics DataFrame
ratings_filtered.info()
ratings_filtered.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1331492 entries, 0 to 1331491
Data columns (total 3 columns):
 #   Column         Non-Null Count    Dtype  
---  ------         --------------    -----  
 0   tconst         1331492 non-null  object 
 1   averageRating  1331492 non-null  float64
 2   numVotes       1331492 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 30.5+ MB


Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1988
1,tt0000002,5.8,265
2,tt0000003,6.5,1849
3,tt0000004,5.5,178
4,tt0000005,6.2,2632


In [62]:
# Save the data to a CSV file in the Data folder
ratings_filtered.to_csv('Data/title_ratings_filtered.csv', index=False)