# IMDb Project

## Imports and Data

In [1]:
import os

import numpy as np
import pandas as pd

In [2]:
# Title basics: read the gzip-compressed CSV stored in the local Data/ folder.
basics = pd.read_csv(
    "Data/title_basics.csv.gz",
    low_memory=False,
)
basics.head()


Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000009,movie,Miss Jerry,Miss Jerry,0,1894,,45,Romance
1,tt0000574,movie,The Story of the Kelly Gang,The Story of the Kelly Gang,0,1906,,70,"Action,Adventure,Biography"
2,tt0000591,movie,The Prodigal Son,L'enfant prodigue,0,1907,,90,Drama
3,tt0000679,movie,The Fairylogue and Radio-Plays,The Fairylogue and Radio-Plays,0,1908,,120,"Adventure,Fantasy"
4,tt0001285,movie,The Life of Moses,The Life of Moses,0,1909,,50,"Biography,Drama,Family"


In [None]:
# Basics (alternative source, currently disabled)
#
# Uncommenting the two lines below would read the table from the IMDb
# dataset URL (tab-separated) instead of the local file under Data/.

# basics = pd.read_csv('https://datasets.imdbws.com/title.basics.tsv.gz', sep='\t', low_memory=False)
# basics.head()

In [3]:
# Title ratings: read the gzip-compressed CSV stored in the local Data/ folder.
ratings = pd.read_csv(
    "Data/title_ratings.csv.gz",
    low_memory=False,
)
ratings.head()

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1965
1,tt0000002,5.8,263
2,tt0000003,6.5,1808
3,tt0000004,5.6,178
4,tt0000005,6.2,2607


In [None]:
# Ratings (alternative source, currently disabled)
#
# Uncommenting the two lines below would read the table from the IMDb
# dataset URL (tab-separated) instead of the local file under Data/.

# ratings = pd.read_csv('https://datasets.imdbws.com/title.ratings.tsv.gz', sep='\t', low_memory=False)
# ratings.head()

In [4]:
# Title akas (alternate titles): read the gzip-compressed CSV stored in the local Data/ folder.
akas = pd.read_csv(
    "Data/title_akas.csv.gz",
    low_memory=False,
)
akas.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,6,Carmencita,US,,imdbDisplay,,0.0
1,tt0000002,7,The Clown and His Dogs,US,,,literal English title,0.0
2,tt0000005,10,Blacksmith Scene,US,,imdbDisplay,,0.0
3,tt0000005,1,Blacksmithing Scene,US,,alternative,,0.0
4,tt0000005,6,Blacksmith Scene #1,US,,alternative,,0.0


In [None]:
# Akas (alternative source, currently disabled)
#
# Uncommenting the two lines below would read the table from the IMDb
# dataset URL (tab-separated) instead of the local file under Data/.

# akas = pd.read_csv('https://datasets.imdbws.com/title.akas.tsv.gz', sep='\t', low_memory=False)
# akas.head()

## Filtering Data - Missing Values

In [5]:
# Convert the literal '\N' placeholder strings into real NaN values in all three tables.
basics, ratings, akas = (
    table.replace({'\\N': np.nan}) for table in (basics, ratings, akas)
)

In [6]:
# Missing-value audit ahead of excluding movies that lack a genre or runtime:
# the grand total is printed first, followed by the per-column breakdown.
print(
    f'There are {basics.isna().sum().sum()} missing values.',
    basics.isna().sum(),
    sep='\n',
)

There are 158200 missing values.
tconst                 0
titleType              0
primaryTitle           0
originalTitle          0
isAdult                0
startYear              0
endYear           158200
runtimeMinutes         0
genres                 0
dtype: int64


In [7]:
# Keep only the rows that have a genre listed.
basics = basics.dropna(subset=['genres'])

In [8]:
# Keep only the rows that have a runtime recorded.
basics = basics.dropna(subset=['runtimeMinutes'])

In [9]:
# Re-run the missing-value audit now that the genre/runtime rows were dropped:
# the grand total is printed first, followed by the per-column breakdown.
print(
    f'There are {basics.isna().sum().sum()} missing values.',
    basics.isna().sum(),
    sep='\n',
)

There are 158200 missing values.
tconst                 0
titleType              0
primaryTitle           0
originalTitle          0
isAdult                0
startYear              0
endYear           158200
runtimeMinutes         0
genres                 0
dtype: int64


## Filtering Data - Movie

In [10]:
# Restrict the table to full-length movies, i.e. rows whose titleType is "movie".
movie = basics['titleType'].eq('movie')
basics = basics.loc[movie]
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000009,movie,Miss Jerry,Miss Jerry,0,1894,,45,Romance
1,tt0000574,movie,The Story of the Kelly Gang,The Story of the Kelly Gang,0,1906,,70,"Action,Adventure,Biography"
2,tt0000591,movie,The Prodigal Son,L'enfant prodigue,0,1907,,90,Drama
3,tt0000679,movie,The Fairylogue and Radio-Plays,The Fairylogue and Radio-Plays,0,1908,,120,"Adventure,Fantasy"
4,tt0001285,movie,The Life of Moses,The Life of Moses,0,1909,,50,"Biography,Drama,Family"


## Filtering Data - Documentary

In [11]:
# Keep fictional movies only: flag every title whose genre list mentions
# "documentary" (case-insensitive), then retain the rows that are not flagged.
is_documentary = basics['genres'].str.contains('documentary', case=False)
basics = basics.loc[~is_documentary]
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000009,movie,Miss Jerry,Miss Jerry,0,1894,,45,Romance
1,tt0000574,movie,The Story of the Kelly Gang,The Story of the Kelly Gang,0,1906,,70,"Action,Adventure,Biography"
2,tt0000591,movie,The Prodigal Son,L'enfant prodigue,0,1907,,90,Drama
3,tt0000679,movie,The Fairylogue and Radio-Plays,The Fairylogue and Radio-Plays,0,1908,,120,"Adventure,Fantasy"
4,tt0001285,movie,The Life of Moses,The Life of Moses,0,1909,,50,"Biography,Drama,Family"


## Filtering Data - StartYear 2000-2022

In [13]:
# Keep only movies whose startYear falls in 2000-2022 (both ends included),
# matching the section heading.
# The integer conversion has to be stored: calling .astype('int') on its own
# returns a new Series and leaves the column untouched.
basics = basics.assign(startYear=basics['startYear'].astype('int'))
# Series.between is inclusive on both bounds by default.
years = basics['startYear'].between(2000, 2022)
basics = basics[years]
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
16161,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,,118,"Comedy,Fantasy,Romance"
28601,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020,,70,Drama
31485,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,,122,Drama
38424,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005,,100,"Comedy,Horror,Sci-Fi"
41036,tt0096056,movie,Crime and Punishment,Crime and Punishment,0,2002,,126,Drama


## Filtering Data - US Only

In [14]:
# Narrow the akas table to the rows whose region is 'US'.
usa = akas['region'].eq('US')
akas = akas.loc[usa]
akas.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,6,Carmencita,US,,imdbDisplay,,0.0
1,tt0000002,7,The Clown and His Dogs,US,,,literal English title,0.0
2,tt0000005,10,Blacksmith Scene,US,,imdbDisplay,,0.0
3,tt0000005,1,Blacksmithing Scene,US,,alternative,,0.0
4,tt0000005,6,Blacksmith Scene #1,US,,alternative,,0.0


In [15]:
# Boolean mask over basics: True where the movie's tconst also appears as a
# titleId in the akas table (filtered to US rows earlier in the notebook).
keepers = basics['tconst'].isin(values=akas['titleId'])
keepers


16161     True
28601     True
31485     True
38424     True
41036     True
          ... 
158195    True
158196    True
158197    True
158198    True
158199    True
Name: tconst, Length: 88171, dtype: bool

In [16]:
# Apply the keepers mask so basics holds only titles that have an entry in akas.
basics = basics.loc[keepers]
basics.head()


Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
16161,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,,118,"Comedy,Fantasy,Romance"
28601,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020,,70,Drama
31485,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,,122,Drama
38424,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005,,100,"Comedy,Horror,Sci-Fi"
41036,tt0096056,movie,Crime and Punishment,Crime and Punishment,0,2002,,126,Drama


In [19]:
# Boolean mask over ratings: True where the rated title's tconst appears as a
# titleId in the akas table (filtered to US rows earlier in the notebook).
keepers2 = ratings['tconst'].isin(values=akas['titleId'])
keepers2

0           True
1           True
2          False
3          False
4           True
           ...  
1305608    False
1305609    False
1305610    False
1305611    False
1305612    False
Name: tconst, Length: 1305613, dtype: bool

In [20]:
# Apply the keepers2 mask so ratings holds only titles that have an entry in akas.
ratings = ratings.loc[keepers2]
ratings.head()

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1965
1,tt0000002,5.8,263
4,tt0000005,6.2,2607
5,tt0000006,5.2,181
6,tt0000007,5.4,816


## Creating New Folder & Files

In [17]:
# Make sure the output folder exists before anything is written to it.
# exist_ok=True turns this into a no-op when the folder is already there.
# (`os` is imported with the other modules in the notebook's first cell.)
os.makedirs('Data/', exist_ok=True)

# List the folder contents to confirm it is present.
os.listdir("Data/")

['title_basics.csv.gz', 'title_akas.csv.gz', 'title_ratings.csv.gz']

In [21]:
# Write each filtered table back to Data/ as a gzip-compressed CSV, leaving out the index.
for frame, path in (
    (basics, "Data/title_basics.csv.gz"),
    (ratings, "Data/title_ratings.csv.gz"),
    (akas, "Data/title_akas.csv.gz"),
):
    frame.to_csv(path, compression='gzip', index=False)
