# Shelly-Ann Duncan
## 11/10/22
## Project 3 - Part 1

# Import necessary libraries

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Load data

In [2]:
basics_url = 'https://datasets.imdbws.com/title.basics.tsv.gz'
basics = pd.read_csv(basics_url, sep = '\t', low_memory = False)
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"


# Clean and explore data


In [3]:
# make a copy of the data
basics_ml = basics.copy()

In [4]:
# get the datatypes
basics_ml.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9365430 entries, 0 to 9365429
Data columns (total 9 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   tconst          object
 1   titleType       object
 2   primaryTitle    object
 3   originalTitle   object
 4   isAdult         object
 5   startYear       object
 6   endYear         object
 7   runtimeMinutes  object
 8   genres          object
dtypes: object(9)
memory usage: 643.1+ MB


In [5]:
# check for missing data
basics_ml.isna().sum()

tconst             0
titleType          0
primaryTitle      11
originalTitle     11
isAdult            0
startYear          0
endYear            0
runtimeMinutes     0
genres            10
dtype: int64

In [6]:
# check for duplicates 
basics_ml.duplicated().sum()

0

In [7]:
# replace \N with np.nan for missing data
basics_ml.replace({'\\N': 'np.nan'}, inplace = True)

In [23]:
# eliminate movies that are null for runtimeMinutes and genres
basics_ml.dropna(subset = ['runtimeMinutes', 'genres'], inplace = True)

In [25]:
# eliminate movies that include "Documentary" in genres
is_documentary = basics_ml['genres'].str.contains('Documentary', case = False)
basics_ml = basics_ml[~is_documentary]

In [29]:
# keep only titleType==Movie
is_movie = basics_ml['titleType'].str.contains('movie', case = False)
basics_ml = basics_ml[is_movie]

In [54]:
# convert to numeric for slicing
basics_ml['startYear'] = basics_ml['startYear'].astype(float).copy()

In [57]:
# keep startYear 2000-2022
basics_ml = basics_ml[(basics_ml['startYear'] >= 2000) & 
                      (basics_ml['startYear']<2022)]

In [50]:
# keep only US movies
keepers = basics_ml['tconst'].isin(akas_ml['titleId'])
keepers

Series([], Name: tconst, dtype: bool)

In [58]:
# filter
basics_ml = basics_ml[keepers]

In [64]:
basics_ml.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 0 entries
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   tconst          0 non-null      object 
 1   titleType       0 non-null      object 
 2   primaryTitle    0 non-null      object 
 3   originalTitle   0 non-null      object 
 4   isAdult         0 non-null      object 
 5   startYear       0 non-null      float64
 6   endYear         0 non-null      object 
 7   runtimeMinutes  0 non-null      object 
 8   genres          0 non-null      object 
dtypes: float64(1), object(8)
memory usage: 0.0+ bytes


# Load data 2

In [9]:
ratings_url = 'https://datasets.imdbws.com/title.ratings.tsv.gz'
ratings = pd.read_csv(ratings_url, sep = '\t', low_memory = False)
ratings.head()

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1922
1,tt0000002,5.8,259
2,tt0000003,6.5,1734
3,tt0000004,5.6,174
4,tt0000005,6.2,2546


# Clean and explore data

In [10]:
# make a copy of the data
ratings_ml = ratings.copy()

In [11]:
# get the datatypes
ratings_ml.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1246688 entries, 0 to 1246687
Data columns (total 3 columns):
 #   Column         Non-Null Count    Dtype  
---  ------         --------------    -----  
 0   tconst         1246688 non-null  object 
 1   averageRating  1246688 non-null  float64
 2   numVotes       1246688 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 28.5+ MB


In [12]:
# check for missing data
ratings_ml.isna().sum()

tconst           0
averageRating    0
numVotes         0
dtype: int64

In [13]:
# check for duplicates
ratings_ml.duplicated().sum()

0

In [47]:
# Keep only US movies 
keep = basics_ml['tconst'].isin(akas_ml['titleId'])
keep

Series([], Name: tconst, dtype: bool)

In [59]:
# filter
basics_ml = basics_ml[keep]

In [62]:
ratings_ml.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1246688 entries, 0 to 1246687
Data columns (total 3 columns):
 #   Column         Non-Null Count    Dtype  
---  ------         --------------    -----  
 0   tconst         1246688 non-null  object 
 1   averageRating  1246688 non-null  float64
 2   numVotes       1246688 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 28.5+ MB


# Load data 3

In [16]:
# laod data 3
akas_url = 'https://datasets.imdbws.com/title.akas.tsv.gz'
akas = pd.read_csv(akas_url, sep = '\t', low_memory = False,
                   chunksize = 100_000)
akas

<pandas.io.parsers.readers.TextFileReader at 0x28f003fbfd0>

In [17]:
# the first row # of the next chunk is stored under ._currow
akas._currow

0

In [18]:
# use the .get_chunk() method to extract the first chunk of rows
akas = akas.get_chunk()
akas

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,1,Карменсіта,UA,\N,imdbDisplay,\N,0
1,tt0000001,2,Carmencita,DE,\N,\N,literal title,0
2,tt0000001,3,Carmencita - spanyol tánc,HU,\N,imdbDisplay,\N,0
3,tt0000001,4,Καρμενσίτα,GR,\N,imdbDisplay,\N,0
4,tt0000001,5,Карменсита,RU,\N,imdbDisplay,\N,0
...,...,...,...,...,...,...,...,...
99995,tt0022309,1,Audácia,BR,\N,imdbDisplay,\N,0
99996,tt0022309,2,Trumbull & son,NO,\N,imdbDisplay,\N,0
99997,tt0022309,3,Audácia que Assombra,PT,\N,imdbDisplay,\N,0
99998,tt0022309,4,Rich Man's Folly,\N,\N,original,\N,1


In [19]:
# make a copy of the data
akas_ml = akas.copy()

In [20]:
# check the dataypes
akas_ml.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 8 columns):
 #   Column           Non-Null Count   Dtype 
---  ------           --------------   ----- 
 0   titleId          100000 non-null  object
 1   ordering         100000 non-null  int64 
 2   title            100000 non-null  object
 3   region           100000 non-null  object
 4   language         100000 non-null  object
 5   types            100000 non-null  object
 6   attributes       100000 non-null  object
 7   isOriginalTitle  100000 non-null  object
dtypes: int64(1), object(7)
memory usage: 6.1+ MB


In [21]:
# check for missing data
akas_ml.isna().sum()

titleId            0
ordering           0
title              0
region             0
language           0
types              0
attributes         0
isOriginalTitle    0
dtype: int64

In [22]:
#check for duplicates
akas_ml.duplicated().sum()

0

In [60]:
# replace \N with np.nan for missing data
akas_ml.replace({'\\N': 'np.nan'}, inplace = True)

In [65]:
akas_ml.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 8 columns):
 #   Column           Non-Null Count   Dtype 
---  ------           --------------   ----- 
 0   titleId          100000 non-null  object
 1   ordering         100000 non-null  int64 
 2   title            100000 non-null  object
 3   region           100000 non-null  object
 4   language         100000 non-null  object
 5   types            100000 non-null  object
 6   attributes       100000 non-null  object
 7   isOriginalTitle  100000 non-null  object
dtypes: int64(1), object(7)
memory usage: 6.1+ MB


#  Save all dataframes to data folder

In [66]:
# Save current dataframe to file.
basics.to_csv("Data/title_basics.csv.gz", compression = 'gzip', index = False)

In [67]:
# Save current dataframe to file.
ratings.to_csv("Data/title_ratings.csv.gz", compression = 'gzip', 
               index = False)

In [68]:
akas.to_csv("Data/title_akas.csv.gz", compression = 'gzip', index = False)