# Movies dataset
# OSEMN methodology, Step 2:
# Scrub (clean) data obtained from IMDb
For detailed description, refer to $\texttt{methodology/2.scrub/}$
## Cleanup plan
1. Drop records that violate referential integrity between tables.

## Import dependencies

In [1]:
import pandas as pd
from time import time
import os

In [3]:
data_dir = '../../data/imdb/'
tsv_dir = data_dir + 'unpacked/'
os.listdir(tsv_dir)

['title.basics.tsv',
 'title.ratings.tsv',
 'name.basics.tsv',
 'title.crew.tsv',
 'title.principals.tsv',
 'title.akas.tsv',
 'title.episode.tsv']

In [4]:
ri_dir = data_dir + 'ref_integrity/'
os.listdir(ri_dir)

['title.akas_ri.csv',
 'title.episode_ri.csv',
 'title.principals_ri.csv',
 'title.crew_ri.csv',
 'movies_public_title.basics.csv',
 'movies_public_name.basics.csv']

## Referential integrity
### $\texttt{title.basics}$ (PK: $\texttt{tconst}$) and $\texttt{title.akas}$ (FK: $\texttt{titleId}$)
#### Read tables

In [None]:
t = time()
file_name = 'movies_public_title.basics.csv'
df1 = pd.read_csv(ri_dir + file_name, delimiter=',',
                 na_values='\\N')
elapsed = time() - t
print("----- DataFrame loaded"
      "\nin {0:.2f} seconds".format(elapsed) + 
      "\nwith {0:,} rows\nand {1:,} columns"
      .format(df1.shape[0], df1.shape[1]) + 
      "\n-- Column names:\n", df1.columns)

In [None]:
t = time()
file_name = 'title.akas.tsv'
df2 = pd.read_csv(tsv_dir + file_name, delimiter='\t',
                 na_values='\\N')
elapsed = time() - t
print("----- DataFrame loaded"
      "\nin {0:.2f} seconds".format(elapsed) + 
      "\nwith {0:,} rows\nand {1:,} columns"
      .format(df2.shape[0], df2.shape[1]) + 
      "\n-- Column names:\n", df2.columns)

#### Drop records that violate referential integrity

In [None]:
pk = 'tconst'
fk = 'titleId'
mask = df2[fk].isin(df1[pk])
old_len = len(df2)
df2 = df2.drop(df2[~mask].index)
print("Values of df2 not present in the Primary Key of df1 were dropped!"
      "\n{0:,} rows out of {1:,} were removed. {2:,} rows remaining."
      .format(old_len - len(df2),
              old_len,
              len(df2)))

#### Save results to file

In [None]:
save_path = data_dir + 'ref_integrity/' + file_name[:-4] + '_ri.csv'
t = time()
df2.to_csv(save_path, index=False)
elapsed = time() - t
print("DataFrame saved to file:\n", save_path,
      "\ntook {0:.2f} seconds".format(elapsed))

### $\texttt{title.basics}$ (PK: $\texttt{tconst}$) and $\texttt{title.episode}$ (FK: $\texttt{parentTconst}$)
#### Read tables

In [None]:
t = time()
file_name = 'movies_public_title.basics.csv'
df1 = pd.read_csv(ri_dir + file_name, delimiter=',',
                 na_values='\\N')
elapsed = time() - t
print("----- DataFrame loaded"
      "\nin {0:.2f} seconds".format(elapsed) + 
      "\nwith {0:,} rows\nand {1:,} columns"
      .format(df1.shape[0], df1.shape[1]) + 
      "\n-- Column names:\n", df1.columns)

In [None]:
t = time()
file_name = 'title.episode.tsv'
df2 = pd.read_csv(tsv_dir + file_name, delimiter='\t',
                 na_values='\\N')
elapsed = time() - t
print("----- DataFrame loaded"
      "\nin {0:.2f} seconds".format(elapsed) + 
      "\nwith {0:,} rows\nand {1:,} columns"
      .format(df2.shape[0], df2.shape[1]) + 
      "\n-- Column names:\n", df2.columns)

#### Drop records that violate referential integrity

In [None]:
pk = 'tconst'
fk = 'parentTconst'
mask = df2[fk].isin(df1[pk])
old_len = len(df2)
df2 = df2.drop(df2[~mask].index)
print("Values of df2 not present in the Primary Key of df1 were dropped!"
      "\n{0:,} rows out of {1:,} were removed. {2:,} rows remaining."
      .format(old_len - len(df2),
              old_len,
              len(df2)))

#### Save results to file

In [None]:
save_path = data_dir + 'ref_integrity/' + file_name[:-4] + '_ri.csv'
t = time()
df2.to_csv(save_path, index=False)
elapsed = time() - t
print("DataFrame saved to file:\n", save_path,
      "\ntook {0:.2f} seconds".format(elapsed))

### $\texttt{title.basics}$ (PK: $\texttt{tconst}$) and $\texttt{title.crew}$ (FK: $\texttt{tconst}$)
#### Read tables

In [None]:
t = time()
file_name = 'movies_public_title.basics.csv'
df1 = pd.read_csv(ri_dir + file_name, delimiter=',',
                 na_values='\\N')
elapsed = time() - t
print("----- DataFrame loaded"
      "\nin {0:.2f} seconds".format(elapsed) + 
      "\nwith {0:,} rows\nand {1:,} columns"
      .format(df1.shape[0], df1.shape[1]) + 
      "\n-- Column names:\n", df1.columns)

In [None]:
t = time()
file_name = 'title.crew.tsv'
df2 = pd.read_csv(tsv_dir + file_name, delimiter='\t',
                 na_values='\\N')
elapsed = time() - t
print("----- DataFrame loaded"
      "\nin {0:.2f} seconds".format(elapsed) + 
      "\nwith {0:,} rows\nand {1:,} columns"
      .format(df2.shape[0], df2.shape[1]) + 
      "\n-- Column names:\n", df2.columns)

#### Drop records that violate referential integrity

In [None]:
pk = 'tconst'
fk = 'tconst'
mask = df2[fk].isin(df1[pk])
old_len = len(df2)
df2 = df2.drop(df2[~mask].index)
print("Values of df2 not present in the Primary Key of df1 were dropped!"
      "\n{0:,} rows out of {1:,} were removed. {2:,} rows remaining."
      .format(old_len - len(df2),
              old_len,
              len(df2)))

#### Save results to file

In [None]:
save_path = data_dir + 'ref_integrity/' + file_name[:-4] + '_ri.csv'
t = time()
df2.to_csv(save_path, index=False)
elapsed = time() - t
print("DataFrame saved to file:\n", save_path,
      "\ntook {0:.2f} seconds".format(elapsed))

### $\texttt{name.basics}$ (PK: $\texttt{nconst}$) and $\texttt{title.crew}$ (FKs: $\texttt{writers},~\texttt{directors}$)
#### Read tables

In [4]:
t = time()
file_name = 'movies_public_name.basics.csv'
df1 = pd.read_csv(ri_dir + file_name, delimiter=',',
                 na_values='\\N')
elapsed = time() - t
print("----- DataFrame loaded"
      "\nin {0:.2f} seconds".format(elapsed) + 
      "\nwith {0:,} rows\nand {1:,} columns"
      .format(df1.shape[0], df1.shape[1]) + 
      "\n-- Column names:\n", df1.columns)

----- DataFrame loaded
in 15.28 seconds
with 9,460,059 rows
and 6 columns
-- Column names:
 Index(['nconst', 'primaryname', 'birthyear', 'deathyear', 'primaryprofession',
       'knownfortitles'],
      dtype='object')


In [5]:
t = time()
file_name = 'title.crew.tsv'
df2 = pd.read_csv(tsv_dir + file_name, delimiter='\t',
                 na_values='\\N')
elapsed = time() - t
print("----- DataFrame loaded"
      "\nin {0:.2f} seconds".format(elapsed) + 
      "\nwith {0:,} rows\nand {1:,} columns"
      .format(df2.shape[0], df2.shape[1]) + 
      "\n-- Column names:\n", df2.columns)

----- DataFrame loaded
in 5.60 seconds
with 6,028,571 rows
and 3 columns
-- Column names:
 Index(['tconst', 'directors', 'writers'], dtype='object')


In [None]:
title_crew_writers_list = df2['writers'].dropna().str.split(',').sum()

#### Drop records that violate referential integrity

In [16]:
pk = 'tconst'
fk = 'tconst'
mask = df2[fk].isin(df1[pk])
old_len = len(df2)
df2 = df2.drop(df2[~mask].index)
print("Values of df2 not present in the Primary Key of df1 were dropped!"
      "\n{0:,} rows out of {1:,} were removed. {2:,} rows remaining."
      .format(old_len - len(df2),
              old_len,
              len(df2)))

Values of df2 not present in the Primary Key of df1 were dropped!
5,153 rows out of 6,028,571 were removed. 6,023,418 rows remaining.


#### Save results to file

In [17]:
save_path = data_dir + 'ref_integrity/' + file_name[:-4] + '_ri.csv'
t = time()
df2.to_csv(save_path, index=False)
elapsed = time() - t
print("DataFrame saved to file:\n", save_path,
      "\ntook {0:.2f} seconds".format(elapsed))

DataFrame saved to file:
 ../../data/imdb/ref_integrity/title.crew_ri.csv 
took 9.20 seconds


### $\texttt{title.basics}$ (PK: $\texttt{tconst}$) and $\texttt{title.principals}$ (FK: $\texttt{tconst}$)
#### Read tables

In [5]:
t = time()
file_name = 'movies_public_title.basics.csv'
df1 = pd.read_csv(ri_dir + file_name, delimiter=',',
                 na_values='\\N')
elapsed = time() - t
print("----- DataFrame loaded"
      "\nin {0:.2f} seconds".format(elapsed) + 
      "\nwith {0:,} rows\nand {1:,} columns"
      .format(df1.shape[0], df1.shape[1]) + 
      "\n-- Column names:\n", df1.columns)

----- DataFrame loaded
in 13.81 seconds
with 6,023,418 rows
and 9 columns
-- Column names:
 Index(['tconst', 'titletype', 'primarytitle', 'originaltitle', 'isadult',
       'startyear', 'endyear', 'runtimeminutes', 'genres'],
      dtype='object')


In [6]:
t = time()
file_name = 'title.principals.tsv'
df2 = pd.read_csv(tsv_dir + file_name, delimiter='\t',
                 na_values='\\N')
elapsed = time() - t
print("----- DataFrame loaded"
      "\nin {0:.2f} seconds".format(elapsed) + 
      "\nwith {0:,} rows\nand {1:,} columns"
      .format(df2.shape[0], df2.shape[1]) + 
      "\n-- Column names:\n", df2.columns)

----- DataFrame loaded
in 39.18 seconds
with 34,780,589 rows
and 6 columns
-- Column names:
 Index(['tconst', 'ordering', 'nconst', 'category', 'job', 'characters'], dtype='object')


#### Drop records that violate referential integrity

In [7]:
t = time()
pk = 'tconst'
fk = 'tconst'
mask = df2[fk].isin(df1[pk])
old_len = len(df2)
df2 = df2.drop(df2[~mask].index)
elapsed = time() - t
print("Values of df2 not present in the Primary Key of df1 were dropped!"
      "\n{0:,} rows out of {1:,} were removed. {2:,} rows remaining."
      .format(old_len - len(df2),
              old_len,
              len(df2)),
      "\nTook {0:,.2f} seconds.".format(elapsed))

Values of df2 not present in the Primary Key of df1 were dropped!
25,284 rows out of 34,780,589 were removed. 34,755,305 rows remaining. 
Took 67.05 seconds.


### $\texttt{name.basics}$ (PK: $\texttt{nconst}$) and $\texttt{title.principals}$ (FK: $\texttt{nconst}$)

In [None]:
t = time()
file_name = 'movies_public_name.basics.csv'
df1 = pd.read_csv(ri_dir + file_name, delimiter=',',
                 na_values='\\N')
elapsed = time() - t
print("----- DataFrame loaded"
      "\nin {0:.2f} seconds".format(elapsed) + 
      "\nwith {0:,} rows\nand {1:,} columns"
      .format(df1.shape[0], df1.shape[1]) + 
      "\n-- Column names:\n", df1.columns)

#### Drop records that violate referential integrity

In [None]:
t = time()
pk = 'nconst'
fk = 'nconst'
mask = df2[fk].isin(df1[pk])
old_len = len(df2)
df2 = df2.drop(df2[~mask].index)
elapsed = time() - t
print("Values of df2 not present in the Primary Key of df1 were dropped!"
      "\n{0:,} rows out of {1:,} were removed. {2:,} rows remaining."
      .format(old_len - len(df2),
              old_len,
              len(df2)),
      "\nTook {0:,.2f} seconds.".format(elapsed))

#### Save results to file

In [7]:
save_path = data_dir + 'ref_integrity/' + file_name[:-4] + '_ri.csv'
t = time()
df2.to_csv(save_path, index=False)
elapsed = time() - t
print("DataFrame saved to file:\n", save_path,
      "\ntook {0:.2f} seconds".format(elapsed))

DataFrame saved to file:
 ../../data/imdb/ref_integrity/title.principals_ri.csv 
took 92.26 seconds
