In [None]:
# ETL Project

In [1]:
import os
from datetime import datetime

import matplotlib as plt
import numpy as np
import pandas as pd
import psycopg2
import sqlalchemy
from sqlalchemy import create_engine, inspect

In [3]:
# Database connection.
# Settings are read from the standard libpq environment variables when they are
# set, so credentials do not have to be committed with the notebook. The
# fallbacks are the local development values this notebook was written with.
pg_host = os.environ.get('PGHOST', 'localhost')
pg_port = os.environ.get('PGPORT', '5432')
pg_username = os.environ.get('PGUSER', 'postgres')
pg_pw = os.environ.get('PGPASSWORD', 'postgres')
pg_db = os.environ.get('PGDATABASE', 'movies')
postgresDSN = 'postgresql://{}:{}@{}:{}/{}'.format(pg_username, pg_pw, pg_host, pg_port, pg_db)
engine = create_engine(postgresDSN)
# engine.connect() raises OperationalError when the target database does not
# exist yet, so the database itself must be created before running this cell.
cnx = engine.connect()

OperationalError: (psycopg2.OperationalError) FATAL:  database "movies" does not exist

(Background on this error at: http://sqlalche.me/e/13/e3q8)

In [None]:
# Create the database schema.
# The DDL is a plain string (no f-string: nothing is interpolated, and an
# f-string would break as soon as the SQL contained a brace).

query = """ 

-- Actors Table
Drop Table IF EXISTS Movie_Actor CASCADE;  
Drop Table IF EXISTS Ratings CASCADE;  
Drop Table IF EXISTS Movies CASCADE;  
Drop Table IF EXISTS Actors CASCADE;  

CREATE TABLE  Actors  (
     actor_id  INT   NOT NULL,
     first_name  VARCHAR(255)   NOT NULL,
     last_name  VARCHAR(255)   NOT NULL,
     birth_name  VARCHAR(255),
     date_of_birth  text,
     place_of_birth  text,
     date_of_death  text,
     reason_of_death  text,
    CONSTRAINT  pk_Actors  PRIMARY KEY (
         actor_id 
     )
);

-- Movies Table
CREATE TABLE  Movies  (
     movie_id  INT   NOT NULL,
     title  text   NOT NULL,
     year  INT   NOT NULL,
     genre  VARCHAR(255)   NOT NULL,
     duration  INT,
     language  text,
     director  VARCHAR(255),
     actors  text,
     description  text,
     avg_vote  float   NOT NULL,
     budget  float,
     gross_income  float,
     critic_reviews  float,
    CONSTRAINT  pk_Movies  PRIMARY KEY (
         movie_id 
     )
);

-- Ratings Table
CREATE TABLE  Ratings  (
     movie_id  INT   NOT NULL,
     avg_vote_females  float,
     avg_vote_males  float,
     total_votes  INT   NOT NULL,
     weighted_average_vote  float   NOT NULL,
    CONSTRAINT  pk_Ratings  PRIMARY KEY (
         movie_id 
     )
);

-- movie_actors table
CREATE TABLE  Movie_Actor  (
     id  serial   NOT NULL,
     movie_id  INT   NOT NULL,
     actor_id  INT   NOT NULL,
     role  text,
     characters  text,
    CONSTRAINT  pk_Movie_Actor  PRIMARY KEY (
         id 
     )
);

ALTER TABLE  Ratings  ADD CONSTRAINT  fk_Ratings_movie_id  FOREIGN KEY( movie_id )
REFERENCES  Movies  ( movie_id );

ALTER TABLE  Movie_Actor  ADD CONSTRAINT  fk_Movie_Actor_movie_id  FOREIGN KEY( movie_id )
REFERENCES  Movies  ( movie_id );

ALTER TABLE  Movie_Actor  ADD CONSTRAINT  fk_Movie_Actor_actor_id  FOREIGN KEY( actor_id )
REFERENCES  Actors  ( actor_id );

"""

# Actually run the DDL: previously the string was only assigned, so the tables,
# primary keys and foreign keys above were never created.
# engine.begin() wraps the statements in a single transaction that is committed
# on success and rolled back on error.
# NOTE: this is destructive - the DROP statements remove any existing data in
# the four tables before they are recreated.
with engine.begin() as schema_cnx:
    schema_cnx.execute(sqlalchemy.text(query))

In [None]:
# Folder holding the source CSV files (a relative path). The trailing slash
# matters: the file names are appended to it by plain string concatenation.
data_source = 'Resources/'

In [None]:
# Paths of the four source CSV files, built by prefixing each file name with
# the data_source folder.
movies_csv = f'{data_source}IMDB movies.csv'
actors_csv = f'{data_source}IMDB names.csv'
ratings_csv = f'{data_source}IMDB ratings.csv'
title_principals = f'{data_source}IMDb title_principals.csv'

In [None]:
# Load the names CSV into a DataFrame. low_memory=False makes pandas parse the
# file in one pass instead of in chunks, which avoids mixed-type column inference.
actors = pd.read_csv(actors_csv, low_memory=False)

In [None]:
actors.head(3)

In [None]:
# Use the database column name for the IMDb name id.
actors = actors.rename(columns=dict(imdb_name_id='actor_id'))

In [None]:
# Columns kept from the names file (actor_id is the renamed imdb_name_id).
actors_cols = [
    'actor_id',
    'name',
    'birth_name',
    'date_of_birth',
    'place_of_birth',
    'date_of_death',
    'reason_of_death',
]

In [None]:
# Keep only the needed columns. The explicit .copy() gives an independent frame,
# so the column assignments in the cleaning cell do not operate on a slice of
# the raw frame (which triggers SettingWithCopyWarning).
actors = actors[actors_cols].copy()

In [None]:
actors.head(3)

In [None]:
# Cleaning currency column
def clean_currency(x):
    """Strip the leading currency marker from a money string and return a number.

    Handles strings made of a currency symbol or code, whitespace, and an
    amount, e.g. ``"$ 1000000"`` or ``"ITL 3,000,000"``.

    Parameters
    ----------
    x : object
        A single cell value. Non-string values (numbers, NaN, None) are
        passed through untouched.

    Returns
    -------
    float or object
        The amount as a ``float`` when ``x`` is a string whose second
        whitespace-separated token parses as a number (thousands separators
        are ignored). In every other case the original input is returned
        unchanged.
    """
    if not isinstance(x, str):
        return x
    tokens = x.split()
    if len(tokens) < 2:
        # No separate amount token (empty string or a single word).
        return x
    try:
        # Previously the amount was returned as a string, contradicting the
        # documented "numeric value" contract.
        return float(tokens[1].replace(',', ''))
    except ValueError:
        return x

In [None]:
# Clean date columns - remove surrounding text from a date value
def clean_date(x):
    """Pull the first all-digit token out of a value, skipping leading words.

    ``x`` is converted with ``str()`` and split on whitespace; the tokens are
    scanned from left to right:

    * an all-digit token is returned (as a string);
    * a purely alphabetic token is skipped;
    * any other token (for example ``'1899-05-10'``) stops the scan and the
      original ``x`` is returned unchanged.

    Returns ``None`` when there are no tokens or every token is alphabetic.
    """
    for token in str(x).split():
        if token.isdigit():
            return token
        if not token.isalpha():
            return x
    return None

In [None]:
# Check the actor id: remove the 'nm' prefix and return the numeric portion. Returns None if the id is not in 'nm000123' format
def clean_id(x):
    """Return the digits of an ``'nm<digits>'`` id, or ``None`` for any other shape.

    The part after the first two characters must be all digits and the first
    two characters must be exactly ``'nm'``.
    """
    numeric_part = x[2:]
    if not numeric_part.isdigit():
        return None
    if x[0:2] != 'nm':
        return None
    return numeric_part

In [None]:
# Clean the actors data and reduce the frame to the columns of the Actors table.
# assign() builds a new frame, so nothing is written into a slice of the raw data.
# actor_id: plain removal of 'nm'; clean_id() is the stricter alternative that
# returns None for ids that are not in 'nm<digits>' form.
actors = actors.assign(
    actor_id=actors['actor_id'].apply(lambda x: x.replace('nm', '')),
    first_name=actors['name'].apply(lambda x: x.split()[0]),
    # Everything after the first word, joined with single spaces. The earlier
    # list -> str -> strip round trip turned multi-word surnames into
    # "de, la, Cruz" and mangled names containing an apostrophe.
    last_name=actors['name'].apply(lambda x: ' '.join(x.split()[1:])),
    date_of_birth=actors['date_of_birth'].apply(clean_date),
    date_of_death=actors['date_of_death'].apply(clean_date),
)[['actor_id', 'first_name', 'last_name', 'birth_name', 'date_of_birth',
   'place_of_birth', 'date_of_death', 'reason_of_death']]

In [None]:
actors.head(3)

In [None]:
list(actors.columns)

### Movies Data 

In [None]:
# 31016 ; 85855
# NOTE(review): the two numbers above are unexplained - presumably row counts; confirm or remove.
# Load the movies CSV into a DataFrame. low_memory=False makes pandas parse the
# file in one pass instead of in chunks, which avoids mixed-type column inference.
movies = pd.read_csv(movies_csv, low_memory=False)

In [None]:
# Source column name -> database column name for the movies file.
rename_cols = dict(
    imdb_title_id='movie_id',
    worlwide_gross_income='gross_income',
    reviews_from_critics='critic_reviews',
)

In [None]:
# Apply the movies column mapping; rebinding keeps the cell free of in-place mutation.
movies = movies.rename(columns=rename_cols)

In [None]:
# Columns kept from the movies file, in the order of the Movies table.
movie_cols = [
    'movie_id',
    'title',
    'year',
    'genre',
    'duration',
    'language',
    'director',
    'actors',
    'description',
    'avg_vote',
    'budget',
    'gross_income',
    'critic_reviews',
]

In [None]:
# Keep only the needed columns. The explicit .copy() gives an independent frame,
# so the column assignments in the cleaning cell do not operate on a slice of
# the raw frame (which triggers SettingWithCopyWarning).
movies = movies[movie_cols].copy()

In [None]:
movies.head(3)

In [None]:
# Clean the movies columns in one step:
#   movie_id      - remove 'tt'
#   gross_income  - strip the currency marker via clean_currency
#   budget        - strip the currency marker via clean_currency
#   year          - extract the year via clean_date
movies = movies.assign(
    movie_id=movies['movie_id'].apply(lambda title_id: title_id.replace('tt', '')),
    gross_income=movies['gross_income'].apply(clean_currency),
    budget=movies['budget'].apply(clean_currency),
    year=movies['year'].apply(clean_date),
)

In [None]:
movies.head(3)

### Testing pages

In [None]:
# '0002101'
movies.loc[movies['movie_id'] == '8206668']

In [None]:
movies.loc[movies['movie_id'] == '8206668']

In [None]:
# Drop the row with bad data by its index label.
# errors='ignore' keeps the cell re-runnable: once the row is gone, running the
# cell again is a no-op instead of raising KeyError.
# NOTE(review): 83917 is tied to the current CSV row order - presumably the
# movie_id '8206668' row inspected in the cells above; confirm.
movies = movies.drop(index=83917, errors='ignore')

In [None]:
# NOTE(review): `test` is not assigned anywhere in the visible notebook, so this
# cell raises NameError on a fresh kernel - confirm whether it can be removed.
test

In [None]:
# nullable columns: language, director, 
# not nullable but nulls found: actors:69 rows - changed it to nullable for now; 
movies.loc[movies['budget'].isnull()]

In [None]:
#movies['budget'].isnull()

In [None]:
#movies.loc[movies['critic_reviews'].isna()]

In [None]:
movies.loc[:,'budget'].astype(str).apply(lambda x: x.split())

In [None]:
def dob(x):
    """Pull the first all-digit token out of a string, skipping leading words.

    Non-string input yields ``None``. For a string, whitespace-separated
    tokens are scanned from left to right: an all-digit token is returned, a
    purely alphabetic token is skipped, and any other token stops the scan
    and returns ``x`` unchanged. ``None`` is returned when no token decides.
    """
    if not isinstance(x, str):
        return None
    for token in x.split():
        if token.isdigit():
            return token
        if not token.isalpha():
            return x
    return None
    

In [None]:
def check_id(x):
    """Return the digits of an ``'nm<digits>'`` id; otherwise print 'fail'.

    The part after the first two characters must be all digits and the first
    two characters must be exactly ``'nm'``. When the check fails, ``'fail'``
    is printed and ``None`` is returned.
    """
    numeric_part = x[2:]
    if numeric_part.isdigit() and x[0:2] == 'nm':
        return numeric_part
    print('fail')
    return None
       

In [None]:
check_id('hello')

In [None]:
y = 'the year 2.019'

In [None]:
y.split()

In [None]:
for i in y.split():
    #print(i.isdigit())
    #print(isinstance(i, float))
    #print(i.isalpha())
    #print(i.isnumeric())
    print(type(i))

In [None]:
actors.dtypes

In [None]:
t = '20-19-2020'

In [None]:
d = '9-09-2010'

mydate = datetime.strptime(d,'%d-%m-%Y').date()
mydate

In [None]:
a = '20-19-2020'
print(a.isnumeric())
print(a.isdigit())
print(a.isalpha())
print(a.isalnum())

In [None]:
# NOTE(review): `p` is only assigned in a later cell (p = 'The year 2019'), so
# this cell raises NameError when the notebook is run top to bottom on a fresh kernel.
p

In [None]:
# 000786
actors.loc[actors['actor_id'] == 'enoir character']

In [None]:
# Drop the bad actors row by its index label.
# errors='ignore' keeps the cell re-runnable: once the row is gone, running the
# cell again is a no-op instead of raising KeyError.
# NOTE(review): 19222 is tied to the current CSV row order - confirm it still
# points at the intended row whenever the source file changes.
actors = actors.drop(index=19222, errors='ignore')

In [None]:
p = 'The year 2019'

In [None]:
for i in p.split():
    if i.isdigit():
        print(i)
    else:
        print('fail')

In [None]:
#test2 = movies['movie_id'].head(10)
test2 = movies[['movie_id']].head(10).copy()

In [None]:
# Look up the row for movie_id '0002423'. The original line ended in a dangling
# "=" with no right-hand side, which is a SyntaxError.
# NOTE(review): the intended replacement value is unknown - add the assignment
# back once it is decided.
test2.loc[test2['movie_id'] == '0002423']

In [None]:
a = "year 2016"
b = a.split()

In [None]:
for i in b:
    print(i.isdigit())

### End Tests

In [None]:
list(movies.columns)

### Ratings data

In [None]:
# Load the ratings CSV into a DataFrame (pandas default parsing options).
ratings = pd.read_csv(ratings_csv)

In [None]:
# Source column name -> database column name for the ratings file.
rename_cols = dict(
    imdb_title_id='movie_id',
    females_allages_avg_vote='avg_vote_females',
    males_allages_avg_vote='avg_vote_males',
)

In [None]:
# Apply the ratings column mapping; rebinding keeps the cell free of in-place mutation.
ratings = ratings.rename(columns=rename_cols)

In [None]:
# Columns kept from the ratings file, in the order of the Ratings table.
ratings_cols = [
    'movie_id',
    'avg_vote_females',
    'avg_vote_males',
    'total_votes',
    'weighted_average_vote',
]

In [None]:
# Keep only the needed columns. The explicit .copy() gives an independent frame,
# so the movie_id assignment in the next cell does not operate on a slice of
# the raw frame (which triggers SettingWithCopyWarning).
ratings = ratings[ratings_cols].copy()

In [None]:
# Remove 'tt' from the title ids so they match the cleaned movies.movie_id values.
ratings = ratings.assign(
    movie_id=ratings['movie_id'].apply(lambda title_id: title_id.replace('tt', ''))
)

In [None]:
ratings.head(3)

In [None]:
list(ratings.columns)

### movie_actors table

In [None]:
# Load the title-principals CSV into a DataFrame (pandas default parsing options).
movie_actor = pd.read_csv(title_principals)

In [None]:
movie_actor.head(3)

In [None]:
# Source column name -> database column name for the title-principals file.
rename_cols = dict(
    imdb_title_id='movie_id',
    imdb_name_id='actor_id',
    category='role',
)

In [None]:
# Apply the title-principals column mapping; rebinding keeps the cell free of in-place mutation.
movie_actor = movie_actor.rename(columns=rename_cols)

In [None]:
# Remove the 'tt' / 'nm' markers so the ids match the cleaned movies and actors ids.
movie_actor = movie_actor.assign(
    movie_id=movie_actor['movie_id'].apply(lambda title_id: title_id.replace('tt', '')),
    actor_id=movie_actor['actor_id'].apply(lambda name_id: name_id.replace('nm', '')),
)

In [None]:
movie_actor.head(3)

In [None]:
# Keep only the columns that exist in the Movie_Actor table (its id is a serial
# column filled in by the database).
movie_actor_cols = ['movie_id', 'actor_id', 'role', 'characters']
movie_actor = movie_actor[movie_actor_cols]

In [None]:
list(movie_actor.columns)

### Load SQL Tables

In [None]:
# Write the cleaned frames to the database tables.
# Movies goes first because Ratings and Movie_Actor reference it.
movies.to_sql(
    'movies',
    con=cnx,
    schema='public',
    if_exists='append',
    index=False,
)

In [None]:
# Append the cleaned actors rows to public.actors (the index is not written).
actors.to_sql(
    'actors',
    con=cnx,
    schema='public',
    if_exists='append',
    index=False,
)

In [None]:
# Ratings.movie_id has a foreign key to Movies, and a row was dropped from
# `movies` during cleaning while `ratings` was never filtered. Load only the
# ratings whose movie is still present, so the insert cannot violate the
# foreign key.
ratings_to_load = ratings[ratings['movie_id'].isin(movies['movie_id'])]
ratings_to_load.to_sql('ratings', schema='public', con=cnx, if_exists='append', index=False)

In [None]:
# Movie_Actor has foreign keys to both Movies and Actors, and rows were dropped
# from both frames during cleaning. Load only the links whose movie and actor
# are still present, so the insert cannot violate either foreign key.
has_movie = movie_actor['movie_id'].isin(movies['movie_id'])
has_actor = movie_actor['actor_id'].isin(actors['actor_id'])
movie_actor_to_load = movie_actor[has_movie & has_actor]
movie_actor_to_load.to_sql('movie_actor', schema='public', con=cnx, if_exists='append', index=False)