# Rachel Ney-Grimm Midterm Project - Sakila Datamart

First, I imported the libraries and set up the connections to the MySQL server and MongoDB, the two sources from which the data is retrieved. Then I defined the functions for reading and writing database data, and created the sakila_2 database, which is the destination of the ETL pipeline.

In [120]:
#libraries
import os
import numpy
import pandas as pd
from sqlalchemy import create_engine
import json
import datetime
import pymongo

#connection setup (MySQL)
host_name = "localhost"
host_ip = "127.0.0.1"
port = "3306"

# Credentials may be overridden through environment variables so that real
# passwords never have to be saved in the notebook; the literals are only
# the defaults for the local sandbox.
user_id = os.environ.get("SAKILA_MYSQL_USER", "root")
pwd = os.environ.get("SAKILA_MYSQL_PWD", "Passw0rd123")

#aliases for the same MySQL account, used by some of the later cells
mysql_uid = user_id #root when not in cloud
mysql_pwd = pwd

#setup for mongodb (Atlas) connection
atlas_cluster_name = "sandbox.zibbf"
atlas_user_name = os.environ.get("SAKILA_ATLAS_USER", "m001-student")
atlas_password = os.environ.get("SAKILA_ATLAS_PWD", "m001-mongodb-basics")

#database source and destination
src_dbname = "sakila"
dst_dbname = "sakila_2"

#database function definitions
def get_mongo_dataframe(connect_str, db_name, collection, query):
    """Run a MongoDB ``find`` and return the matching documents as a DataFrame.

    Args:
        connect_str: MongoDB connection URI handed to ``pymongo.MongoClient``.
        db_name: Name of the database to read from.
        collection: Name of the collection to query.
        query: Filter document passed to ``find`` (``{}`` matches everything).

    Returns:
        A pandas DataFrame with one row per matching document and the
        ``_id`` column removed. When nothing matches, an empty DataFrame
        is returned.
    """
    client = pymongo.MongoClient(connect_str)
    try:
        # Materialise the cursor while the client is still open.
        documents = list(client[db_name][collection].find(query))
    finally:
        # Release the client even when the query raises.
        client.close()

    dframe = pd.DataFrame(documents)
    # errors='ignore': an empty result has no '_id' column, which would
    # otherwise raise KeyError.
    return dframe.drop(columns=['_id'], errors='ignore')

def get_sql_dataframe(user_id, pwd, host_name, db_name, sql_query):
    """Execute a SQL query against MySQL and return the result as a DataFrame.

    Args:
        user_id: MySQL user name.
        pwd: Password for ``user_id``.
        host_name: Host of the MySQL server.
        db_name: Default schema for the connection.
        sql_query: SQL text passed to ``pandas.read_sql``.

    Returns:
        A pandas DataFrame holding the query result.
    """
    conn_str = f"mysql+pymysql://{user_id}:{pwd}@{host_name}/{db_name}"
    sqlEngine = create_engine(conn_str, pool_recycle=3600)
    try:
        # The context manager closes the connection even if the query fails.
        with sqlEngine.connect() as connection:
            return pd.read_sql(sql_query, connection)
    finally:
        # A new engine is built on every call, so drop its pool when done.
        sqlEngine.dispose()

def set_dataframe(user_id, pwd, host_name, db_name, df, table_name, pk_column, db_operation):
    """Write a DataFrame to a MySQL table.

    Args:
        user_id: MySQL user name.
        pwd: Password for ``user_id``.
        host_name: Host of the MySQL server.
        db_name: Schema that holds (or will hold) the table.
        df: DataFrame to write; its index is not written.
        table_name: Name of the target table.
        pk_column: Column to declare as primary key (used by "insert" only).
        db_operation: ``"insert"`` replaces the table with ``df`` and then
            adds a primary key on ``pk_column``; ``"update"`` appends the
            rows of ``df`` to the table.

    Raises:
        ValueError: If ``db_operation`` is neither "insert" nor "update".
    """
    # Fail loudly on a mistyped operation instead of silently writing nothing.
    if db_operation not in ("insert", "update"):
        raise ValueError(
            f"db_operation must be 'insert' or 'update', got {db_operation!r}"
        )

    # Imported here because the file-level import only brings in create_engine.
    from sqlalchemy import text

    conn_str = f"mysql+pymysql://{user_id}:{pwd}@{host_name}/{db_name}"
    sqlEngine = create_engine(conn_str, pool_recycle=3600)
    try:
        # One transactional connection for both the load and the ALTER;
        # Engine.execute() no longer exists in SQLAlchemy 2.0.
        with sqlEngine.begin() as connection:
            if db_operation == "insert":
                df.to_sql(table_name, con=connection, index=False, if_exists='replace')
                connection.execute(
                    text(f"ALTER TABLE {table_name} ADD PRIMARY KEY ({pk_column});")
                )
            else:
                df.to_sql(table_name, con=connection, index=False, if_exists='append')
    finally:
        # A new engine is built on every call, so drop its pool when done.
        sqlEngine.dispose()
    
#creating new database
from sqlalchemy import text  # needed for the explicit-connection execute below

conn_str = f"mysql+pymysql://{user_id}:{pwd}@{host_name}"
sqlEngine = create_engine(conn_str, pool_recycle=3600)

# Engine.execute() was removed in SQLAlchemy 2.0, so run the DDL on an
# explicit connection. Re-running this cell wipes everything in the
# destination schema, including tables created outside this notebook.
with sqlEngine.begin() as connection:
    connection.execute(text(f"DROP DATABASE IF EXISTS `{dst_dbname}`;"))
    connection.execute(text(f"CREATE DATABASE `{dst_dbname}`;"))
# No "USE" statement: it would only affect this one pooled connection, and
# the later cells connect with the schema in the URL and qualify table names.

#finish mongodb connection setup
#one URI per deployment: a local mongod and the Atlas cluster
conn_str = dict(
    local="mongodb://localhost:27017/",
    atlas=f"mongodb+srv://{atlas_user_name}:{atlas_password}@{atlas_cluster_name}.mongodb.net",
)

# NOTE(review): the Atlas line below echoes the password into the cell output.
print("Local Connection String: " + conn_str["local"])
print("Atlas Connection String: " + conn_str["atlas"])

Local Connection String: mongodb://localhost:27017/
Atlas Connection String: mongodb+srv://m001-student:m001-mongodb-basics@sandbox.zibbf.mongodb.net


## Customer Dimension Table
I created a 'customer' dimension table from data originating in the MySQL sakila database. I transformed this data by renaming the column that becomes the table's key and dropping the columns that aren't needed in the new schema.

In [194]:
#extract: read the whole source customer table into a dataframe
sql_customers = "SELECT * FROM sakila.customer;"
df_customers = get_sql_dataframe(
    user_id=user_id,
    pwd=pwd,
    host_name=host_name,
    db_name=src_dbname,
    sql_query=sql_customers,
)
df_customers.head(2)

Unnamed: 0,customer_id,store_id,first_name,last_name,email,address_id,active,create_date,last_update
0,1,1,MARY,SMITH,MARY.SMITH@sakilacustomer.org,5,1,2006-02-14 22:04:36,2006-02-15 04:57:20
1,2,1,PATRICIA,JOHNSON,PATRICIA.JOHNSON@sakilacustomer.org,6,1,2006-02-14 22:04:36,2006-02-15 04:57:20


In [195]:
#transform: the id becomes the warehouse key, and store_id is not part of this dimension
df_customers = (
    df_customers
    .rename(columns={"customer_id": "customer_key"})
    .drop(columns=['store_id'])
)
df_customers.head(2)

Unnamed: 0,customer_key,first_name,last_name,email,address_id,active,create_date,last_update
0,1,MARY,SMITH,MARY.SMITH@sakilacustomer.org,5,1,2006-02-14 22:04:36,2006-02-15 04:57:20
1,2,PATRICIA,JOHNSON,PATRICIA.JOHNSON@sakilacustomer.org,6,1,2006-02-14 22:04:36,2006-02-15 04:57:20


In [196]:
#for adding address to the customer dimension table
#bring address, city, and country tables in as dataframes
#select only the columns that are needed: this skips the empty 'address2',
#the 'last_update' timestamp, and 'location', a binary large object that is
#not conducive to analysis and is wasteful to transfer just to drop it
sql_address = (
    "SELECT address_id, address, district, city_id, postal_code, phone "
    "FROM sakila.address;"
)
df_address = get_sql_dataframe(user_id, pwd, host_name, src_dbname, sql_address)

sql_city = "SELECT city_id , city , country_id FROM sakila.city;"
df_city = get_sql_dataframe(user_id, pwd, host_name, src_dbname, sql_city)

sql_country = "SELECT country_id, country FROM sakila.country;"
df_country = get_sql_dataframe(user_id, pwd, host_name, src_dbname, sql_country)

In [197]:
#denormalise the location hierarchy one level at a time:
#country -> city -> address -> customer (left joins keep every row of the left table)
df_city = df_city.merge(df_country, on='country_id', how='left')
df_address = df_address.merge(df_city, on='city_id', how='left')
df_customers = df_customers.merge(df_address, on='address_id', how='left')

#the address details now live on the customer row, so the join keys are no longer needed
df_customers = df_customers.drop(columns=['address_id', 'city_id', 'country_id'])
df_customers.head(2)

Unnamed: 0,customer_key,first_name,last_name,email,active,create_date,last_update,address,district,postal_code,phone,city,country
0,1,MARY,SMITH,MARY.SMITH@sakilacustomer.org,1,2006-02-14 22:04:36,2006-02-15 04:57:20,1913 Hanoi Way,Nagasaki,35200,28303384290,Sasebo,Japan
1,2,PATRICIA,JOHNSON,PATRICIA.JOHNSON@sakilacustomer.org,1,2006-02-14 22:04:36,2006-02-15 04:57:20,1121 Loja Avenue,California,17886,838635286649,San Bernardino,United States


In [198]:
#load: write each dataframe to its warehouse table
#"insert" replaces the table and adds the primary key
db_operation = "insert"
tables = [('dim_customers', df_customers, 'customer_key')]
for table_name, dataframe, primary_key in tables:
    set_dataframe(
        user_id=user_id,
        pwd=pwd,
        host_name=host_name,
        db_name=dst_dbname,
        df=dataframe,
        table_name=table_name,
        pk_column=primary_key,
        db_operation=db_operation,
    )

#read the table back to confirm the load
sql_dim_customers = "SELECT * FROM sakila_2.dim_customers;"
df_dim_customers = get_sql_dataframe(
    user_id=mysql_uid,
    pwd=mysql_pwd,
    host_name=host_name,
    db_name=dst_dbname,
    sql_query=sql_dim_customers,
)
df_dim_customers.head(2)

Unnamed: 0,customer_key,first_name,last_name,email,active,create_date,last_update,address,district,postal_code,phone,city,country
0,1,MARY,SMITH,MARY.SMITH@sakilacustomer.org,1,2006-02-14 22:04:36,2006-02-15 04:57:20,1913 Hanoi Way,Nagasaki,35200,28303384290,Sasebo,Japan
1,2,PATRICIA,JOHNSON,PATRICIA.JOHNSON@sakilacustomer.org,1,2006-02-14 22:04:36,2006-02-15 04:57:20,1121 Loja Avenue,California,17886,838635286649,San Bernardino,United States


### Date Dimension Table
A date dimension table for the sakila data warehouse was created using the script from lab2c, run in MySQL workbench. The result can be viewed below.

In [46]:
#note: dim_date is created by a script run in MySQL, not by this notebook.
#If the earlier cell that drops and re-creates sakila_2 is run after that
#script, dim_date no longer exists and this select fails until the script
#is run again.

sql_dim_date = "SELECT * FROM sakila_2.dim_date;"
#connect to the destination schema, which is where dim_date lives
df_dim_date = get_sql_dataframe(user_id, pwd, host_name, dst_dbname, sql_dim_date)
df_dim_date.head()

Unnamed: 0,date_key,full_date,date_name,date_name_us,date_name_eu,day_of_week,day_name_of_week,day_of_month,day_of_year,weekday_weekend,...,is_last_day_of_month,calendar_quarter,calendar_year,calendar_year_month,calendar_year_qtr,fiscal_month_of_year,fiscal_quarter,fiscal_year,fiscal_year_month,fiscal_year_qtr
0,20000101,2000-01-01,2000/01/01,01/01/2000,01/01/2000,7,Saturday,1,1,Weekend,...,N,1,2000,2000-01,2000Q1,7,3,2000,2000-07,2000Q3
1,20000102,2000-01-02,2000/01/02,01/02/2000,02/01/2000,1,Sunday,2,2,Weekend,...,N,1,2000,2000-01,2000Q1,7,3,2000,2000-07,2000Q3
2,20000103,2000-01-03,2000/01/03,01/03/2000,03/01/2000,2,Monday,3,3,Weekday,...,N,1,2000,2000-01,2000Q1,7,3,2000,2000-07,2000Q3
3,20000104,2000-01-04,2000/01/04,01/04/2000,04/01/2000,3,Tuesday,4,4,Weekday,...,N,1,2000,2000-01,2000Q1,7,3,2000,2000-07,2000Q3
4,20000105,2000-01-05,2000/01/05,01/05/2000,05/01/2000,4,Wednesday,5,5,Weekday,...,N,1,2000,2000-01,2000Q1,7,3,2000,2000-07,2000Q3


### Film Dimension Table
Data on the film's language is retrieved from the file system. The film data itself is retrieved from MongoDB. Simple transformations (dropping unnecessary or duplicate columns, renaming the key) are made. The film dataframe is then merged with the language dataframe on the shared language_id column.

In [161]:
#language data comes from a csv in the working directory
#it is merged into the film dimension table further down
data_file = os.path.join(os.getcwd(), 'sakila_language_data.csv')
#the first csv column becomes the index; last_update is not needed
df_language = pd.read_csv(data_file, header=0, index_col=0).drop(columns=['last_update'])
df_language.head()

Unnamed: 0_level_0,name
language_id,Unnamed: 1_level_1
1,English
2,Italian
3,Japanese
4,Mandarin
5,French


In [162]:
#film data comes from the local mongodb instance
#an empty filter selects every document in the collection
query = {}
collection = "film"
df_film = get_mongo_dataframe(
    connect_str=conn_str["local"],
    db_name=src_dbname,
    collection=collection,
    query=query,
)
df_film.head(2)

Unnamed: 0,film_id,title,description,release_year,language_id,original_language_id,rental_duration,rental_rate,length,replacement_cost,rating,special_features,last_update
0,1,ACADEMY DINOSAUR,A Epic Drama of a Feminist And a Mad Scientist...,2006,1,,6,0.99,86,20.99,PG,"Deleted Scenes,Behind the Scenes",2006-02-15 05:03:42
1,2,ACE GOLDFINGER,A Astounding Epistle of a Database Administrat...,2006,1,,3,4.99,48,12.99,G,"Trailers,Deleted Scenes",2006-02-15 05:03:42


In [164]:
#initial transformations on the film data:
#drop the column that is not of interest for analysis and rename the id to a warehouse key
df_film = (
    df_film
    .drop(columns=['original_language_id'])
    .rename(columns={"film_id": "film_key"})
)
df_film.head(2) #the id column is now film_key

Unnamed: 0,film_key,title,description,release_year,language_id,rental_duration,rental_rate,length,replacement_cost,rating,special_features,last_update
0,1,ACADEMY DINOSAUR,A Epic Drama of a Feminist And a Mad Scientist...,2006,1,6,0.99,86,20.99,PG,"Deleted Scenes,Behind the Scenes",2006-02-15 05:03:42
1,2,ACE GOLDFINGER,A Astounding Epistle of a Database Administrat...,2006,1,3,4.99,48,12.99,G,"Trailers,Deleted Scenes",2006-02-15 05:03:42


In [165]:
#left-join the language name onto each film via language_id,
#give that column a more intuitive name, then drop the join key
df_film = (
    df_film
    .merge(df_language, on='language_id', how='left')
    .rename(columns={"name": "film_language"})
    .drop(columns=['language_id'])
)
df_film.head(2)


Unnamed: 0,film_key,title,description,release_year,rental_duration,rental_rate,length,replacement_cost,rating,special_features,last_update,film_language
0,1,ACADEMY DINOSAUR,A Epic Drama of a Feminist And a Mad Scientist...,2006,6,0.99,86,20.99,PG,"Deleted Scenes,Behind the Scenes",2006-02-15 05:03:42,English
1,2,ACE GOLDFINGER,A Astounding Epistle of a Database Administrat...,2006,3,4.99,48,12.99,G,"Trailers,Deleted Scenes",2006-02-15 05:03:42,English


In [166]:
#load the film dataframe into the dim_film table of the data warehouse
dataframe = df_film
table_name = 'dim_film'
primary_key = 'film_key'
db_operation = "insert"
set_dataframe(
    user_id=mysql_uid,
    pwd=mysql_pwd,
    host_name=host_name,
    db_name=dst_dbname,
    df=dataframe,
    table_name=table_name,
    pk_column=primary_key,
    db_operation=db_operation,
)

#read the table back to validate that it was created and loaded
sql_film = "SELECT * FROM sakila_2.dim_film;"
df_dim_film = get_sql_dataframe(
    user_id=mysql_uid,
    pwd=mysql_pwd,
    host_name=host_name,
    db_name=dst_dbname,
    sql_query=sql_film,
)
df_dim_film.head(2)

Unnamed: 0,film_key,title,description,release_year,rental_duration,rental_rate,length,replacement_cost,rating,special_features,last_update,film_language
0,1,ACADEMY DINOSAUR,A Epic Drama of a Feminist And a Mad Scientist...,2006,6,0.99,86,20.99,PG,"Deleted Scenes,Behind the Scenes",2006-02-15 05:03:42,English
1,2,ACE GOLDFINGER,A Astounding Epistle of a Database Administrat...,2006,3,4.99,48,12.99,G,"Trailers,Deleted Scenes",2006-02-15 05:03:42,English


## Fact Table
The fact table is based primarily on the rental process modeled by the sakila database. It also includes data from the payments for those rentals, which adds quantitative information about each transaction. To create this fact table, sakila's rental and payment tables were joined using the pandas merge method.

In [184]:
#extract: read the whole source rental table
sql_rental = "SELECT * FROM sakila.rental;"
df_rental = get_sql_dataframe(
    user_id=user_id,
    pwd=pwd,
    host_name=host_name,
    db_name=src_dbname,
    sql_query=sql_rental,
)
df_rental.head(2)

Unnamed: 0,rental_id,rental_date,inventory_id,customer_id,return_date,staff_id,last_update
0,1,2005-05-24 22:53:30,367,130,2005-05-26 22:04:30,1,2006-02-15 21:30:53
1,2,2005-05-24 22:54:33,1525,459,2005-05-28 19:40:33,1,2006-02-15 21:30:53


In [185]:
#extract: only the payment columns that are attached to the rental fact
sql_payment = "SELECT amount, rental_id, payment_date FROM sakila.payment;"
df_payment = get_sql_dataframe(
    user_id=user_id,
    pwd=pwd,
    host_name=host_name,
    db_name=src_dbname,
    sql_query=sql_payment,
)
df_payment.head(2)

Unnamed: 0,amount,rental_id,payment_date
0,2.99,76,2005-05-25 11:30:37
1,0.99,573,2005-05-28 10:35:23


In [186]:
#attach amount and payment_date to each rental; the left join keeps every rental row
df_rental = df_rental.merge(df_payment, on='rental_id', how='left')
df_rental.head()
#still to do in the next cell:
#  - reorder the columns
#  - drop the columns that are not wanted
#  - rename the keys

Unnamed: 0,rental_id,rental_date,inventory_id,customer_id,return_date,staff_id,last_update,amount,payment_date
0,1,2005-05-24 22:53:30,367,130,2005-05-26 22:04:30,1,2006-02-15 21:30:53,2.99,2005-05-24 22:53:30
1,2,2005-05-24 22:54:33,1525,459,2005-05-28 19:40:33,1,2006-02-15 21:30:53,2.99,2005-05-24 22:54:33
2,3,2005-05-24 23:03:39,1711,408,2005-06-01 22:12:39,1,2006-02-15 21:30:53,3.99,2005-05-24 23:03:39
3,4,2005-05-24 23:04:41,2452,333,2005-06-03 01:43:41,2,2006-02-15 21:30:53,4.99,2005-05-24 23:04:41
4,5,2005-05-24 23:05:21,2079,222,2005-06-02 04:33:21,1,2006-02-15 21:30:53,6.99,2005-05-24 23:05:21


In [188]:
#rename the source ids to warehouse keys
column_name_map = {"rental_id": "rental_key", "customer_id": "customer_key"}
#selecting the final column order also leaves out inventory_id and staff_id
# NOTE(review): with inventory_id gone, the fact table carries no key that links it to dim_film.
df_rental = df_rental.rename(columns=column_name_map)[
    ['rental_key', 'customer_key', 'amount', 'rental_date', 'return_date', 'payment_date', 'last_update']
]

#surrogate primary key: 1..n in row order, like an auto-increment column
df_rental.insert(0, "fact_rental_key", range(1, len(df_rental) + 1))
df_rental.head(2)

Unnamed: 0,fact_rental_key,rental_key,customer_key,amount,rental_date,return_date,payment_date,last_update
0,1,1,130,2.99,2005-05-24 22:53:30,2005-05-26 22:04:30,2005-05-24 22:53:30,2006-02-15 21:30:53
1,2,2,459,2.99,2005-05-24 22:54:33,2005-05-28 19:40:33,2005-05-24 22:54:33,2006-02-15 21:30:53


In [189]:
#load the fact dataframe into the fact_rental table of the data warehouse
dataframe = df_rental
table_name = 'fact_rental'
primary_key = 'fact_rental_key'
db_operation = "insert"

set_dataframe(
    user_id=mysql_uid,
    pwd=mysql_pwd,
    host_name=host_name,
    db_name=dst_dbname,
    df=dataframe,
    table_name=table_name,
    pk_column=primary_key,
    db_operation=db_operation,
)


In [191]:
#read the fact table back to validate the load
sql_rental = "SELECT * FROM sakila_2.fact_rental;"
df_fact_rental = get_sql_dataframe(
    user_id=mysql_uid,
    pwd=mysql_pwd,
    host_name=host_name,
    db_name=dst_dbname,
    sql_query=sql_rental,
)
df_fact_rental.head(2)

Unnamed: 0,fact_rental_key,rental_key,customer_key,amount,rental_date,return_date,payment_date,last_update
0,1,1,130,2.99,2005-05-24 22:53:30,2005-05-26 22:04:30,2005-05-24 22:53:30,2006-02-15 21:30:53
1,2,2,459,2.99,2005-05-24 22:54:33,2005-05-28 19:40:33,2005-05-24 22:54:33,2006-02-15 21:30:53
