### Data Cleaning and Transformation

In [27]:
# Importing Libraries
import sqlite3

# create connection  and cursor object
connection = sqlite3.connect("pizza_runner_database")
cursor = connection.cursor()

# see the tables in database
query ="select name from sqlite_master  where type='table';"
cursor.execute(query)
result = cursor.fetchall()
print('Tables: {}'.format(result))

Tables: [('runners',), ('customer_orders',), ('runner_orders',), ('pizza_names',), ('pizza_recipes',), ('pizza_toppings',)]


The simple ER diagram of database is:
<br>
<img src="images/erdiagram.png" alt="ER Diagram" width="500">

#### for customer_orders table


First let's visualize the data of customer_orders table and match it as per requirement of ER diagram mentioned above

In [28]:
query= """
select * from customer_orders;
"""
cursor.execute(query)
result = cursor.fetchall()
for record in result:
    print(record)

(1, 101, 1, '', '', '2020-01-01 18:05:02')
(2, 101, 1, '', '', '2020-01-01 19:00:52')
(3, 102, 1, '', '', '2020-01-02 23:51:23')
(3, 102, 2, '', None, '2020-01-02 23:51:23')
(4, 103, 1, '4', '', '2020-01-04 13:23:46')
(4, 103, 1, '4', '', '2020-01-04 13:23:46')
(4, 103, 2, '4', '', '2020-01-04 13:23:46')
(5, 104, 1, 'null', '1', '2020-01-08 21:00:29')
(6, 101, 2, 'null', 'null', '2020-01-08 21:03:13')
(7, 105, 2, 'null', '1', '2020-01-08 21:20:29')
(8, 102, 1, 'null', 'null', '2020-01-09 23:54:33')
(9, 103, 1, '4', '1, 5', '2020-01-10 11:22:59')
(10, 104, 1, 'null', 'null', '2020-01-11 18:34:49')
(10, 104, 1, '2, 6', '1, 4', '2020-01-11 18:34:49')


In exclusions and extras columns, we see that not every cell has a value. None value represent actual null value whereas there is 'null' string used also. Let's make them of one type so they dont cause any problem. Lets create a view fro updated data. We can create another table if required also.

In [29]:
#for customer_orders table
cursor.execute("DROP View IF EXISTS c_view;")
query= """
create view c_view as 

select order_id, customer_id, pizza_id,
case
    when exclusions = 'null' or TRIM(exclusions) ="" then NULL
    else exclusions
end as exclusions,
case
    when extras = 'null' or TRIM(extras) ="" then NULL
    else extras
end as extras,
order_time

from customer_orders;


"""
cursor.execute(query)


<sqlite3.Cursor at 0x26369359fc0>

In [30]:
query= """
select * from customer_view;
"""
cursor.execute(query)
result = cursor.fetchall()
for record in result:
    print(record)

(1, 101, 1, '', '', '2020-01-01 18:05:02')
(2, 101, 1, '', '', '2020-01-01 19:00:52')
(3, 102, 1, '', '', '2020-01-02 23:51:23')
(3, 102, 2, '', '', '2020-01-02 23:51:23')
(4, 103, 1, '4', '', '2020-01-04 13:23:46')
(4, 103, 1, '4', '', '2020-01-04 13:23:46')
(4, 103, 2, '4', '', '2020-01-04 13:23:46')
(5, 104, 1, '', '1', '2020-01-08 21:00:29')
(6, 101, 2, '', '', '2020-01-08 21:03:13')
(7, 105, 2, '', '1', '2020-01-08 21:20:29')
(8, 102, 1, '', '', '2020-01-09 23:54:33')
(9, 103, 1, '4', '1, 5', '2020-01-10 11:22:59')
(10, 104, 1, '', '', '2020-01-11 18:34:49')
(10, 104, 1, '2, 6', '1, 4', '2020-01-11 18:34:49')


#### for runners table


First let's visualize the data of runners table and match it as per requirement of ER diagram mentioned above

In [31]:
query= """
select * from runners;
"""
cursor.execute(query)
result = cursor.fetchall()
for record in result:
    print(record)

(1, '2021-01-01')
(2, '2021-01-03')
(3, '2021-01-08')
(4, '2021-01-15')


Everything is fine in this table.

#### for runner_orders table


First let's visualize the data of runner_orders table and match it as per requirement of ER diagram mentioned above

In [32]:
query= """
select * from runner_orders;
"""
cursor.execute(query)
result = cursor.fetchall()
for record in result:
    print(record)

(1, 1, '2020-01-01 18:15:34', '20km', '32 minutes', '')
(2, 1, '2020-01-01 19:10:54', '20km', '27 minutes', '')
(3, 1, '2020-01-03 00:12:37', '13.4km', '20 mins', None)
(4, 2, '2020-01-04 13:53:03', '23.4', '40', None)
(5, 3, '2020-01-08 21:10:57', '10', '15', None)
(6, 3, 'null', 'null', 'null', 'Restaurant Cancellation')
(7, 2, '2020-01-08 21:30:45', '25km', '25mins', 'null')
(8, 2, '2020-01-10 00:15:02', '23.4 km', '15 minute', 'null')
(9, 2, 'null', 'null', 'null', 'Customer Cancellation')
(10, 1, '2020-01-11 18:50:20', '10km', '10minutes', 'null')


We can see that "null" as string and null used in cancellation column, we need to address this. The data type of columns pickup_time, distcance an duration should be timestamp, numeric and integer. in distance column there is km so we need to address that, and in  duration field we have alphabets we need to remove them.


Removing these let's create another view named "ro_view"

In [33]:
cursor.execute("DROP View IF EXISTS r_view;")
query= """
create view r_view as 

select order_id, runner_id,
case
    when pickup_time  = 'null'  then NULL
    else datetime(pickup_time)
end as pickup_time,
CASE
     WHEN distance = 'null' or TRIM(distance)= "" THEN NULL
     WHEN distance LIKE '%km' THEN cast(REPLACE(distance, 'km', '') as real)
     ELSE cast(distance as real) 
   END AS distance,
CASE
     WHEN duration = 'null' or TRIM(duration)="" THEN NULL
     WHEN duration LIKE '%mins' THEN cast(REPLACE(duration, 'mins', '') as integer)
     WHEN duration LIKE '%minute' THEN cast(REPLACE(duration, 'minute', '') as integer)
     WHEN duration LIKE '%minutes' THEN cast(REPLACE(duration, 'minutes', '') as integer)
     ELSE cast(duration as integer)
   END AS duration,
case
    when cancellation = 'null' or TRIM(cancellation) =""then NULL
    else cancellation
end as cancellation

from runner_orders;


"""
cursor.execute(query)

<sqlite3.Cursor at 0x26369359fc0>

In [34]:
query= """
select * from ro_view;
"""
cursor.execute(query)
result = cursor.fetchall()
for record in result:
    print(record)

(1, 1, '2020-01-01 18:15:34', 20.0, 32, '')
(2, 1, '2020-01-01 19:10:54', 20.0, 27, '')
(3, 1, '2020-01-03 00:12:37', 13.4, 20, '')
(4, 2, '2020-01-04 13:53:03', 23.4, 40, '')
(5, 3, '2020-01-08 21:10:57', 10.0, 15, '')
(6, 3, '', None, None, 'Restaurant Cancellation')
(7, 2, '2020-01-08 21:30:45', 25.0, 25, '')
(8, 2, '2020-01-10 00:15:02', 23.4, 15, '')
(9, 2, '', None, None, 'Customer Cancellation')
(10, 1, '2020-01-11 18:50:20', 10.0, 10, '')


Other tables are fine and we are okay with cleaning and data transformation.

In [35]:
if connection:
    connection.close()