# Creating a Bus Rapid Transit (BRT) Database

A Database Engineering project using PostgreSQL

In [1]:
import json
from urllib.parse import quote

import boto3
from sqlalchemy import create_engine

In [2]:
def get_secret(secret_name='wysde', region_name='us-east-1'):
    """Fetch a JSON secret from AWS Secrets Manager and return it as a dict.

    Args:
        secret_name: Value passed to Secrets Manager as ``SecretId``.
        region_name: AWS region used for the Secrets Manager client.
            Defaults to ``'us-east-1'``, the value that was previously
            hard-coded, so existing calls behave the same.

    Returns:
        The ``SecretString`` payload of the response, parsed with
        ``json.loads``.

    Raises:
        KeyError: If the response carries no ``SecretString`` entry.
        json.JSONDecodeError: If ``SecretString`` is not valid JSON.
    """
    session = boto3.session.Session()
    client = session.client(
        service_name='secretsmanager',
        region_name=region_name)
    response = client.get_secret_value(SecretId=secret_name)
    # Parse straight from the response instead of rebinding one name to two
    # different kinds of object (raw API response, then parsed payload).
    return json.loads(response['SecretString'])

In [24]:
# Connection settings are read from the secret returned by get_secret().
secret_vals = get_secret()

postgres_endpoint = secret_vals['RDS_POSTGRES_HOST']
postgres_user = secret_vals['RDS_POSTGRES_USERNAME']
postgres_pass = secret_vals['RDS_POSTGRES_PASSWORD']
port = secret_vals['RDS_POSTGRES_PORT']
dbname = "postgres"

# Percent-encode the credentials before embedding them in the URL: characters
# such as '%', '@', '/' or ':' in a raw password would otherwise be read as
# URL syntax and yield wrong credentials or an unparsable URL.
# postgres_user / postgres_pass themselves stay unescaped for non-URL uses.
engine_string = "postgresql+psycopg2://%s:%s@%s:%s/%s" % (
    quote(postgres_user, safe=''),
    quote(postgres_pass, safe=''),
    postgres_endpoint,
    port,
    dbname,
)
engine = create_engine(engine_string)

In [46]:
# Display settings for the SqlMagic class behind %sql / %%sql
# (option meanings as described in the ipython-sql documentation):
#   autopandas   - return query results as pandas DataFrames
#   displaycon   - False: do not echo the connection string after each query
#   feedback     - False: do not print rows-affected messages
#   displaylimit - cap on the number of rows displayed for a result set
%config SqlMagic.autopandas=True
%config SqlMagic.displaycon=False
%config SqlMagic.feedback=False
%config SqlMagic.displaylimit=5
# Load (or reload) the extension that provides the %sql and %%sql magics.
%reload_ext sql

In [53]:
# Connect the %sql magic to the database described by engine_string.
%sql $engine_string

## DDL

In [16]:
%%sql

-- Drivers employed by the BRT service.
CREATE TABLE "drivers" (
  "id" SERIAL PRIMARY KEY,
  "first_name" varchar(20) NOT NULL,
  "last_name" varchar(20) NOT NULL,
  -- Allowed codes are M, F and the letter O, the same set the passengers table uses
  -- (this check previously listed the digit zero instead of the letter O).
  "gender" char(1) NOT NULL CHECK(gender IN ('M', 'F', 'O')),
  "email_address" varchar(50) UNIQUE NOT NULL,
  "address" varchar(100) NOT NULL,
  -- Age rule for drivers. It compares calendar years only, so it passes when the
  -- birth year is at least 26 years before the current year. The column is
  -- nullable, and a NULL date of birth satisfies the check.
  "date_of_birth" date CHECK((date_part('year', NOW()) - date_part('year', date_of_birth)) > 25),
  "national_identity_number" char(11) UNIQUE NOT NULL
);

-- Registered riders. Gender is restricted to the codes M, F or O and every
-- e-mail address may appear only once.
CREATE TABLE passengers (
  id            SERIAL PRIMARY KEY,
  first_name    VARCHAR(20)  NOT NULL,
  last_name     VARCHAR(20)  NOT NULL,
  gender        CHAR(1)      NOT NULL CHECK (gender IN ('M', 'F', 'O')),
  email_address VARCHAR(50)  NOT NULL UNIQUE,
  address       VARCHAR(100) NOT NULL
);

-- Fleet register. VIN and plate number are each unique, and status must be
-- one of the four listed values.
CREATE TABLE vehicles (
  id           SERIAL PRIMARY KEY,
  vin          VARCHAR(20) NOT NULL UNIQUE,
  plate_number VARCHAR(10) NOT NULL UNIQUE,
  model        VARCHAR(50) NOT NULL,
  capacity     INTEGER     NOT NULL,
  status       VARCHAR(20) NOT NULL
               CHECK (status IN ('Available', 'In use', 'Under maintenance', 'Out of use'))
);

In [17]:
%%sql

-- One journey operated by a single driver.
CREATE TABLE trips (
  id                 SERIAL PRIMARY KEY,
  departure_location VARCHAR(20) NOT NULL,
  -- A trip has to end somewhere other than where it started.
  arrival_location   VARCHAR(20) NOT NULL CHECK (arrival_location <> departure_location),
  departure_time     TIMESTAMP   NOT NULL,
  -- Arrival must be strictly later than departure.
  arrival_time       TIMESTAMP   NOT NULL CHECK (arrival_time > departure_time),
  -- Key changes and deletions on drivers propagate to their trips.
  driver_id          INTEGER     NOT NULL
                     REFERENCES drivers (id) ON UPDATE CASCADE ON DELETE CASCADE,
  -- A trip must have room for at least one passenger.
  trip_capacity      INTEGER     NOT NULL CHECK (trip_capacity > 0)
);

-- Travel cards issued to passengers. Card number and phone number are each
-- unique, and issue_date defaults to the date the row is inserted.
CREATE TABLE passenger_cards (
  id           SERIAL PRIMARY KEY,
  card_number  VARCHAR(10) NOT NULL UNIQUE,
  issue_date   DATE        DEFAULT NOW(),
  phone_number VARCHAR(12) NOT NULL UNIQUE,
  -- Cards follow their passenger on key updates and are removed with the passenger.
  passenger_id INTEGER     NOT NULL
               REFERENCES passengers (id) ON UPDATE CASCADE ON DELETE CASCADE
);

-- Junction table recording which passenger rode which trip and the fare paid.
CREATE TABLE passenger_trips (
  id           SERIAL,
  passenger_id INTEGER NOT NULL
               REFERENCES passengers (id) ON UPDATE CASCADE ON DELETE CASCADE,
  -- RESTRICT blocks deleting a trip that still has passenger rows attached.
  trip_id      INTEGER NOT NULL
               REFERENCES trips (id) ON DELETE RESTRICT,
  price        NUMERIC NOT NULL,
  -- The composite key makes each passenger and trip pair unique and non-null.
  PRIMARY KEY (passenger_id, trip_id)
);

-- History of driver and vehicle pairings. pair_date defaults to the date the
-- row is inserted.
CREATE TABLE driver_vehicle_logs (
  id         SERIAL PRIMARY KEY,
  driver_id  INTEGER NOT NULL
             REFERENCES drivers (id) ON UPDATE CASCADE ON DELETE CASCADE,
  vehicle_id INTEGER NOT NULL
             REFERENCES vehicles (id) ON UPDATE CASCADE ON DELETE CASCADE,
  pair_date  DATE    DEFAULT NOW()
);

-- Identification cards keyed by card number. The id column is a plain serial
-- counter and is not part of the key.
CREATE TABLE driver_identification_cards (
  id         SERIAL,
  card_no    CHAR(5) PRIMARY KEY,
  issue_date DATE    DEFAULT NOW(),
  -- UNIQUE plus NOT NULL gives every card exactly one driver and every driver
  -- at most one card.
  driver_id  INTEGER NOT NULL UNIQUE
             REFERENCES drivers (id) ON UPDATE CASCADE ON DELETE CASCADE
);

-- Contact details for drivers. Each phone number may be stored only once.
CREATE TABLE driver_additional_details (
  id           SERIAL PRIMARY KEY,
  phone_number VARCHAR(12) NOT NULL UNIQUE,
  driver_id    INTEGER     NOT NULL
               REFERENCES drivers (id) ON UPDATE CASCADE ON DELETE CASCADE
);

-- Driving licenses, at most one per driver.
CREATE TABLE license (
  id             SERIAL PRIMARY KEY,
  license_number VARCHAR(12) NOT NULL UNIQUE,
  -- The issue year must lie more than one calendar year before the current year.
  issue_date     DATE NOT NULL
                 CHECK ((date_part('year', NOW()) - date_part('year', issue_date)) > 1),
  expiry_date    DATE NOT NULL CHECK (expiry_date > issue_date),
  -- Starts as the placeholder TBD until a later UPDATE classifies the license.
  license_status VARCHAR(7) DEFAULT 'TBD',
  -- UNIQUE keeps the relationship with drivers one-to-one.
  driver_id      INTEGER UNIQUE
                 REFERENCES drivers (id) ON UPDATE CASCADE ON DELETE CASCADE
);

In [20]:
%%sql

-- Classify licenses that still carry the TBD placeholder. A license is Expired
-- once its expiry_date lies before today. Comparing only the year parts would
-- leave a license that lapsed earlier in the current year marked Valid.
-- Run this after the license rows are loaded, since on an empty table it
-- changes nothing.
UPDATE license
SET license_status = (CASE WHEN expiry_date < CURRENT_DATE THEN 'Expired' ELSE 'Valid' END)
WHERE license_status = 'TBD';

## DML

In [None]:
# Rebuild the connection URL with percent-encoded credentials so that special
# characters in the user name or password ('%', '@', '/', ':') are not read as
# URL syntax.
engine_string = "postgresql+psycopg2://%s:%s@%s:%s/%s" % (
    quote(postgres_user, safe=''),
    quote(postgres_pass, safe=''),
    postgres_endpoint,
    port,
    dbname,
)

In [None]:
# Check that psql can reach the database. A notebook cell cannot type into an
# interactive psql prompt, so run one meta-command (\conninfo) and return.
# The password is single-quoted so the shell does not split or expand it.
!PGPASSWORD='$postgres_pass' psql --host=$postgres_endpoint --port=$port --username=$postgres_user --dbname=$dbname --command='\conninfo'

In [None]:
# Load drivers from CSV with psql's client-side \copy. The meta-command has to
# be on a single line and is passed via --command so the cell runs
# non-interactively from the notebook.
!PGPASSWORD='$postgres_pass' psql --host=$postgres_endpoint --port=$port --username=$postgres_user --dbname=$dbname --command="\copy drivers(id, first_name, last_name, gender, email_address, address, date_of_birth, national_identity_number) from './data/drivers.csv' with delimiter ',' csv header"

In [None]:
# Load passengers from CSV with psql's client-side \copy. The meta-command has
# to be on a single line and is passed via --command so the cell runs
# non-interactively from the notebook.
!PGPASSWORD='$postgres_pass' psql --host=$postgres_endpoint --port=$port --username=$postgres_user --dbname=$dbname --command="\copy passengers(id, first_name, last_name, gender, email_address, address) from './data/passengers.csv' with delimiter ',' csv header"

In [None]:
# Load vehicles from CSV with psql's client-side \copy. The meta-command has to
# be on a single line and is passed via --command so the cell runs
# non-interactively from the notebook.
!PGPASSWORD='$postgres_pass' psql --host=$postgres_endpoint --port=$port --username=$postgres_user --dbname=$dbname --command="\copy vehicles(id, vin, plate_number, model, capacity, status) from './data/vehicles.csv' with delimiter ',' csv header"

In [None]:
# Load trips from CSV with psql's client-side \copy. The meta-command has to be
# on a single line and is passed via --command so the cell runs
# non-interactively from the notebook. trips references drivers, so the
# drivers rows must already be loaded.
!PGPASSWORD='$postgres_pass' psql --host=$postgres_endpoint --port=$port --username=$postgres_user --dbname=$dbname --command="\copy trips(id, departure_location, arrival_location, departure_time, arrival_time, driver_id, trip_capacity) from './data/trips.csv' with delimiter ',' csv header"

In [None]:
# Load passenger_cards with psql's client-side \copy. This file is
# tab-delimited; the delimiter is written as E'\t' so the tab is visible
# instead of being an invisible literal character. The meta-command has to be
# on a single line and is passed via --command so the cell runs
# non-interactively from the notebook.
!PGPASSWORD='$postgres_pass' psql --host=$postgres_endpoint --port=$port --username=$postgres_user --dbname=$dbname --command="\copy passenger_cards(id, card_number, issue_date, phone_number, passenger_id) from './data/passenger_cards.csv' with delimiter E'\t' csv header"

In [None]:
# Load passenger_trips from CSV with psql's client-side \copy. The meta-command
# has to be on a single line and is passed via --command so the cell runs
# non-interactively from the notebook. The table references passengers and
# trips, so both must already be loaded.
!PGPASSWORD='$postgres_pass' psql --host=$postgres_endpoint --port=$port --username=$postgres_user --dbname=$dbname --command="\copy passenger_trips(id, passenger_id, trip_id, price) from './data/passenger_trips.csv' with delimiter ',' csv header"

In [None]:
# Load driver_vehicle_logs from CSV with psql's client-side \copy. The
# meta-command has to be on a single line and is passed via --command so the
# cell runs non-interactively from the notebook. The table references drivers
# and vehicles, so both must already be loaded.
!PGPASSWORD='$postgres_pass' psql --host=$postgres_endpoint --port=$port --username=$postgres_user --dbname=$dbname --command="\copy driver_vehicle_logs(id, driver_id, vehicle_id, pair_date) from './data/driver_vehicle_logs.csv' with delimiter ',' csv header"

In [None]:
# Load driver_identification_cards from CSV with psql's client-side \copy. The
# meta-command has to be on a single line and is passed via --command so the
# cell runs non-interactively from the notebook.
!PGPASSWORD='$postgres_pass' psql --host=$postgres_endpoint --port=$port --username=$postgres_user --dbname=$dbname --command="\copy driver_identification_cards(id, card_no, issue_date, driver_id) from './data/driver_identification_cards.csv' with delimiter ',' csv header"

In [None]:
# Load driver_additional_details with psql's client-side \copy. This file is
# tab-delimited; the delimiter is written as E'\t' so the tab is visible
# instead of being an invisible literal character. The meta-command has to be
# on a single line and is passed via --command so the cell runs
# non-interactively from the notebook.
!PGPASSWORD='$postgres_pass' psql --host=$postgres_endpoint --port=$port --username=$postgres_user --dbname=$dbname --command="\copy driver_additional_details(id, phone_number, driver_id) from './data/driver_additional_details.csv' with delimiter E'\t' csv header"

In [None]:
# Load license from CSV with psql's client-side \copy. The meta-command has to
# be on a single line and is passed via --command so the cell runs
# non-interactively from the notebook.
!PGPASSWORD='$postgres_pass' psql --host=$postgres_endpoint --port=$port --username=$postgres_user --dbname=$dbname --command="\copy license(id, license_number, issue_date, expiry_date, license_status, driver_id) from './data/license.csv' with delimiter ',' csv header"

## Running Test Queries

To test the database, I ran a few simple queries. The first finds the ten passengers with the highest number of trips, because the company is looking to offer a discount to frequent passengers based on the number of trips taken. Note that the database is already normalized at this point, so we have to join two tables in this case: passenger_trips and passengers. The query looks like this:

In [50]:
%%sql

-- Ten passengers with the most trips.
-- GROUP BY already yields one row per passenger_id, so no DISTINCT is needed.
-- The GROUP BY DISTINCT form is also rejected by PostgreSQL versions before 14.
WITH trip_counts AS
(
  SELECT
   passenger_id,
   COUNT(trip_id) AS no_of_trips
  FROM passenger_trips
  GROUP BY passenger_id
)

SELECT
 first_name,
 last_name,
 gender,
 email_address,
 no_of_trips
FROM passengers p
JOIN trip_counts tc
 ON tc.passenger_id = p.id
-- Passengers tied on no_of_trips come back in no guaranteed order.
ORDER BY no_of_trips DESC
LIMIT 10;

Unnamed: 0,first_name,last_name,gender,email_address,no_of_trips
0,Chibuike,Afolayan,F,Chibuike.Afolayan@example.com,13
1,Chinyere,Chikere,F,Chinyere.Chikere@example.com,12
2,Ekene,Olujimi,F,Ekene.Olujimi@example.com,10
3,Udo,Dayo,M,Udo.Dayo@example.com,10
4,Chidike,Afolabi,F,Chidike.Afolabi@example.com,10
5,Nnamdi,Chizoba,M,Nnamdi.Chizoba@example.com,9
6,Gbemisola,Uju,M,Gbemisola.Uju@example.com,9
7,Alheri,Abeni,F,Alheri.Abeni@example.com,9
8,Ogechukwu,Yejide,F,Ogechukwu.Yejide@example.com,9
9,Olayinka,Ndidi,M,Olayinka.Ndidi@example.com,8


Alternatively, to continue with the vehicle–driver pairings, we need to know which vehicles are currently available. We could run a query like this:

In [51]:
%%sql

-- Vehicles whose status is Available, with every column in table order.
SELECT
  v.id,
  v.vin,
  v.plate_number,
  v.model,
  v.capacity,
  v.status
FROM vehicles AS v
WHERE v.status = 'Available';

Unnamed: 0,id,vin,plate_number,model,capacity,status
0,11,6EEZYNBREQY832389,FST-967TC,Marcopolo Scania,40,Available
1,12,1QQOBLYFSUM724853,LSR-897KO,Marcopolo Ideale,50,Available
2,18,5DMYGWOZJTV514160,LSR-287VA,Marcopolo Ideale,50,Available
3,19,4DPRIZUXSLX671267,AAA-615NC,Primero,60,Available
4,23,8GJRLZJGHBP298090,AAA-318RT,Primero,60,Available
5,33,7QWTLYJTMVP939100,FST-527QU,Marcopolo Scania,40,Available
6,38,6XTBVHQJGWP755658,AAA-811QT,Primero,60,Available
7,40,3BHDQKDPSVQ725547,LSR-155SW,Marcopolo Ideale,50,Available
8,41,4QZUABTOQYL753134,LSR-375SI,Marcopolo Ideale,50,Available
9,42,8SGLRAZRNIK393794,FST-283ON,Marcopolo Scania,40,Available


## Creating Indexes

So far, we have been able to run basic tests on our newly formed database with simple queries, but searching and filtering operations will begin to underperform as the amount of data grows. Our database is still fairly small (the largest table holds about 1,000 rows), so we could disregard the need for indexes for now: at this size the DBMS would rather run a sequential scan than an index or bitmap scan, especially when a filter returns many rows. However, we should plan for growth, and hence indexes were added.

In [54]:
%%sql

-- INDEX DEFINITION
-- PostgreSQL builds a unique index for every PRIMARY KEY and UNIQUE constraint.
-- Hand-made indexes on those same columns only add write and storage cost, so
-- they are dropped here (in case an earlier run created them) and not rebuilt.

DROP INDEX IF EXISTS brt_vehicles_id_idx;
DROP INDEX IF EXISTS brt_trips_id;
DROP INDEX IF EXISTS brt_driver_identification_cards_driver_id;
DROP INDEX IF EXISTS brt_license_driver_id;
DROP INDEX IF EXISTS brt_passenger_trips_passenger_id_trip_id_idx;

-- Physically order the rows by primary key, using the index that the
-- PRIMARY KEY constraint created (default name is the table name plus _pkey).
CLUSTER vehicles USING vehicles_pkey;
CLUSTER trips USING trips_pkey;

-- Pairing lookups by driver, or by driver and vehicle together.
DROP INDEX IF EXISTS brt_driver_vehicle_logs_driver_id_vehicle_id_idx;
CREATE INDEX brt_driver_vehicle_logs_driver_id_vehicle_id_idx
ON driver_vehicle_logs (driver_id, vehicle_id);

-- Searches for a driver by name.
DROP INDEX IF EXISTS brt_drivers_first_name_last_name_idx;
CREATE INDEX brt_drivers_first_name_last_name_idx
ON drivers (first_name, last_name);

-- Foreign key columns that have no constraint-backed index of their own.
DROP INDEX IF EXISTS brt_driver_additional_details_driver_id_idx;
CREATE INDEX brt_driver_additional_details_driver_id_idx
ON driver_additional_details (driver_id);

DROP INDEX IF EXISTS brt_trips_driver_id;
CREATE INDEX brt_trips_driver_id
ON trips (driver_id);

DROP INDEX IF EXISTS brt_passenger_cards_passenger_id_idx;
CREATE INDEX brt_passenger_cards_passenger_id_idx
ON passenger_cards (passenger_id);

-- The composite primary key leads with passenger_id, so it does not serve
-- lookups by trip_id alone. Those need their own index.
DROP INDEX IF EXISTS brt_passenger_trips_trip_id_idx;
CREATE INDEX brt_passenger_trips_trip_id_idx
ON passenger_trips (trip_id);