In [1]:
import os

import psycopg2

In [None]:
# Connection and target settings. Every value is an empty placeholder that
# has to be filled in before the cells below are run.
# Server host name (assumed from the name — TODO confirm how it is consumed).
HOST = ""
# Role name handed to psycopg2.connect as `user`.
USER = ""
# Password handed to psycopg2.connect as `password`.
PASSWORD = ""
# Database name handed to psycopg2.connect as `dbname`.
DATABASE = ""
# Schema name interpolated into the CREATE SCHEMA and CREATE TABLE statements.
SCHEMA = ""

In [2]:
# Open the administrative connection used to build every database object.
# HOST from the configuration cell is now honoured; an empty value is mapped
# to None so that no host is sent and the connection behaves exactly as it
# did when HOST was ignored.
conn = psycopg2.connect(
    host=HOST or None,
    dbname=DATABASE,
    user=USER,
    password=PASSWORD,
)
cur = conn.cursor()

# SCHEMA is a constant set by the notebook author, so it is interpolated
# directly; identifiers cannot be passed as bound query parameters.
cur.execute(f'CREATE SCHEMA {SCHEMA};')

## Exploring the Dataset File

I opened the file `boston.csv` and extracted the header (column names) and the first data row.

In [3]:
import csv
# Read only the header row and the first data row to get a feel for the file.
# newline='' leaves line-ending handling to the csv module, as its
# documentation requires, so quoted fields containing line breaks are parsed
# correctly.
with open('boston.csv', 'r', newline='') as f:
    reader = csv.reader(f)
    col_headers = next(reader)  # column names
    first_row = next(reader)    # first record, kept as raw strings

I created the function `get_col_value_set()` to produce a Python set that contains all the distinct values in a given column. `get_col_value_set` accepts 2 inputs: the filename (`csv_filename`) and index of the desired column (`col_index`). I then used `get_col_value_set` to calculate the number of distinct values per column of the dataset. 

In [4]:
def get_col_value_set(csv_filename, col_index):
    """Return the set of distinct values found in one column of a CSV file.

    The first record of the file is treated as the header and is not included
    in the result. Values are returned as the raw strings produced by the csv
    parser.

    Args:
        csv_filename: Path of the CSV file to read.
        col_index: Zero-based index of the column to collect.

    Returns:
        A set of strings. An empty file, or a file holding only a header,
        yields an empty set.

    Raises:
        IndexError: If a non-blank record has no field at ``col_index``.
    """
    # newline='' lets the csv module handle line endings itself, which is
    # needed for quoted fields that contain line breaks.
    with open(csv_filename, 'r', newline='') as f:
        reader = csv.reader(f)
        # Skip the header through the parser rather than the raw file so a
        # quoted, multi-line header counts as one record; the default keeps an
        # empty file from raising StopIteration.
        next(reader, None)
        # The csv reader yields [] for blank lines; those carry no data.
        return {row[col_index] for row in reader if row}

# Report how many distinct values each column holds, one column per line.
for col_position, col_name in enumerate(col_headers):
    unique_values = get_col_value_set('boston.csv', col_position)
    print(col_name, len(unique_values))

# Separator, then the header and first record for side-by-side comparison.
print('\n')
print(col_headers)
print(first_row)
    

incident_number 298329
offense_code 219
description 239
date 1177
day_of_the_week 7
lat 18177
long 18177


['incident_number', 'offense_code', 'description', 'date', 'day_of_the_week', 'lat', 'long']
['1', '619', 'LARCENY ALL OTHERS', '2018-09-02', 'Sunday', '42.35779134', '-71.13937053']


From the results shown above, values in `incident_number` and `offense_code` columns can be represented using the data type `integer`, while those in `lat` and `long` columns can be expressed as `decimal`. Columns `description` and `day_of_the_week` show string values. The data type `varchar(n)` can be used to represent the values in the `description` column, but I need to specify the maximum number of characters `n`. 

I determined the maximum length of the string values in the `description` column to be `58`, as shown below. Thus, `n` can be any number greater than or equal to `58`.

Due to the small number of distinct values in the `day_of_the_week` column, I decided to use the *enumerated* data type. Lastly, for the column `date`, I used the data type `date`. Please see the PostgreSQL documentation on [data types](https://www.postgresql.org/docs/9.5/datatype.html) for more information. 

In [5]:
# Column index 2 is `description` (see the header printed earlier).
col_values = get_col_value_set('boston.csv', 2)

# Longest string among the distinct descriptions; 0 when there are none.
max_len = max((len(text) for text in col_values), default=0)

print(max_len)

58


## Creating the Table with Appropriate Data Types

Before creating the table, I first created the data type `DAY_ENUM` to define the 7 distinct values found in the column `day_of_the_week`. I then created the table `boston_crimes` using the following data types:

* `incident_number` - `INTEGER`
* `offense_code` - `INTEGER`
* `description` - `VARCHAR(100)`
* `date` - `DATE`
* `day_of_the_week` - `DAY_ENUM`
* `lat` - `DECIMAL`
* `long` - `DECIMAL`

In [6]:
# Enumerated type listing the seven weekday names. It is created without a
# schema qualifier.
create_day_enum_sql = '''
    CREATE TYPE DAY_ENUM AS ENUM (
        'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday');
'''

# Table definition under the configured schema; incident_number is the key.
create_table_sql = f'''
    CREATE TABLE {SCHEMA}.boston_crimes (
        incident_number INTEGER PRIMARY KEY, 
        offense_code INTEGER, 
        description VARCHAR(100), 
        date DATE, 
        day_of_the_week DAY_ENUM, 
        lat DECIMAL, 
        long DECIMAL);
'''

# The type has to exist before the table that references it.
for ddl_statement in (create_day_enum_sql, create_table_sql):
    cur.execute(ddl_statement)

## Loading of Data into the Table

I then loaded the data from `boston.csv` into the table `boston_crimes`, which is under the schema `crimes`. I printed the first row of the table and the number of rows to check whether the data was loaded successfully. The number of rows (`298329`) is the same as the number of distinct values previously calculated for the `incident_number` column, which is consistent with every row of `boston.csv` having been loaded into `boston_crimes`.

In [7]:
# Stream the CSV into the table with COPY. The target uses the configured
# SCHEMA, the same schema the table was created under, instead of a
# hard-coded name. The HEADER option tells PostgreSQL to skip the first line.
copy_sql = f'COPY {SCHEMA}.boston_crimes FROM STDIN WITH CSV HEADER;'
with open('boston.csv', 'r') as f:
    cur.copy_expert(copy_sql, f)

# Sanity checks. Ask the server for exactly what is displayed rather than
# pulling every row into memory just to look at one of them and count them.
# Ordering by the primary key makes the displayed row deterministic.
cur.execute(f'SELECT * FROM {SCHEMA}.boston_crimes ORDER BY incident_number LIMIT 1;')
print(cur.fetchone())

cur.execute(f'SELECT COUNT(*) FROM {SCHEMA}.boston_crimes;')
print(cur.fetchone()[0])

(1, 619, 'LARCENY ALL OTHERS', datetime.date(2018, 9, 2), 'Sunday', Decimal('42.35779134'), Decimal('-71.13937053'))
298329


## Creating User Groups with Appropriate Privileges

Before creating the user groups, I revoked all privileges that the `public` group holds on the `public` schema and on the database, so that no privileges are inherited from `public`. This follows the [least privilege principle](https://www.upguard.com/blog/principle-of-least-privilege).

In [8]:
# Strip the default privileges of the `public` pseudo-role, first on the
# `public` schema and then on the database itself.
revoke_statements = (
    'REVOKE ALL ON SCHEMA public FROM public;',
    f'REVOKE ALL ON DATABASE {DATABASE} FROM public;',
)
for revoke_sql in revoke_statements:
    cur.execute(revoke_sql)

I created 2 user groups: `readonly` and `readwrite`. For both groups, I granted the following privileges:

* Connection to the `crime_db` database
* Usage of the `crimes` schema
* Selection of data from all tables in schema `crimes`

For specific privileges, only the `readwrite` group can insert, delete, and update data in all tables in schema `crimes`. 

In [9]:
# Table-level privileges per group. Both groups additionally receive CONNECT
# on the database and USAGE on the schema.
group_table_privileges = {
    'readonly': 'SELECT',
    'readwrite': 'SELECT, INSERT, DELETE, UPDATE',
}

# The grants target the configured SCHEMA (the schema the table was created
# under) rather than a hard-coded schema name, so the cell keeps working when
# SCHEMA is set to something other than `crimes`.
for group_name, table_privileges in group_table_privileges.items():
    cur.execute(f'CREATE GROUP {group_name} NOLOGIN;')
    cur.execute(f'GRANT CONNECT ON DATABASE {DATABASE} TO {group_name};')
    cur.execute(f'GRANT USAGE ON SCHEMA {SCHEMA} TO {group_name};')
    cur.execute(
        f'GRANT {table_privileges} ON ALL TABLES IN SCHEMA {SCHEMA} TO {group_name};'
    )

## Creating User for each Group

I created users `data_analyst` and `data_scientist` and assigned them to `readonly` and `readwrite` groups, respectively. 

In [10]:
# One entry per login user: (user name, group to join, environment variable
# that may supply the password, fallback password).
# The fallbacks keep the notebook runnable as before; set the environment
# variables to avoid relying on passwords that are written in the notebook.
user_specs = [
    ('data_analyst', 'readonly', 'DATA_ANALYST_PASSWORD', 'secret1'),
    ('data_scientist', 'readwrite', 'DATA_SCIENTIST_PASSWORD', 'secret2'),
]

for user_name, group_name, password_env_var, fallback_password in user_specs:
    user_password = os.environ.get(password_env_var, fallback_password)
    # The password is sent as a bound parameter so psycopg2 quotes it safely,
    # whatever characters it contains.
    cur.execute(f'CREATE USER {user_name} WITH PASSWORD %s;', (user_password,))
    cur.execute(f'GRANT {group_name} TO {user_name};')

# Persist everything executed on this connection, then release it.
conn.commit()
conn.close()

## Testing the Database

I used SQL queries to check if correct objects were created and that users and groups have the right privileges. First, I queried the [pg_roles](https://www.postgresql.org/docs/10/view-pg-roles.html) view to give information about the following database roles: `postgres`, `readonly`, `readwrite`, `data_analyst`, and `data_scientist` (the column `rolname`). The results show the following:

* Only the role `postgres` has superuser privileges (`rolsuper`), can create more roles (`rolcreaterole`), and can create databases (`rolcreatedb`); it can also log in (`rolcanlogin`).
* As user groups, the roles `readonly` and `readwrite` have no superuser privileges, can't create roles or databases, and can't log in.
* Both `data_analyst` and `data_scientist` can only log in. 

In [11]:
# Fresh connection for the verification queries. HOST from the configuration
# cell is honoured; an empty value becomes None so that no host is sent and
# the connection behaves as it did when HOST was ignored.
conn2 = psycopg2.connect(
    host=HOST or None,
    dbname=DATABASE,
    user=USER,
    password=PASSWORD,
)
cur2 = conn2.cursor()

# Role attributes for the superuser, the two groups and the two users.
cur2.execute(""" 
    SELECT
        rolname,
        rolsuper,
        rolcreaterole,
        rolcreatedb,
        rolcanlogin
    FROM pg_roles
    WHERE rolname IN ('postgres', 'readonly', 'readwrite', 'data_analyst', 'data_scientist');
""")
cur2.fetchall()

[('postgres', True, True, True, True),
 ('readonly', False, False, False, False),
 ('readwrite', False, False, False, False),
 ('data_analyst', False, False, False, True),
 ('data_scientist', False, False, False, True)]

I also queried the [table_privileges](https://www.postgresql.org/docs/9.1/infoschema-table-privileges.html) view to identify all privileges granted on tables or views to enabled roles. The results of the query show that the `readonly` group can only select data from the table `boston_crimes` under schema `crimes` in database `crime_db`. On the other hand, aside from selecting data, the `readwrite` group can also insert, update, and delete information in the `boston_crimes` table.

In [12]:
# Table-level privileges held by the two groups.
cur2.execute("""
    SELECT 
        grantor,
        grantee,
        table_catalog,
        table_schema,
        table_name,
        privilege_type
    FROM information_schema.table_privileges
    WHERE grantee IN ('readonly', 'readwrite');
""")
table_privileges = cur2.fetchall()

# This is the last query of the notebook, so release the verification
# connection instead of leaving it open for the lifetime of the kernel.
conn2.close()

# Bare last expression so the notebook displays the rows.
table_privileges

[('postgres', 'readonly', 'crime_db', 'crimes', 'boston_crimes', 'SELECT'),
 ('postgres', 'readwrite', 'crime_db', 'crimes', 'boston_crimes', 'INSERT'),
 ('postgres', 'readwrite', 'crime_db', 'crimes', 'boston_crimes', 'SELECT'),
 ('postgres', 'readwrite', 'crime_db', 'crimes', 'boston_crimes', 'UPDATE'),
 ('postgres', 'readwrite', 'crime_db', 'crimes', 'boston_crimes', 'DELETE')]

# Conclusion

In this project, I built a database using PostgreSQL and created a table with proper data types for storing the information from a CSV file. I also created user groups with appropriate privileges, then created users and assigned them to these groups. Lastly, I verified that the privileges of the user groups were set correctly.