## Setup

In [2]:
import json
from urllib.parse import quote

import boto3
import pandas as pd
from sqlalchemy import create_engine

In [3]:
def get_secret(secret_name, region_name="us-east-1"):
    """Fetch a secret from AWS Secrets Manager and decode it as JSON.

    Args:
        secret_name: Identifier passed as ``SecretId`` to ``get_secret_value``.
        region_name: AWS region used to build the Secrets Manager client.

    Returns:
        The value produced by ``json.loads`` on the response's
        ``SecretString`` field.
    """
    secrets_client = boto3.session.Session().client(
        service_name='secretsmanager',
        region_name=region_name,
    )
    response = secrets_client.get_secret_value(SecretId=secret_name)
    # The payload is stored as a JSON document inside 'SecretString'.
    return json.loads(response['SecretString'])

# Pull the database credentials from the "wysde" secret.
creds = get_secret("wysde")
USERNAME = creds["RDS_POSTGRES_USERNAME"]
PASSWORD = creds["RDS_POSTGRES_PASSWORD"]
HOST = creds["RDS_POSTGRES_HOST"]
DATABASE = 'sparsh'

# Percent-encode the user and password so that reserved URL characters
# (e.g. "@", ":", "/", "%") inside the secret cannot corrupt the connection
# string; values made only of unreserved characters are left unchanged.
CONN = (
    f"postgresql://{quote(USERNAME, safe='')}:{quote(PASSWORD, safe='')}"
    f"@{HOST}:5432/{DATABASE}"
)

# Engine used by pandas (to_sql); CONN itself is also handed to %sql.
engine = create_engine(CONN)
conn = engine.connect()

In [9]:
# Name of the Postgres table the CSV is written to and that the SQL cells query.
TABLE = "people_analytics"

In [8]:
%config SqlMagic.autopandas=True
%config SqlMagic.displaycon=False
%config SqlMagic.feedback=False
%config SqlMagic.displaylimit=5
%reload_ext sql
%sql {CONN}

## Data Ingestion

In [4]:
# Read the starting employee dataset from the local data directory.
df = pd.read_csv("./data/people_analytics_start.csv")
# Preview the first rows.
df.head()

Unnamed: 0,employee_id,department,sub-department,first_level_manager,second_level_manager,third_level_manager,fourth_level_manager,job_level,gender,sexual_orientation,...,location_city,marital_status,employment_status,salary,hire_date,term_date,tenure,term_type,term_reason,active_status
0,4566010041,Sales,Business Development,2169536929,12104572130,,,Manager,Male,Heterosexual,...,San Jose,Married,Full Time,101989,2017-06-28,,68,,,1
1,7563277100,Software,Technical Support,9617891304,2870084555,9368442131.0,12104570000.0,Individual Contributor,Male,Heterosexual,...,Chicago,Single,Full Time,98059,2012-05-27,,130,,,1
2,901750037,Finance,Accounting,9453398201,8876294130,3225775822.0,12104570000.0,Individual Contributor,Male,Heterosexual,...,Austin,Married,Full Time,65444,2017-02-12,,72,,,1
3,5969184373,Marketing,Public Relations,1486955622,4057075270,2813655685.0,12104570000.0,Individual Contributor,Female,Heterosexual,...,Seattle,Single,Full Time,90060,2020-06-27,,31,,,1
4,3294917953,Software,QA,9606060417,6220406640,2107182373.0,12104570000.0,Individual Contributor,Female,Heterosexual,...,San Antonio,Single,Full Time,56973,2014-09-08,,102,,,1


In [27]:
# Swap hyphens for underscores in the column names (e.g. "sub-department"
# becomes "sub_department"), matching the names used by the SQL cells.
df.columns = df.columns.str.replace('-', '_', regex=False)
df.head()

Unnamed: 0,employee_id,department,sub_department,first_level_manager,second_level_manager,third_level_manager,fourth_level_manager,job_level,gender,sexual_orientation,...,location_city,marital_status,employment_status,salary,hire_date,term_date,tenure,term_type,term_reason,active_status
0,4566010041,Sales,Business Development,2169536929,12104572130,,,Manager,Male,Heterosexual,...,San Jose,Married,Full Time,101989,2017-06-28,,68,,,1
1,7563277100,Software,Technical Support,9617891304,2870084555,9368442131.0,12104570000.0,Individual Contributor,Male,Heterosexual,...,Chicago,Single,Full Time,98059,2012-05-27,,130,,,1
2,901750037,Finance,Accounting,9453398201,8876294130,3225775822.0,12104570000.0,Individual Contributor,Male,Heterosexual,...,Austin,Married,Full Time,65444,2017-02-12,,72,,,1
3,5969184373,Marketing,Public Relations,1486955622,4057075270,2813655685.0,12104570000.0,Individual Contributor,Female,Heterosexual,...,Seattle,Single,Full Time,90060,2020-06-27,,31,,,1
4,3294917953,Software,QA,9606060417,6220406640,2107182373.0,12104570000.0,Individual Contributor,Female,Heterosexual,...,San Antonio,Single,Full Time,56973,2014-09-08,,102,,,1


In [28]:
# Write the frame to the database, replacing any existing table of the same
# name; the DataFrame index is not stored as a column.
df.to_sql(
    name=TABLE,
    con=engine,
    if_exists='replace',
    index=False,
    method='multi',
)

4138

In [29]:
%%sql

--Preview the first 10 rows of the loaded table
SELECT * FROM {TABLE} LIMIT 10

Unnamed: 0,employee_id,department,sub_department,first_level_manager,second_level_manager,third_level_manager,fourth_level_manager,job_level,gender,sexual_orientation,...,location_city,marital_status,employment_status,salary,hire_date,term_date,tenure,term_type,term_reason,active_status
0,4566010041,Sales,Business Development,2169536929,12104572130,,,Manager,Male,Heterosexual,...,San Jose,Married,Full Time,101989,2017-06-28,,68,,,1
1,7563277100,Software,Technical Support,9617891304,2870084555,9368442131.0,12104570000.0,Individual Contributor,Male,Heterosexual,...,Chicago,Single,Full Time,98059,2012-05-27,,130,,,1
2,901750037,Finance,Accounting,9453398201,8876294130,3225775822.0,12104570000.0,Individual Contributor,Male,Heterosexual,...,Austin,Married,Full Time,65444,2017-02-12,,72,,,1
3,5969184373,Marketing,Public Relations,1486955622,4057075270,2813655685.0,12104570000.0,Individual Contributor,Female,Heterosexual,...,Seattle,Single,Full Time,90060,2020-06-27,,31,,,1
4,3294917953,Software,QA,9606060417,6220406640,2107182373.0,12104570000.0,Individual Contributor,Female,Heterosexual,...,San Antonio,Single,Full Time,56973,2014-09-08,,102,,,1
5,9733816619,Finance,Financial Planning,823014046,6711801205,5312370235.0,12104570000.0,Individual Contributor,Female,Heterosexual,...,Indianapolis,Single,Full Time,59564,2014-03-31,,107,,,1
6,1363350907,Software,Technical Support,4422482610,3331241071,6268712051.0,12104570000.0,Individual Contributor,Male,Heterosexual,...,Los Angeles,Single,Full Time,72231,2019-08-04,,42,,,1
7,8079838913,HR,Training,2010047725,977946997,3315940256.0,12104570000.0,Individual Contributor,Female,Heterosexual,...,Washington DC,Single,Full Time,54679,2018-03-21,,59,,,1
8,8456966186,Administration,Facilities,8834412962,4096113579,8652820129.0,12104570000.0,Individual Contributor,Female,Heterosexual,...,Washington DC,Single,Full Time,56138,2017-07-17,2019-06-20,23,Voluntary,More flexible benefits,0
9,2266152100,Procurement,Vendor Management,5226775059,1055062301,3570683418.0,12104570000.0,Individual Contributor,Female,Heterosexual,...,Fort Worth,Single,Full Time,64551,2015-04-14,,95,,,1


In [13]:
%%sql

--Count the employees in each department
--(COUNT(employee_id) ignores rows where employee_id is NULL)
SELECT
	department,
	COUNT(employee_id) AS total_staff
FROM {TABLE}
GROUP BY department;

Unnamed: 0,department,total_staff
0,Executive,1
1,Marketing,407
2,R&D,187
3,Product Development,186
4,Operations,395
5,Finance,365
6,HR,401
7,Sales,635
8,Customer Service,174
9,Legal,202


In [15]:
%%sql

--Counts the number of distinct departments and the length of the longest department name
--(the name length is used to size the VARCHAR column of the departments table)
SELECT
	COUNT(DISTINCT(department)) AS number_of_departments,
	MAX(LENGTH(department)) AS max_department_name_length
FROM {TABLE};

Unnamed: 0,number_of_departments,max_department_name_length
0,13,19


In [30]:
%%sql

--Lists every distinct (department, sub department) pair, sorted by department
SELECT DISTINCT
	department,
	sub_department
FROM {TABLE}
ORDER BY department;

Unnamed: 0,department,sub_department
0,Administration,Executive Assistants
1,Administration,Facilities
2,Administration,Travel
3,Customer Service,Technical Support
4,Customer Service,Customer Support
5,Customer Service,Account Management
6,Executive,
7,Finance,Accounting
8,Finance,Auditing
9,Finance,Financial Planning


In [31]:
%%sql

--Counts the number of distinct sub departments and the length of the longest sub department name
--(COUNT(DISTINCT ...) and MAX(LENGTH(...)) both ignore NULL sub departments)
SELECT
	COUNT(DISTINCT(sub_department)) AS number_of_sub_departments,
	MAX(LENGTH(sub_department)) AS max_sub_department_name_length
FROM {TABLE};

Unnamed: 0,number_of_sub_departments,max_sub_department_name_length
0,32,21


NOTE
> This table holds the list of unique department and sub department pairs; each employee belongs to exactly one of them. The SERIAL data type is used for the primary key so that IDs are auto-generated. The longest name is 21 characters, so I'll allow a maximum of 30 characters for both fields; I can always extend it in the future. You'll notice a number of constraints applied here. After I ensured there were no null or empty department values in the main table, I constrained the fields (NOT NULL plus a non-empty CHECK) so that future additions to this table always provide a name for the department. There also appears to be another business constraint: there are multiple sub departments with the same name but with different parent departments. I'll create a UNIQUE constraint to ensure that duplicate combinations of the two columns cannot be added.

In [19]:
%%sql

--Create a department dictionary table.
--IF NOT EXISTS lets this cell be re-run without failing once the table exists.
--id is auto-generated; both names must be non-null and non-empty, and each
--(department, sub_department) pair may appear only once.
CREATE TABLE IF NOT EXISTS pa_departments (
	id SERIAL PRIMARY KEY,
	department     VARCHAR(30) NOT NULL CHECK (department <> ''),
	sub_department VARCHAR(30) NOT NULL CHECK (sub_department <> ''),
	UNIQUE(department, sub_department)
);

In [34]:
%%sql

--Insert all departments and sub departments into the new departments table.
--Rows with a NULL or empty sub department are skipped explicitly, because
--pa_departments.sub_department is NOT NULL and must be non-empty.
--ON CONFLICT DO NOTHING makes the cell safe to re-run: pairs that are already
--present are skipped instead of violating UNIQUE(department, sub_department).
INSERT INTO pa_departments (
	department,
	sub_department
)
SELECT DISTINCT
	department,
	sub_department
FROM {TABLE}
WHERE sub_department IS NOT NULL
	AND sub_department <> ''
ON CONFLICT (department, sub_department) DO NOTHING;