In [1]:
import sqlite3
import csv
import pandas as pd

# Connect to the Database and Read the Data

In [2]:
# Open the existing HR database in read-write mode. With the URI form and
# mode=rw, sqlite3 raises OperationalError when HR.db is missing instead of
# silently creating an empty database file, which would only fail later
# with a confusing "no such table" error.
connection = sqlite3.connect("file:HR.db?mode=rw", uri=True)
# The pandas queries in this notebook use `connection` directly; the cursor
# is kept so that the cleanup cell at the end can close it.
cursor = connection.cursor()

In [3]:
# Load the complete EMPLOYEES table into a DataFrame.
# Pulling every row is acceptable here because the dataset is small.
# NOTE(review): the displayed frame includes the SSN column; consider
# clearing the output before sharing the notebook.
query = "SELECT * FROM EMPLOYEES;"
df = pd.read_sql_query(sql=query, con=connection)
df

Unnamed: 0,EMP_ID,F_NAME,L_NAME,SSN,B_DATE,SEX,ADDRESS,JOB_ID,SALARY,MANAGER_ID,DEP_ID
0,E1001,John,Thomas,123456,1976-09-01,M,"5631 Rice, OakPark,IL",100,100000,30001,2
1,E1002,Alice,James,123457,1972-07-31,F,"980 Berry ln, Elgin,IL",200,80000,30002,5
2,E1003,Steve,Wells,123458,1980-10-08,M,"291 Springs, Gary,IL",300,50000,30002,5
3,E1004,Santosh,Kumar,123459,1985-07-20,M,"511 Aurora Av, Aurora,IL",400,60000,30002,5
4,E1005,Ahmed,Hussain,123410,1981-04-01,M,"216 Oak Tree, Geneva,IL",500,70000,30001,2
5,E1006,Nancy,Allen,123411,1978-06-02,F,"111 Green Pl, Elgin,IL",600,90000,30001,2
6,E1007,Mary,Thomas,123412,1975-05-05,F,"100 Rose Pl, Gary,IL",650,65000,30003,7
7,E1008,Bharath,Gupta,123413,1985-06-05,M,"145 Berry Ln, Naperville,IL",660,65000,30003,7
8,E1009,Andrea,Jones,123414,1990-09-07,F,"120 Fall Creek, Gary,IL",234,70000,30003,7
9,E1010,Ann,Jacob,123415,1982-03-30,F,"111 Britany Springs,Elgin,IL",220,70000,30002,5


# Salary Based Analysis

In [4]:
# Employees earning strictly less than the average salary; the scalar
# subquery yields the average SALARY over the whole EMPLOYEES table.
query = (
    "SELECT * FROM EMPLOYEES \n"
    "        WHERE SALARY < (SELECT AVG(SALARY) FROM EMPLOYEES);"
)

df = pd.read_sql_query(sql=query, con=connection)
df

Unnamed: 0,EMP_ID,F_NAME,L_NAME,SSN,B_DATE,SEX,ADDRESS,JOB_ID,SALARY,MANAGER_ID,DEP_ID
0,E1003,Steve,Wells,123458,1980-10-08,M,"291 Springs, Gary,IL",300,50000,30002,5
1,E1004,Santosh,Kumar,123459,1985-07-20,M,"511 Aurora Av, Aurora,IL",400,60000,30002,5
2,E1005,Ahmed,Hussain,123410,1981-04-01,M,"216 Oak Tree, Geneva,IL",500,70000,30001,2
3,E1007,Mary,Thomas,123412,1975-05-05,F,"100 Rose Pl, Gary,IL",650,65000,30003,7
4,E1008,Bharath,Gupta,123413,1985-06-05,M,"145 Berry Ln, Naperville,IL",660,65000,30003,7
5,E1009,Andrea,Jones,123414,1990-09-07,F,"120 Fall Creek, Gary,IL",234,70000,30003,7
6,E1010,Ann,Jacob,123415,1982-03-30,F,"111 Britany Springs,Elgin,IL",220,70000,30002,5


In [5]:
# Each employee's salary next to the highest salary in the table
# (the same MAX_SALARY value is repeated on every row for comparison).
query = (
    "SELECT EMP_ID, SALARY, "
    "(SELECT MAX(SALARY) FROM EMPLOYEES) AS MAX_SALARY "
    "FROM EMPLOYEES;"
)

df = pd.read_sql_query(sql=query, con=connection)
df

Unnamed: 0,EMP_ID,SALARY,MAX_SALARY
0,E1001,100000,100000
1,E1002,80000,100000
2,E1003,50000,100000
3,E1004,60000,100000
4,E1005,70000,100000
5,E1006,90000,100000
6,E1007,65000,100000
7,E1008,65000,100000
8,E1009,70000,100000
9,E1010,70000,100000


In [6]:
# Average salary of the five highest earners: the inner query keeps the
# five largest SALARY values, the outer query averages them.
query = (
    "SELECT AVG(SALARY) \n"
    "        FROM (SELECT SALARY \n"
    "        FROM EMPLOYEES \n"
    "        ORDER BY SALARY DESC \n"
    "        LIMIT 5) AS SALARY_TABLE;"
)

df = pd.read_sql_query(sql=query, con=connection)
df

Unnamed: 0,AVG(SALARY)
0,82000.0


In [7]:
# Average salary of the five lowest earners: the inner query keeps the
# five smallest SALARY values (ascending order), the outer query averages them.
query = (
    "SELECT AVG(SALARY) \n"
    "        FROM (SELECT SALARY \n"
    "        FROM EMPLOYEES \n"
    "        ORDER BY SALARY \n"
    "        LIMIT 5) AS SALARY_TABLE;"
)

df = pd.read_sql_query(sql=query, con=connection)
df

Unnamed: 0,AVG(SALARY)
0,62000.0


# Grouping Employees Based on Birth Date

In [8]:
# Oldest employee(s): rows whose B_DATE equals the smallest B_DATE in the
# table. If several employees share that birth date, all of them are returned.
query = (
    "SELECT F_NAME, L_NAME, B_DATE FROM EMPLOYEES\n"
    "        WHERE B_DATE = (SELECT MIN(B_DATE) FROM EMPLOYEES);"
)

df = pd.read_sql_query(sql=query, con=connection)
df

Unnamed: 0,F_NAME,L_NAME,B_DATE
0,Alice,James,1972-07-31


In [9]:
# Employees who are older than the average employee age.
# Age is measured in days as julianday(date('now')) - julianday(B_DATE);
# the subquery averages that value over all employees.
# Results are sorted by B_DATE ascending, i.e. oldest first.
query = (
    "SELECT * FROM EMPLOYEES \n"
    "        WHERE julianday(date('now')) - julianday(B_DATE) > \n"
    "        (SELECT AVG(julianday(date('now')) - julianday(B_DATE)) FROM EMPLOYEES)\n"
    "        ORDER BY B_DATE;"
)

df = pd.read_sql_query(sql=query, con=connection)
df

Unnamed: 0,EMP_ID,F_NAME,L_NAME,SSN,B_DATE,SEX,ADDRESS,JOB_ID,SALARY,MANAGER_ID,DEP_ID
0,E1002,Alice,James,123457,1972-07-31,F,"980 Berry ln, Elgin,IL",200,80000,30002,5
1,E1007,Mary,Thomas,123412,1975-05-05,F,"100 Rose Pl, Gary,IL",650,65000,30003,7
2,E1001,John,Thomas,123456,1976-09-01,M,"5631 Rice, OakPark,IL",100,100000,30001,2
3,E1006,Nancy,Allen,123411,1978-06-02,F,"111 Green Pl, Elgin,IL",600,90000,30001,2
4,E1003,Steve,Wells,123458,1980-10-08,M,"291 Springs, Gary,IL",300,50000,30002,5


# Grouping Employees Based on Years Spent in the Company

In [10]:
# Each JOB_HISTORY row with the employee's completed years of service
# (SERVICE_YEAR) and the average of that value over all rows
# (AVG_SERVICE_YEAR). Results depend on the date the cell is executed.
#
# Completed years = calendar-year difference, minus 1 when this year's
# anniversary (the month-day of START_DATE) has not been reached yet.
# A plain year difference over-counts: someone hired last December would
# show 1 year of service when queried in January. The comparison evaluates
# to 0 or 1 in SQLite, and zero-padded '%m-%d' strings order correctly as text.
SERVICE_YEAR_SQL = (
    "(strftime('%Y', 'now') - strftime('%Y', START_DATE)"
    " - (strftime('%m-%d', 'now') < strftime('%m-%d', START_DATE)))"
)
query = f'''SELECT *, {SERVICE_YEAR_SQL} AS SERVICE_YEAR,
            (SELECT AVG({SERVICE_YEAR_SQL})
            FROM JOB_HISTORY) AS AVG_SERVICE_YEAR
        FROM JOB_HISTORY;'''

df = pd.read_sql_query(query, connection)
df

Unnamed: 0,EMPL_ID,START_DATE,JOBS_ID,DEPT_ID,SERVICE_YEAR,AVG_SERVICE_YEAR
0,E1001,2000-08-01,100,2,24,19.3
1,E1002,2001-08-01,200,5,23,19.3
2,E1003,2001-08-16,300,5,23,19.3
3,E1004,2000-08-16,400,5,24,19.3
4,E1005,2000-05-30,500,2,24,19.3
5,E1006,2001-08-16,600,2,23,19.3
6,E1007,2002-05-30,650,7,22,19.3
7,E1008,2010-05-06,660,7,14,19.3
8,E1009,2016-08-16,234,7,8,19.3
9,E1010,2016-08-16,220,5,8,19.3


In [11]:
# Employees whose completed years of service exceed the average over all
# JOB_HISTORY rows. Results depend on the date the cell is executed.
#
# Completed years = calendar-year difference, minus 1 when this year's
# anniversary (the month-day of START_DATE) has not been reached yet.
# A plain year difference would over-count by one for those employees.
# The comparison evaluates to 0 or 1 in SQLite, and zero-padded '%m-%d'
# strings order correctly as text.
SERVICE_YEAR_SQL = (
    "(strftime('%Y', 'now') - strftime('%Y', START_DATE)"
    " - (strftime('%m-%d', 'now') < strftime('%m-%d', START_DATE)))"
)
query = f'''SELECT *, {SERVICE_YEAR_SQL} AS SERVICE_YEAR FROM JOB_HISTORY
        WHERE {SERVICE_YEAR_SQL} >
            (SELECT AVG({SERVICE_YEAR_SQL}) FROM JOB_HISTORY);'''

df = pd.read_sql_query(query, connection)
df

Unnamed: 0,EMPL_ID,START_DATE,JOBS_ID,DEPT_ID,SERVICE_YEAR
0,E1001,2000-08-01,100,2,24
1,E1002,2001-08-01,200,5,23
2,E1003,2001-08-16,300,5,23
3,E1004,2000-08-16,400,5,24
4,E1005,2000-05-30,500,2,24
5,E1006,2001-08-16,600,2,23
6,E1007,2002-05-30,650,7,22


In [12]:
# Department 2 employees whose completed years of service exceed the
# average service of ALL employees (the average is company-wide, not
# per department; only the returned rows are restricted to DEPT_ID 2).
# Results depend on the date the cell is executed.
#
# Completed years = calendar-year difference, minus 1 when this year's
# anniversary (the month-day of START_DATE) has not been reached yet.
# A plain year difference would over-count by one for those employees.
# The comparison evaluates to 0 or 1 in SQLite, and zero-padded '%m-%d'
# strings order correctly as text.
SERVICE_YEAR_SQL = (
    "(strftime('%Y', 'now') - strftime('%Y', START_DATE)"
    " - (strftime('%m-%d', 'now') < strftime('%m-%d', START_DATE)))"
)
query = f'''SELECT *, {SERVICE_YEAR_SQL} AS SERVICE_YEAR FROM JOB_HISTORY
        WHERE {SERVICE_YEAR_SQL} >
            (SELECT AVG({SERVICE_YEAR_SQL}) FROM JOB_HISTORY)
            AND DEPT_ID = '2';'''

df = pd.read_sql_query(query, connection)
df

Unnamed: 0,EMPL_ID,START_DATE,JOBS_ID,DEPT_ID,SERVICE_YEAR
0,E1001,2000-08-01,100,2,24
1,E1005,2000-05-30,500,2,24
2,E1006,2001-08-16,600,2,23


In [13]:
# Release the database resources: the cursor first, then the connection
# it was created from.
for handle in (cursor, connection):
    handle.close()