## Importing data into the database.

### Import libraries

In [1]:
import os
import sqlite3
import pandas as pd
from sqlite3 import Error
from IPython.display import display

### Read the processed feather format data.

In [2]:
# Load the processed payroll frame and drop its "index" column in one chain.
df = pd.read_feather("NYC-proc.feather").drop(columns="index")
df.head(3)

Unnamed: 0,Fiscal Year,Payroll Number,Agency Name,Last Name,First Name,Agency Start Date,Work Location Borough,Title Description,Leave Status as of June 30,Base Salary,...,Regular Gross Paid,OT Hours,Total OT Paid,Total Other Pay,Regular Hours category,Total Paid,OT Hours category,Hourly Pay,Hourly Pay category,Total Pay category
0,2020,17.0,OFFICE OF EMERGENCY MANAGEMENT,BEREZIN,MIKHAIL,08/10/2015,BROOKLYN,EMERGENCY PREPAREDNESS MANAGER,ACTIVE,86005.0,...,84698.21,0.0,0.0,0.0,1500-2000,84698.21,0-500,46.537478,0-100,60000-90000
1,2020,17.0,OFFICE OF EMERGENCY MANAGEMENT,GEAGER,VERONICA,09/12/2016,BROOKLYN,EMERGENCY PREPAREDNESS MANAGER,ACTIVE,86005.0,...,84698.21,0.0,0.0,0.0,1500-2000,84698.21,0-500,46.537478,0-100,60000-90000
2,2020,17.0,OFFICE OF EMERGENCY MANAGEMENT,RAMANI,SHRADDHA,02/22/2016,BROOKLYN,EMERGENCY PREPAREDNESS MANAGER,ACTIVE,86005.0,...,84698.21,0.0,0.0,0.0,1500-2000,84698.21,0-500,46.537478,0-100,60000-90000


### Functions for creating connection, table and executing sql statements.

In [3]:
def create_connection(db_file, delete_db=False):
    """Open a SQLite connection with foreign-key enforcement switched on.

    Parameters
    ----------
    db_file : str or path-like
        Path of the database file to open.
    delete_db : bool, default False
        When True, an existing file at ``db_file`` is removed before
        connecting, so the connection starts from an empty database.

    Returns
    -------
    sqlite3.Connection or None
        The open connection, or ``None`` when SQLite reported an error while
        connecting (the error is printed rather than raised).
    """
    # Honor the delete_db flag, which used to be accepted but silently ignored.
    if delete_db and os.path.exists(db_file):
        os.remove(db_file)

    conn = None
    try:
        conn = sqlite3.connect(db_file)
        # SQLite leaves foreign-key checks off unless enabled per connection.
        conn.execute("PRAGMA foreign_keys = 1")
    except Error as e:
        print(e)

    return conn

In [4]:
def create_table(conn, create_table_sql, drop_table_name=None):
    """Run a CREATE TABLE statement, optionally dropping a table first.

    Parameters
    ----------
    conn : sqlite3.Connection
        Connection on which the statements are executed.
    create_table_sql : str
        The CREATE TABLE statement to run.
    drop_table_name : str, optional
        When given (and truthy), ``DROP TABLE IF EXISTS <name>`` is issued
        before the CREATE statement.

    Notes
    -----
    SQLite errors are printed, not raised; a failed DROP does not prevent the
    CREATE from being attempted.
    """
    pending = []
    if drop_table_name:
        pending.append("""DROP TABLE IF EXISTS %s""" % (drop_table_name))
    pending.append(create_table_sql)

    # Each statement gets its own cursor and its own error handling.
    for statement in pending:
        try:
            conn.cursor().execute(statement)
        except Error as e:
            print(e)

In [5]:
def execute_sql_statement(sql_statement, conn):
    """Execute ``sql_statement`` on ``conn`` and return all fetched rows.

    Parameters
    ----------
    sql_statement : str
        SQL text to execute.
    conn : sqlite3.Connection
        Connection supplying the cursor.

    Returns
    -------
    list of tuple
        Every row produced by the statement (empty list when there are none).
    """
    # Cursor.execute returns the cursor itself, so the calls can be chained.
    return conn.cursor().execute(sql_statement).fetchall()

### Create database connection.

In [6]:
# The database file lives in the current working directory.
PATH = os.getcwd()
db_file = os.path.join(PATH, 'nyc-payroll.db')
conn = create_connection(db_file)

### Create tables.

##### Create table Fiscal Year.

In [7]:
# Lookup table: one row per fiscal year, keyed by the year itself.
sql = (
    "CREATE TABLE FiscalYear ("
    "FiscalYear INTEGER NOT NULL PRIMARY KEY"
    ");"
)
create_table(conn, sql, drop_table_name='FiscalYear')

##### Create table Agency Name.

In [8]:
# Lookup table: one row per agency, keyed by the agency name.
sql = (
    "CREATE TABLE AgencyName ("
    "AgencyName TEXT NOT NULL PRIMARY KEY"
    ");"
)
create_table(conn, sql, drop_table_name='AgencyName')

##### Create table Pay Basis.

In [9]:
# Lookup table: one row per pay basis, keyed by its label.
sql = (
    "CREATE TABLE PayBasis ("
    "PayBasis TEXT NOT NULL PRIMARY KEY"
    ");"
)
create_table(conn, sql, drop_table_name='PayBasis')

##### Create table Pay Roll.

In [10]:
# PayRoll: keyed by payroll number; each row points at a row of AgencyName.
sql = (
    "CREATE TABLE PayRoll ("
    "PayRollNo INTEGER NOT NULL PRIMARY KEY, "
    "AgencyName TEXT NOT NULL, "
    "FOREIGN KEY (AgencyName) REFERENCES AgencyName(AgencyName)"
    ");"
)
create_table(conn, sql, drop_table_name='PayRoll')

##### Create table Employee. 

In [11]:
# Employee: keyed by EmployeeID, with foreign keys into PayBasis, PayRoll and
# FiscalYear.
# NOTE(review): "Fiscalyear" in the last FOREIGN KEY clause differs in case from
# the FiscalYear column; SQLite matches identifiers case-insensitively, so the
# statement text is kept exactly as it was.
sql = (
    "CREATE TABLE Employee ("
    "EmployeeID INTEGER NOT NULL PRIMARY KEY, "
    "FirstName TEXT NOT NULL, "
    "LastName TEXT NOT NULL, "
    "PayRollNo INTEGER NOT NULL, "
    "FiscalYear INTEGER NOT NULL, "
    "PayBasis TEXT NOT NULL, "
    "RegularGrossPaid FLOAT NOT NULL, "
    "FOREIGN KEY (PayBasis) REFERENCES PayBasis(PayBasis), "
    "FOREIGN KEY (PayRollNo) REFERENCES PayRoll(PayRollNo), "
    "FOREIGN KEY (Fiscalyear) REFERENCES FiscalYear(FiscalYear)"
    ");"
)
create_table(conn, sql, drop_table_name='Employee')

##### Create table Designation.

In [12]:
# Designation: title, base salary and work location, linked to Employee by
# EmployeeID (the table declares no primary key of its own).
sql = (
    "CREATE TABLE Designation ("
    "EmployeeID INTEGER NOT NULL, "
    "TitleDescription TEXT NOT NULL, "
    "BaseSalary INTEGER NOT NULL, "
    "WorkLocation TEXT NOT NULL, "
    "FOREIGN KEY (EmployeeID) REFERENCES Employee(EmployeeID)"
    ");"
)
create_table(conn, sql, drop_table_name='Designation')

##### Create table Income.

In [13]:
# Income: pay and hours figures plus their category labels, linked to Employee
# by EmployeeID (the table declares no primary key of its own).
sql = (
    "CREATE TABLE Income ("
    "EmployeeID INTEGER NOT NULL, "
    "RegularGrossPaid FLOAT NOT NULL, "
    "RegularHours FLOAT NOT NULL, "
    "OTHours FLOAT NOT NULL, "
    "TotalOTPay FLOAT NOT NULL, "
    "TotalOtherPay FLOAT NOT NULL, "
    "TotalPay FLOAT NOT NULL, "
    "HourlyPay FLOAT NOT NULL, "
    "RegularHoursCategory VARCHAR(255) NOT NULL, "
    "OTHoursCategory VARCHAR(255) NOT NULL, "
    "HourlyPayCategory VARCHAR(255) NOT NULL, "
    "TotalPayCategory VARCHAR(255) NOT NULL, "
    "FOREIGN KEY (EmployeeID) REFERENCES Employee(EmployeeID)"
    ");"
)
create_table(conn, sql, drop_table_name='Income')

### Display the database tables (all still empty) to check their columns.

In [14]:
# Table names in creation order (parents before the tables that reference them).
tables = [
    'FiscalYear',
    'AgencyName',
    'PayBasis',
    'PayRoll',
    'Employee',
    'Designation',
    'Income',
]

# Select everything from each table and render it under a "<name> :" heading.
with conn:
    for t in tables:
        sql = f"select * from {t};"
        dff = pd.read_sql_query(sql, conn)

        print(f"{t} :")
        display(dff)
        print()

FiscalYear :


Unnamed: 0,FiscalYear



AgencyName :


Unnamed: 0,AgencyName



PayBasis :


Unnamed: 0,PayBasis



PayRoll :


Unnamed: 0,PayRollNo,AgencyName



Employee :


Unnamed: 0,EmployeeID,FirstName,LastName,PayRollNo,FiscalYear,PayBasis,RegularGrossPaid



Designation :


Unnamed: 0,EmployeeID,TitleDescription,BaseSalary,WorkLocation



Income :


Unnamed: 0,EmployeeID,RegularGrossPaid,RegularHours,OTHours,TotalOTPay,TotalOtherPay,TotalPay,HourlyPay,RegularHoursCategory,OTHoursCategory,HourlyPayCategory,TotalPayCategory





In [15]:
# Show only the first record as a reminder of the source column names.
df.iloc[:1]

Unnamed: 0,Fiscal Year,Payroll Number,Agency Name,Last Name,First Name,Agency Start Date,Work Location Borough,Title Description,Leave Status as of June 30,Base Salary,...,Regular Gross Paid,OT Hours,Total OT Paid,Total Other Pay,Regular Hours category,Total Paid,OT Hours category,Hourly Pay,Hourly Pay category,Total Pay category
0,2020,17.0,OFFICE OF EMERGENCY MANAGEMENT,BEREZIN,MIKHAIL,08/10/2015,BROOKLYN,EMERGENCY PREPAREDNESS MANAGER,ACTIVE,86005.0,...,84698.21,0.0,0.0,0.0,1500-2000,84698.21,0-500,46.537478,0-100,60000-90000


### Import data into tables

#### Values insertion for tables FiscalYear, AgencyName and PayBasis.

In [16]:
# Each lookup table has a single column named after the table; this mapping
# records which source column feeds it. Driving the inserts from this one
# mapping removes the previous reliance on `tables[:3]` (defined in another
# cell) lining up with the keys of `t_dict`.
lookup_sources = {
    "FiscalYear": "Fiscal Year",
    "AgencyName": "Agency Name",
    "PayBasis": "Pay Basis",
}

# Sorted distinct values per table, wrapped as 1-tuples for executemany.
t_dict = {
    table: [
        (value,)
        for value in df[column].value_counts().index.unique().sort_values(ascending=True)
    ]
    for table, column in lookup_sources.items()
}

# Keep the short aliases available for any cell that refers to them.
FY = t_dict["FiscalYear"]  # Fiscal Year
AN = t_dict["AgencyName"]  # Agency Name
PB = t_dict["PayBasis"]    # Pay Basis

# `with conn` commits on success and rolls back if any insert raises.
with conn:
    cur_insert = conn.cursor()
    for t, rows in t_dict.items():
        sql = f"INSERT INTO {t} ({t}) VALUES (?);"
        cur_insert.executemany(sql, rows)

##### Display tables - FiscalYear, AgencyName, PayBasis.

In [17]:
# Render the first three tables in `tables`, each under a "<name> :" heading.
lookup_tables = tables[:3]

with conn:
    for t in lookup_tables:
        sql = f"select * from {t};"
        dff = pd.read_sql_query(sql, conn)

        print(f"{t} :")
        display(dff)
        print()

FiscalYear :


Unnamed: 0,FiscalYear
0,2015
1,2016
2,2017
3,2018
4,2019
5,2020
6,2021



AgencyName :


Unnamed: 0,AgencyName
0,ADMIN FOR CHILDREN'S SVCS
1,ADMIN TRIALS AND HEARINGS
2,BOARD OF CORRECTION
3,BOARD OF ELECTION
4,BOARD OF ELECTION POLL WORKERS
...,...
151,STATEN ISLAND COMMUNITY BD #2
152,STATEN ISLAND COMMUNITY BD #3
153,TAX COMMISSION
154,TAXI & LIMOUSINE COMMISSION



PayBasis :


Unnamed: 0,PayBasis
0,Prorated Annual
1,per Annum
2,per Day
3,per Hour





#### Values insertion for table PayRoll

In [18]:
# Reduce the frame to one row per (payroll number, agency name) pair.
pay_roll_df = (
    df[["Payroll Number", "Agency Name"]]
    .groupby(["Payroll Number", "Agency Name"])
    .count()
    .reset_index()
)

# name=None makes itertuples yield plain tuples, which is what executemany needs.
pay_roll_list = list(pay_roll_df.itertuples(index=False, name=None))

sql = "INSERT INTO PayRoll (PayRollNo, AgencyName) VALUES (?, ?);"
with conn:
    cur_insert = conn.cursor()
    cur_insert.executemany(sql, pay_roll_list)

##### Display table PayRoll

In [19]:
# Read the whole PayRoll table back and render it.
sql = "select * from PayRoll;"
with conn:
    dff = pd.read_sql_query(sql, conn)
display(dff)

Unnamed: 0,PayRollNo,AgencyName
0,2,OFFICE OF THE MAYOR
1,3,BOARD OF ELECTION
2,4,CAMPAIGN FINANCE BOARD
3,8,OFFICE OF THE ACTUARY
4,9,NYC EMPLOYEES RETIREMENT SYS
...,...,...
151,942,PUBLIC ADMINISTRATOR-BRONX
152,943,PUBLIC ADMINISTRATOR-KINGS
153,944,PUBLIC ADMINISTRATOR-QUEENS
154,945,PUBLIC ADMINISTRATOR-RICHMOND


#### Values insertion for table Employee.

In [21]:
# Turn the row index into an EmployeeID column, but only once. Without the
# guard, re-running this cell inserts a second "index" column, renames it to a
# duplicate "EmployeeID", and the column selection below then returns 8 columns
# for the 7-placeholder INSERT (later cells that select EmployeeID break too).
if "EmployeeID" not in df.columns:
    df.reset_index(inplace=True)
    df.rename(columns={"index":"EmployeeID"}, inplace=True)

employee_df = df[["EmployeeID","First Name","Last Name","Payroll Number","Fiscal Year","Pay Basis","Regular Gross Paid"]]

employee_df_list = [tuple(row) for row in employee_df.itertuples(index=False)]  # Creating list of tuples for executemany.

# `with conn` commits on success and rolls back if the insert raises.
with conn:
    cur_insert = conn.cursor()
    sql = "INSERT INTO Employee (EmployeeID,FirstName,LastName,PayRollNo,FiscalYear,PayBasis,RegularGrossPaid) VALUES (?,?,?,?,?,?,?);"
    cur_insert.executemany(sql, employee_df_list)

##### Display Employee table.

In [29]:
# Read back the first six Employee rows and render them.
sql = "select * from Employee limit 6;"
with conn:
    dff = pd.read_sql_query(sql, conn)
display(dff)

Unnamed: 0,EmployeeID,FirstName,LastName,PayRollNo,FiscalYear,PayBasis,RegularGrossPaid
0,0,MIKHAIL,BEREZIN,17,2020,per Annum,84698.21
1,1,VERONICA,GEAGER,17,2020,per Annum,84698.21
2,2,SHRADDHA,RAMANI,17,2020,per Annum,84698.21
3,3,JONATHAN,ROTTA,17,2020,per Annum,84698.21
4,4,ROBERT,WILSON II,17,2020,per Annum,84698.21
5,5,MORIAH,WASHINGTON,17,2020,per Annum,87900.95


#### Values insertion for table Designation.

In [31]:
# Source columns, listed in the same order as the Designation INSERT below.
designation_df = df[
    [
        "EmployeeID",
        "Title Description",
        "Base Salary",
        "Work Location Borough",
    ]
]

# name=None makes itertuples yield plain tuples, which is what executemany needs.
designation_df_list = list(designation_df.itertuples(index=False, name=None))

sql = "INSERT INTO Designation (EmployeeID,TitleDescription,BaseSalary,WorkLocation) VALUES (?,?,?,?);"
with conn:
    cur_insert = conn.cursor()
    cur_insert.executemany(sql, designation_df_list)

##### Display Designation table.

In [39]:
# Read back the first five Designation rows and render them.
sql = "select * from Designation limit 5;"
with conn:
    dff = pd.read_sql_query(sql, conn)
display(dff)

Unnamed: 0,EmployeeID,TitleDescription,BaseSalary,WorkLocation
0,0,EMERGENCY PREPAREDNESS MANAGER,86005,BROOKLYN
1,1,EMERGENCY PREPAREDNESS MANAGER,86005,BROOKLYN
2,2,EMERGENCY PREPAREDNESS MANAGER,86005,BROOKLYN
3,3,EMERGENCY PREPAREDNESS MANAGER,86005,BROOKLYN
4,4,EMERGENCY PREPAREDNESS MANAGER,86005,BROOKLYN


#### Values insertion for table Income.

In [44]:
# Source columns, listed in the same order as the Income INSERT below.
income_df = df[
    [
        "EmployeeID",
        "Regular Gross Paid",
        "Regular Hours",
        "OT Hours",
        "Total OT Paid",
        "Total Other Pay",
        "Total Paid",
        "Hourly Pay",
        "Regular Hours category",
        "OT Hours category",
        "Hourly Pay category",
        "Total Pay category",
    ]
]

# name=None makes itertuples yield plain tuples, which is what executemany needs.
income_df_list = list(income_df.itertuples(index=False, name=None))

sql = (
    "INSERT INTO Income ("
    "EmployeeID,RegularGrossPaid,RegularHours,OTHours,"
    "TotalOTPay,TotalOtherPay,TotalPay,HourlyPay,"
    "RegularHoursCategory,OTHoursCategory,HourlyPayCategory,TotalPayCategory"
    ") VALUES (?,?,?,?,?,?,?,?,?,?,?,?);"
)
with conn:
    cur_insert = conn.cursor()
    cur_insert.executemany(sql, income_df_list)

##### Display table Income.

In [47]:
# Read back the first five Income rows and render them.
sql = "select * from Income limit 5;"
with conn:
    dff = pd.read_sql_query(sql, conn)
display(dff)

Unnamed: 0,EmployeeID,RegularGrossPaid,RegularHours,OTHours,TotalOTPay,TotalOtherPay,TotalPay,HourlyPay,RegularHoursCategory,OTHoursCategory,HourlyPayCategory,TotalPayCategory
0,0,84698.21,1820.0,0.0,0.0,0.0,84698.21,46.537478,1500-2000,0-500,0-100,60000-90000
1,1,84698.21,1820.0,0.0,0.0,0.0,84698.21,46.537478,1500-2000,0-500,0-100,60000-90000
2,2,84698.21,1820.0,0.0,0.0,0.0,84698.21,46.537478,1500-2000,0-500,0-100,60000-90000
3,3,84698.21,1820.0,0.0,0.0,0.0,84698.21,46.537478,1500-2000,0-500,0-100,60000-90000
4,4,84698.21,1820.0,0.0,0.0,0.0,84698.21,46.537478,1500-2000,0-500,0-100,60000-90000
