### Importing Data and Necessary libraries

In [1]:
import pandas as pd
import sqlite3
import csv

### Importing csv files using Python

In [3]:
# Importing and converting cvs data files into dataframes  
df1 = pd.read_csv('HRData_Diversity.csv')
df2 = pd.read_csv('HRData_Employees.csv')
df3 = pd.read_csv('HRData_Manager.csv')
df4 = pd.read_csv('HRData_Positions.csv')

In [4]:
df5 = pd.read_csv('HRData_Positions.csv')

In [7]:
# Connecting to database 
conn = sqlite3.connect('hr_data_databasee.db')

In [8]:
# Adding dataframes to SQL database to use for analysis
df1.to_sql('HR_Diversity', conn, index=False,)
df2.to_sql('HR_Employees', conn, index=False)
df3.to_sql('HR_Manager', conn, index=False)
df4.to_sql('HR_Positions', conn, index=False)

29

In [9]:
# Load the SQL extension to execute SQL commands in this notebook cell
%load_ext sql

In [10]:
# Connect to the SQLite database named 'hr_database.db'
%sql sqlite:///hr_data_databasee.db

### Exploring the Data in our Database
- Understand the schema of our four tables
- Identify Primary and foreign keys in our tables

In [11]:
%%sql

-- Retrieve the names of tables in the SQLite database
SELECT name AS hr_database_tables
FROM sqlite_master
WHERE type = 'table';

 * sqlite:///hr_data_databasee.db
Done.


hr_database_tables
HR_Diversity
HR_Employees
HR_Manager
HR_Positions


In [12]:
%%sql

-- Show the schema of the table HR_Employees
PRAGMA table_info(HR_Employees);

 * sqlite:///hr_data_databasee.db
Done.


cid,name,type,notnull,dflt_value,pk
0,Employee_Name,TEXT,0,,0
1,EmpID,INTEGER,0,,0
2,DeptID,INTEGER,0,,0
3,Salary,INTEGER,0,,0
4,PositionID,INTEGER,0,,0
5,State,TEXT,0,,0
6,DOB,TEXT,0,,0
7,MaritalDesc,TEXT,0,,0
8,DateofHire,TEXT,0,,0
9,DateofTermination,TEXT,0,,0


In [13]:
%%sql

-- Show the schema of the table HR_Positions
PRAGMA table_info(HR_Positions);

 * sqlite:///hr_data_databasee.db
Done.


cid,name,type,notnull,dflt_value,pk
0,PositionID,INTEGER,0,,0
1,Position,TEXT,0,,0


In [14]:
%%sql

-- Show the schema of the table
PRAGMA table_info(HR_Manager);

 * sqlite:///hr_data_databasee.db
Done.


cid,name,type,notnull,dflt_value,pk
0,EmpID,INTEGER,0,,0
1,ManagerName,TEXT,0,,0
2,ManagerID,REAL,0,,0


In [15]:
%%sql

-- Show the schema of the table
PRAGMA table_info(HR_Diversity);

 * sqlite:///hr_data_databasee.db
Done.


cid,name,type,notnull,dflt_value,pk
0,EmpID,INTEGER,0,,0
1,Sex,TEXT,0,,0
2,FromDiversityJobFairID,TEXT,0,,0
3,HispanicLatino,TEXT,0,,0
4,RaceDesc,TEXT,0,,0


### Data Cleaning - Create a new table named HR_Dataset by combining data from multiple tables

In [16]:
%%sql

-- Create a new table named HR_Dataset by combining data from multiple tables
CREATE TABLE HR_Dataset AS

SELECT *
FROM HR_Employees e
inner JOIN HR_Positions p ON e.positionID = p.positionID
inner JOIN HR_Manager m on e.empid = m.empid
inner JOIN HR_Diversity d on e.empid = d.empid;

 * sqlite:///hr_data_databasee.db
Done.


[]

In [17]:
%%sql 
--
Select * from HR_Dataset limit 1;

 * sqlite:///hr_data_databasee.db
Done.


Employee_Name,EmpID,DeptID,Salary,PositionID,State,DOB,MaritalDesc,DateofHire,DateofTermination,ManagerID,Department,RecruitmentSource,PerformanceScore,EngagementSurvey,EmpSatisfaction,SpecialProjectsCount,Absences,PositionID:1,Position,EmpID:1,ManagerName,ManagerID:1,EmpID:2,Sex,FromDiversityJobFairID,HispanicLatino,RaceDesc
"Adinolfi, Wilson K",10026,5,62506,19,MA,1983-07-10,Single,2011-07-05,,22.0,Production,LinkedIn,Exceeds,4.6,5,0,1,19,Production Technician I,10026,Michael Albert,22.0,10026,M,No,No,White


### Ensuring new HR_Dataset has been added to database

In [18]:
%%sql

-- Retrieve the names of tables in the SQLite database
SELECT name AS hr_database_tables
FROM sqlite_master
WHERE type = 'table';

 * sqlite:///hr_data_databasee.db
Done.


hr_database_tables
HR_Diversity
HR_Employees
HR_Manager
HR_Positions
HR_Dataset


### Data Cleaning - Dropping Duplicate Columns from HR_Dataset table
- Remove Redundant Columns from HR_Dataset Table that were added when joining the 4 tables in the database

In [19]:
%%sql 

-- Remove the column 'positionid:1' from the HR_Dataset table
ALTER TABLE HR_Dataset
DROP COLUMN 'positionid:1';

-- Remove the column 'empid:1' from the HR_Dataset table
ALTER TABLE HR_Dataset
DROP COLUMN 'empid:1';

-- Remove the column 'managerid:1' from the HR_Dataset table
ALTER TABLE HR_Dataset
DROP COLUMN 'managerid:1';

-- Remove the column 'empid:2' from the HR_Dataset table
ALTER TABLE HR_Dataset
DROP COLUMN 'empid:2';

 * sqlite:///hr_data_databasee.db
Done.
Done.
Done.
Done.


[]

In [20]:
%%sql 
-- Display the modified table
select * from hr_dataset limit 1;

 * sqlite:///hr_data_databasee.db
Done.


Employee_Name,EmpID,DeptID,Salary,PositionID,State,DOB,MaritalDesc,DateofHire,DateofTermination,ManagerID,Department,RecruitmentSource,PerformanceScore,EngagementSurvey,EmpSatisfaction,SpecialProjectsCount,Absences,Position,ManagerName,Sex,FromDiversityJobFairID,HispanicLatino,RaceDesc
"Adinolfi, Wilson K",10026,5,62506,19,MA,1983-07-10,Single,2011-07-05,,22.0,Production,LinkedIn,Exceeds,4.6,5,0,1,Production Technician I,Michael Albert,M,No,No,White


### Data Cleaning - Changing Data Types

- Updating data type from text to date for DOB, DateofHire, and DateofTermination columns

In [21]:
%%sql

-- Add a new column 'date_DOB' to the table and update column to a date data type 
ALTER TABLE HR_Dataset ADD COLUMN date_DOB date;

UPDATE HR_Dataset
SET date_DOB = strftime('%Y-%m-%d', DOB);


-- Add a new column 'date_hired' to the table and update column to a date data type
ALTER TABLE HR_Dataset ADD COLUMN date_hired date;

UPDATE HR_Dataset
SET date_hired = strftime('%Y-%m-%d', dateofhire);


-- Add a new column 'date_term' to the table and update column to a date data type
ALTER TABLE HR_Dataset ADD COLUMN date_term date;

UPDATE HR_Dataset
SET date_term = strftime('%Y-%m-%d', dateoftermination);


-- Display the modified table with three added columns
SELECT * FROM HR_Dataset

LIMIT 1;

 * sqlite:///hr_data_databasee.db
Done.
311 rows affected.
Done.
311 rows affected.
Done.
311 rows affected.
Done.


Employee_Name,EmpID,DeptID,Salary,PositionID,State,DOB,MaritalDesc,DateofHire,DateofTermination,ManagerID,Department,RecruitmentSource,PerformanceScore,EngagementSurvey,EmpSatisfaction,SpecialProjectsCount,Absences,Position,ManagerName,Sex,FromDiversityJobFairID,HispanicLatino,RaceDesc,date_DOB,date_hired,date_term
"Adinolfi, Wilson K",10026,5,62506,19,MA,1983-07-10,Single,2011-07-05,,22.0,Production,LinkedIn,Exceeds,4.6,5,0,1,Production Technician I,Michael Albert,M,No,No,White,1983-07-10,2011-07-05,


### Data Manipulation - Replacing Null values from date of termination
- Replacing all Null values from the date_term column as current date to create a column for display employee tenure

In [22]:
%%sql

UPDATE hr_dataset

SET date_term = CURRENT_DATE
WHERE date_term IS NULL;

 * sqlite:///hr_data_databasee.db
207 rows affected.


[]

In [23]:
%%sql 

-- Display the modified table
SELECT * FROM HR_Dataset LIMIT 1;

 * sqlite:///hr_data_databasee.db
Done.


Employee_Name,EmpID,DeptID,Salary,PositionID,State,DOB,MaritalDesc,DateofHire,DateofTermination,ManagerID,Department,RecruitmentSource,PerformanceScore,EngagementSurvey,EmpSatisfaction,SpecialProjectsCount,Absences,Position,ManagerName,Sex,FromDiversityJobFairID,HispanicLatino,RaceDesc,date_DOB,date_hired,date_term
"Adinolfi, Wilson K",10026,5,62506,19,MA,1983-07-10,Single,2011-07-05,,22.0,Production,LinkedIn,Exceeds,4.6,5,0,1,Production Technician I,Michael Albert,M,No,No,White,1983-07-10,2011-07-05,2023-09-21


### Data Manipulation - Adding Employee Tenure Column

In [24]:
%%sql 

ALTER TABLE HR_Dataset
ADD COLUMN Tenure INTEGER;

UPDATE HR_Dataset
SET Tenure = CAST((julianday(date_term) - julianday(date_hired)) / 365.25 AS INTEGER);

 * sqlite:///hr_data_databasee.db
Done.
311 rows affected.


[]

In [25]:
%%sql 

-- Display the modified table
SELECT * FROM HR_Dataset LIMIT 1;

 * sqlite:///hr_data_databasee.db
Done.


Employee_Name,EmpID,DeptID,Salary,PositionID,State,DOB,MaritalDesc,DateofHire,DateofTermination,ManagerID,Department,RecruitmentSource,PerformanceScore,EngagementSurvey,EmpSatisfaction,SpecialProjectsCount,Absences,Position,ManagerName,Sex,FromDiversityJobFairID,HispanicLatino,RaceDesc,date_DOB,date_hired,date_term,Tenure
"Adinolfi, Wilson K",10026,5,62506,19,MA,1983-07-10,Single,2011-07-05,,22.0,Production,LinkedIn,Exceeds,4.6,5,0,1,Production Technician I,Michael Albert,M,No,No,White,1983-07-10,2011-07-05,2023-09-21,12


### Data Manipulation
- Added a new column 'Married_id' to the HR_Employees table 
- Update 'Married_id' values based on 'MaritalDesc' values

In [26]:
%%sql

ALTER TABLE HR_Dataset
ADD COLUMN Married_id INTEGER;

UPDATE HR_Dataset
SET Married_id = CASE
    WHEN MaritalDesc = 'Single' THEN 1
    WHEN MaritalDesc = 'Widowed' THEN 1
    WHEN MaritalDesc = 'Divorced' THEN 1
    WHEN MaritalDesc = 'Married' THEN 0
    ELSE NULL
END;

 * sqlite:///hr_data_databasee.db
Done.
311 rows affected.


[]

In [27]:
%%sql 

-- Display the modified table
SELECT * FROM HR_Dataset LIMIT 1;

 * sqlite:///hr_data_databasee.db
Done.


Employee_Name,EmpID,DeptID,Salary,PositionID,State,DOB,MaritalDesc,DateofHire,DateofTermination,ManagerID,Department,RecruitmentSource,PerformanceScore,EngagementSurvey,EmpSatisfaction,SpecialProjectsCount,Absences,Position,ManagerName,Sex,FromDiversityJobFairID,HispanicLatino,RaceDesc,date_DOB,date_hired,date_term,Tenure,Married_id
"Adinolfi, Wilson K",10026,5,62506,19,MA,1983-07-10,Single,2011-07-05,,22.0,Production,LinkedIn,Exceeds,4.6,5,0,1,Production Technician I,Michael Albert,M,No,No,White,1983-07-10,2011-07-05,2023-09-21,12,1


### Data Manipulation
- Added a new column 'PerfScore_ID' to the HR_Employees table 
- Update 'PerfScore_ID' values based on 'PerformanceScore' values

In [28]:
%%sql

ALTER TABLE HR_Dataset
ADD COLUMN PerfScore_ID INTEGER;

UPDATE HR_Dataset
SET PerfScore_ID = CASE
    WHEN PerformanceScore = 'Exceeds' THEN 4
    WHEN PerformanceScore = 'Fully Meets' THEN 3
    WHEN PerformanceScore = 'Needs Improvement' THEN 2
    WHEN PerformanceScore = 'PIP' THEN 1
    ELSE NULL
END;

 * sqlite:///hr_data_databasee.db
Done.
311 rows affected.


[]

In [29]:
%%sql 

-- Display the modified table
SELECT * FROM HR_Dataset LIMIT 1;

 * sqlite:///hr_data_databasee.db
Done.


Employee_Name,EmpID,DeptID,Salary,PositionID,State,DOB,MaritalDesc,DateofHire,DateofTermination,ManagerID,Department,RecruitmentSource,PerformanceScore,EngagementSurvey,EmpSatisfaction,SpecialProjectsCount,Absences,Position,ManagerName,Sex,FromDiversityJobFairID,HispanicLatino,RaceDesc,date_DOB,date_hired,date_term,Tenure,Married_id,PerfScore_ID
"Adinolfi, Wilson K",10026,5,62506,19,MA,1983-07-10,Single,2011-07-05,,22.0,Production,LinkedIn,Exceeds,4.6,5,0,1,Production Technician I,Michael Albert,M,No,No,White,1983-07-10,2011-07-05,2023-09-21,12,1,4


### Data Manipulation
- Added a new column 'DiversityJobFairScore' to the HR_Dataset table 
- Updated score values as 1 if employees were hired in a Diversity Job Fair and 0 if not 

In [30]:
%%sql 

ALTER TABLE HR_Dataset
ADD COLUMN DiversityJobFairScore INTEGER;

UPDATE HR_Dataset
SET DiversityJobFairScore = CASE
    WHEN FromDiversityJobFairID = 'Yes' THEN 1
    ELSE 0
END;

 * sqlite:///hr_data_databasee.db
Done.
311 rows affected.


[]

In [31]:
%%sql 

-- Display the modified table
SELECT * FROM HR_Dataset LIMIT 1;

 * sqlite:///hr_data_databasee.db
Done.


Employee_Name,EmpID,DeptID,Salary,PositionID,State,DOB,MaritalDesc,DateofHire,DateofTermination,ManagerID,Department,RecruitmentSource,PerformanceScore,EngagementSurvey,EmpSatisfaction,SpecialProjectsCount,Absences,Position,ManagerName,Sex,FromDiversityJobFairID,HispanicLatino,RaceDesc,date_DOB,date_hired,date_term,Tenure,Married_id,PerfScore_ID,DiversityJobFairScore
"Adinolfi, Wilson K",10026,5,62506,19,MA,1983-07-10,Single,2011-07-05,,22.0,Production,LinkedIn,Exceeds,4.6,5,0,1,Production Technician I,Michael Albert,M,No,No,White,1983-07-10,2011-07-05,2023-09-21,12,1,4,0


### Data Manipulation
- Added a new column 'Date_Term' to the HR_Dataset table 
- Update values based on 'dateoftermination' column, 1 if employee had a termination date and 0 if date was Null

In [32]:
%%sql 

UPDATE HR_Dataset
SET date_term = CASE
    WHEN dateoftermination IS NULL THEN 0 
    ELSE 1
END;

 * sqlite:///hr_data_databasee.db
311 rows affected.


[]

In [33]:
%%sql 

-- Display the modified table
SELECT * FROM HR_Dataset LIMIT 1;

 * sqlite:///hr_data_databasee.db
Done.


Employee_Name,EmpID,DeptID,Salary,PositionID,State,DOB,MaritalDesc,DateofHire,DateofTermination,ManagerID,Department,RecruitmentSource,PerformanceScore,EngagementSurvey,EmpSatisfaction,SpecialProjectsCount,Absences,Position,ManagerName,Sex,FromDiversityJobFairID,HispanicLatino,RaceDesc,date_DOB,date_hired,date_term,Tenure,Married_id,PerfScore_ID,DiversityJobFairScore
"Adinolfi, Wilson K",10026,5,62506,19,MA,1983-07-10,Single,2011-07-05,,22.0,Production,LinkedIn,Exceeds,4.6,5,0,1,Production Technician I,Michael Albert,M,No,No,White,1983-07-10,2011-07-05,0,12,1,4,0


### Export clean HR_Dataset table as a CVS file

In [34]:
# Query the hr_dataset table using the %sql magic command
result = %sql SELECT * FROM hr_dataset

# Convert the result to a DataFrame
df = result.DataFrame()

# Specify the CSV file path
csv_file_path = 'hr_dataset.csv'

# Export the DataFrame to a CSV file
df.to_csv(csv_file_path, index=False)

 * sqlite:///hr_data_databasee.db
Done.


In [35]:
%%sql

--CREATE TABLE HR_Dataset AS

SELECT *
FROM hr_dataset;

 * sqlite:///hr_data_databasee.db
Done.


Employee_Name,EmpID,DeptID,Salary,PositionID,State,DOB,MaritalDesc,DateofHire,DateofTermination,ManagerID,Department,RecruitmentSource,PerformanceScore,EngagementSurvey,EmpSatisfaction,SpecialProjectsCount,Absences,Position,ManagerName,Sex,FromDiversityJobFairID,HispanicLatino,RaceDesc,date_DOB,date_hired,date_term,Tenure,Married_id,PerfScore_ID,DiversityJobFairScore
"Adinolfi, Wilson K",10026,5,62506,19,MA,1983-07-10,Single,2011-07-05,,22.0,Production,LinkedIn,Exceeds,4.6,5,0,1,Production Technician I,Michael Albert,M,No,No,White,1983-07-10,2011-07-05,0,12,1.0,4,0
"Ait Sidi, Karthikeyan",10084,3,104437,27,MA,1975-05-05,Married,2015-03-30,2016-06-16,4.0,IT/IS,Indeed,Fully Meets,4.96,3,6,17,Sr. DBA,Simon Roup,M,No,No,White,1975-05-05,2015-03-30,1,1,0.0,3,0
"Akinkuolie, Sarah",10196,5,64955,20,MA,1988-09-19,Married,2011-07-05,2012-09-24,20.0,Production,LinkedIn,Fully Meets,3.02,3,0,3,Production Technician II,Kissy Sullivan,F,No,No,White,1988-09-19,2011-07-05,1,1,0.0,3,0
"Alagbe,Trina",10088,5,64991,19,MA,1988-09-27,Married,2008-01-07,,16.0,Production,Indeed,Fully Meets,4.84,5,0,15,Production Technician I,Elijiah Gray,F,No,No,White,1988-09-27,2008-01-07,0,15,0.0,3,0
"Anderson, Carol",10069,5,50825,19,MA,1989-09-08,Divorced,2011-07-11,2016-09-06,39.0,Production,Google Search,Fully Meets,5.0,4,0,2,Production Technician I,Webster Butler,F,No,No,White,1989-09-08,2011-07-11,1,5,1.0,3,0
"Anderson, Linda",10002,5,57568,19,MA,1977-05-22,Single,2012-01-09,,11.0,Production,LinkedIn,Exceeds,5.0,5,0,15,Production Technician I,Amy Dunn,F,No,No,White,1977-05-22,2012-01-09,0,11,1.0,4,0
"Andreola, Colby",10194,4,95660,24,MA,1979-05-24,Single,2014-11-10,,10.0,Software Engineering,LinkedIn,Fully Meets,3.04,3,4,19,Software Engineer,Alex Sweetwater,F,No,No,White,1979-05-24,2014-11-10,0,8,1.0,3,0
"Athwal, Sam",10062,5,59365,19,MA,1983-02-18,Widowed,2013-09-30,,19.0,Production,Employee Referral,Fully Meets,5.0,4,0,19,Production Technician I,Ketsia Liebig,M,No,No,White,1983-02-18,2013-09-30,0,9,1.0,3,0
"Bachiochi, Linda",10114,5,47837,19,MA,1970-02-11,Single,2009-07-06,,12.0,Production,Diversity Job Fair,Fully Meets,4.46,3,0,4,Production Technician I,Brannon Miller,F,Yes,No,Black or African American,1970-02-11,2009-07-06,0,14,1.0,3,1
"Bacong, Alejandro",10250,3,50178,14,MA,1988-01-07,Divorced,2015-01-05,,7.0,IT/IS,Indeed,Fully Meets,5.0,5,6,16,IT Support,Peter Monroe,M,No,No,White,1988-01-07,2015-01-05,0,8,1.0,3,0


In [36]:
%%sql

CREATE TABLE HR_Datav1 AS

SELECT  
    EmpID,
    PerfScore_ID,
    Salary,
    date_term AS Fired,
    EmpSatisfaction,
    SpecialProjectsCount,
    Absences
FROM hr_dataset;

 * sqlite:///hr_data_databasee.db
Done.


[]

In [37]:
# Query the hr_dataset table using the %sql magic command
result = %sql SELECT * FROM hr_datav1

# Convert the result to a DataFrame
df = result.DataFrame()

# Specify the CSV file path
csv_file_path = 'hr_datav1.csv'

# Export the DataFrame to a CSV file
df.to_csv(csv_file_path, index=False)

 * sqlite:///hr_data_databasee.db
Done.
