# Data Engineering
## Import the CSV files

In [1]:
import pandas as pd
import numpy as np

# Load each CSV export into its own DataFrame. The names on the left line up
# one-to-one, in order, with the paths listed below.
titles, employees, departments, salaries, dept_emp, dept_manager = map(
    pd.read_csv,
    (
        '../data/titles.csv',
        '../data/employees.csv',
        '../data/departments.csv',
        '../data/salaries.csv',
        '../data/dept_emp.csv',
        '../data/dept_manager.csv',
    ),
)

## Determine the primary keys and data relationships
### `titles`

In [2]:
# `title_id` can serve as the primary key of `titles`: the number of distinct
# values matches the number of (non-null) rows, so no value is repeated.
titles['title_id'].nunique() == titles['title_id'].count()

True

### `employees`

In [3]:
# `emp_no` can serve as the primary key of `employees`: the number of distinct
# values matches the number of (non-null) rows, so no value is repeated.
employees['emp_no'].nunique() == employees['emp_no'].count()

True

### `departments`

In [4]:
# Since the count of `dept_no` rows is equal to the number of unique `dept_no` rows,
# all `dept_no` rows are unique and can be used as primary keys.
departments.dept_no.count() == departments.dept_no.nunique()

True

### `salaries`

In [5]:
# `emp_no` can serve as the primary key of `salaries`: the number of distinct
# values matches the number of (non-null) rows, so no value is repeated.
salaries['emp_no'].nunique() == salaries['emp_no'].count()

True

In [6]:
# `salaries` and `employees` are both keyed on `emp_no`. Once sorted, the two
# `emp_no` columns hold exactly the same values, so the tables have a
# one-to-one relationship.
np.array_equal(employees['emp_no'].sort_values(), salaries['emp_no'].sort_values())

True

### `dept_emp`

In [7]:
# Neither `emp_no` nor `dept_no` is unique on its own in `dept_emp`:
# each column has more rows than distinct values.
print(dept_emp['emp_no'].count() > dept_emp['emp_no'].nunique())
print(dept_emp['dept_no'].count() > dept_emp['dept_no'].nunique())

# Taken together they identify each row, so (`emp_no`, `dept_no`) can be used as a
# composite key: no pair occurs more than once. The group sizes are compared with
# one vectorized `> 1` instead of a per-element `apply(lambda ...)`.
print((dept_emp.groupby(['emp_no', 'dept_no']).size() > 1).any())

True
True
False


### `dept_manager`

In [8]:
# `emp_no` can serve as the primary key of `dept_manager`: the number of distinct
# values matches the number of (non-null) rows, so no value is repeated.
dept_manager['emp_no'].nunique() == dept_manager['emp_no'].count()

True

In [9]:
# Every `emp_no` in `dept_manager` also appears in `dept_emp`, i.e. the managers'
# employee numbers are a subset of those in `dept_emp`.
# `Series.all()` reduces the boolean mask in one vectorized step; `bool(...)` keeps
# the displayed result a plain Python bool, as the builtin `all()` returned before.
bool(dept_manager['emp_no'].isin(dept_emp['emp_no']).all())

True

## Determine the value lengths of attributes

In [10]:
def check_length(attribute):
    '''
    Print the value length of an attribute.

    `attribute` is a pandas Series whose values support `len` (e.g. strings).
    Missing values (None/NaN) have no length and are skipped.

    If the value length is fixed, it prints out the fixed number of characters.
    If the value length varies, it prints out the maximum number of characters.
    If there are no values to measure, it prints a message saying so.
    '''
    # Measure every value once and reuse the result in each branch below.
    lengths = attribute.dropna().map(len)

    if lengths.empty:
        # Without this guard an empty column would report a max length of nan.
        print(f'"{attribute.name}" has no values to measure.')
    elif lengths.nunique() == 1:
        print(f'"{attribute.name}" has a fixed length of {lengths.iloc[0]}.')
    else:
        print(f'The length for "{attribute.name}" varies, with a max length of {lengths.max()}.')

In [11]:
# Value lengths of the text columns in `titles`
check_length(titles['title_id'])
check_length(titles['title'])

"title_id" has a fixed length of 5.
The length for "title" varies, with a max length of 18.


In [12]:
# Value lengths of the text columns in `employees`
check_length(employees['emp_title_id'])
check_length(employees['first_name'])
check_length(employees['last_name'])

"emp_title_id" has a fixed length of 5.
The length for "first_name" varies, with a max length of 14.
The length for "last_name" varies, with a max length of 16.


In [13]:
# Value lengths of the text columns in `departments`
check_length(departments['dept_no'])
check_length(departments['dept_name'])

"dept_no" has a fixed length of 4.
The length for "dept_name" varies, with a max length of 18.
