# Laden der Employee Daten (ETL_RecordStartDate)

Erzeugt die Employee Daten für die Übung und erstellt eine SQL Server Datenbank mit der Tabelle *Employee*.
Voraussetzung ist ein SQL Server Container:

*docker run -d -p 1433:1433  --name sqlserver2019 -e "ACCEPT_EULA=Y" -e "SA_PASSWORD=SqlServer2019" mcr.microsoft.com/azure-sql-edge*

Quelle: http://www.griesmayer.com/?menu=Business%20Intelligence&semester=Semsester_6&topic=02_ETL_RecordStartDate

In [1]:
from pathlib import Path
from faker import Faker
import sqlalchemy, re
import pandas as pd, numpy as np

database = "Employees"
connection_url = sqlalchemy.engine.URL.create("mssql+pyodbc", username="sa",
    password="SqlServer2019", host="localhost", database=database,
    query={ "driver": "ODBC Driver 18 for SQL Server" })
# We cannot connect to sales to create the database (does not exist at this time). We use tempdb.
# Autocommit is necessary for create database and ddl statements.
tempdb_engine = sqlalchemy.create_engine(
    connection_url.set(database="tempdb"), isolation_level="AUTOCOMMIT", 
    connect_args={"TrustServerCertificate": "yes"})
# We drop the database just before connecting, so we set pool_pre_ping=True
engine = sqlalchemy.create_engine(
    connection_url, fast_executemany=True, pool_pre_ping=True,
    connect_args={"TrustServerCertificate": "yes"})


Zuerst löschen wir die Datenbank und erstellen sie neu.
Das ist natürlich nur zum Testen, sonst ist das Löschen der Datenbank nicht ideal...

In [2]:
with tempdb_engine.connect() as conn: 
    try: conn.execute(sqlalchemy.text(f"ALTER DATABASE {database} SET SINGLE_USER WITH ROLLBACK IMMEDIATE"))
    except: pass
    conn.execute(sqlalchemy.text(f"DROP DATABASE IF EXISTS {database}"))
    conn.execute(sqlalchemy.text(f"CREATE DATABASE {database}"))
with engine.connect() as conn:
    conn.execution_options(isolation_level="AUTOCOMMIT")
    conn.execute(sqlalchemy.text("""
        CREATE TABLE Employee
        (
            EmployeeID           INTEGER,
            RecordStartDate      DATE,
            RecordEndDate        DATE,
            FirstName            VARCHAR(20),
            LastName             VARCHAR(20),
            Department           VARCHAR(20),
            Salary               DECIMAL(10,2),
            PRIMARY KEY (EmployeeID, RecordStartDate)
        )  
    """))

Zuerst werden Fakedaten in die Dateien *employee_20170225.txt*, *employee_20170226.txt*, *employee_20170228.txt* und *employee_20170228.txt* geschrieben.

In [3]:
Faker.seed(854)  # With a fixed seed, the same values will be generated in each run.
np.random.seed(854)
faker = Faker(locale="de_DE")
departments = ["Sales", "IT", "Logistics", "Customer service"]
employees = pd.DataFrame(map(lambda i: {
    "EmployeeID": i,
    "FirstName": faker.first_name(),
    "LastName": faker.last_name(),
    "Department": np.random.choice(departments[0:3]),    # Customer service is not selected.
    "Salary": round(np.random.normal(3000, 100), 2)
}, range(1,11))).set_index("EmployeeID")

# Select employees for the daily text files
employees_20170225 = employees.loc[[1,2,3,4,9]]
employees_20170226 = employees.loc[[2,3,4,6,9]]
employees_20170227 = employees.loc[[2,3,4,6,7,9]]
employees_20170228 = employees.loc[[3,4,6,7,8,9]]

# Do some mutations
employees_20170226.loc[3, "Salary"] += 100
employees_20170226.loc[2, "Department"] = "Customer service"
employees_20170227.loc[6, "Salary"] -= 100
employees_20170228.loc[3, "Salary"] -= 100

# Write daily files. Set lineterminator to CRLF (important on macOS)
employees_20170225.to_csv("employee_20170225.txt", encoding="utf-8", sep="\t", lineterminator="\r\n")
employees_20170226.to_csv("employee_20170226.txt", encoding="utf-8", sep="\t", lineterminator="\r\n")
employees_20170227.to_csv("employee_20170227.txt", encoding="utf-8", sep="\t", lineterminator="\r\n")
employees_20170228.to_csv("employee_20170228.txt", encoding="utf-8", sep="\t", lineterminator="\r\n")

Nun werden alle Dateien, die mit dem Namen *employee_* beginnen, gelesen.

In [4]:
def read_customers(filename):
    # Extract date of file from filename with regular expression.
    match = re.search(r"_(?P<date>\d{8}).txt", filename)
    if match is None: raise Exception(f"Invalid filename: {filename}, no date found.")
    filedate = pd.to_datetime(match.group("date"), format="%Y%m%d")
    # Load file into dataframe
    employees = pd.read_csv(filename, sep="\t", encoding='utf-8',
        dtype={"EmployeeID": int, "FirstName": "string", "LastName": "string",
                "Department": "string", "Salary": float})
    employees["RecordStartDate"] = filedate
    return (filedate, employees)

In [5]:
filenames = sorted(map(str, Path(".").glob("employee_*.txt")))
for filename in filenames:
    print(f"Import {filename}...")
    filedate, customers_new = read_customers(filename)
    with engine.connect() as conn:
        # Write the content of the new text file to temp table
        customers_new.to_sql("Employee_TMP", conn, if_exists="replace", index=False)
        conn.commit()
        # Insert missing customers or customers who have changed one of the fields.
        conn.execute(sqlalchemy.text("""
            INSERT INTO Employee
            SELECT EmployeeID, RecordStartDate, '9999-12-31', FirstName, LastName, Department, Salary
            FROM Employee_TMP tmp
            WHERE NOT EXISTS (SELECT * FROM Employee e
                WHERE 
                    e.EmployeeID = tmp.EmployeeID AND e.FirstName = tmp.FirstName AND
                    e.LastName = tmp.LastName AND e.Department = tmp.Department AND e.Salary = tmp.Salary)
        """))
        conn.commit()
        # Update the enddate if we have inserted a second record (case 1) or if the customer is not
        # present in the textfile (case 2).
        conn.execute(sqlalchemy.text("""
            UPDATE e SET e.RecordEndDate = :enddate
            FROM Employee e
            WHERE e.RecordEndDate = '9999-12-31' AND (
                NOT EXISTS (SELECT * FROM Employee_TMP tmp WHERE tmp.EmployeeID = e.EmployeeID) OR
                EXISTS (SELECT * FROM Employee e2 WHERE e2.EmployeeID = e.EmployeeID AND e2.RecordStartDate > e.RecordStartDate))
        """), { "enddate": filedate - pd.Timedelta(1, "day") })
        conn.commit()



Import employee_20170225.txt...
Import employee_20170226.txt...
Import employee_20170227.txt...
Import employee_20170228.txt...


## Prüfen des Importes

Die Employee Tabelle lesen:

In [6]:
with engine.connect() as conn:
    customer_check = pd.read_sql(sqlalchemy.text("SELECT * FROM Employee"), conn)
print(f"{len(customer_check)} Datensätze gelesen.")
display(customer_check.sort_values(["EmployeeID", "RecordStartDate"]))

12 Datensätze gelesen.


Unnamed: 0,EmployeeID,RecordStartDate,RecordEndDate,FirstName,LastName,Department,Salary
0,1,2017-02-25,2017-02-25,Silvia,Wesack,Sales,3080.88
1,2,2017-02-25,2017-02-25,Nikolaus,Käster,Logistics,2974.13
2,2,2017-02-26,2017-02-27,Nikolaus,Käster,Customer service,2974.13
3,3,2017-02-25,2017-02-25,Isabelle,Zorbach,Sales,3031.36
4,3,2017-02-26,2017-02-27,Isabelle,Zorbach,Sales,3131.36
5,3,2017-02-28,9999-12-31,Isabelle,Zorbach,Sales,2931.36
6,4,2017-02-25,9999-12-31,Alida,Römer,IT,3058.16
7,6,2017-02-26,2017-02-26,Ellinor,Rogge,Logistics,2910.96
8,6,2017-02-27,9999-12-31,Ellinor,Rogge,Logistics,2810.96
9,7,2017-02-27,9999-12-31,Danielle,Schmidtke,Sales,3095.95
