# First exercise for the ETL course

## Importing the datasets

In [7]:
import os

import pandas as pd
import psycopg2
import yaml
from psycopg2 import sql
from sqlalchemy import create_engine, text
from sqlalchemy.engine import URL

In [8]:
def load_config(file_path="config.yaml"):
    """Read a YAML configuration file and return its parsed content.

    Args:
        file_path: Path of the YAML file to read. Defaults to "config.yaml",
            which is resolved against the current working directory.

    Returns:
        The object produced by ``yaml.safe_load`` for the document
        (``None`` when the file is empty).

    Raises:
        OSError: If the file cannot be opened.
        yaml.YAMLError: If the content is not valid YAML.
    """
    # Decode as UTF-8 explicitly: without it the platform default encoding is
    # used, which can garble non-ASCII values (e.g. a password) on some systems.
    with open(file_path, "r", encoding="utf-8") as file:
        return yaml.safe_load(file)
        

Loading the dataset from a local file and checking the maximum character length of each column, which is needed to size the columns when the database table is created

In [9]:
# Show the working directory, since the CSV path below is relative to it.
print(os.getcwd())
dataset = pd.read_csv("dataset/candidates.csv", sep=";")

# Report the longest string representation found in every column.
for column in dataset.columns:
    max_len = max((len(value) for value in dataset[column].astype(str)), default=0)
    print(f"Max length in column '{column}': {max_len}")

/Users/santiagoaristizabal/Learning/python/etl_course/Exercise
Max length in column 'First Name': 11
Max length in column 'Last Name': 13
Max length in column 'Email': 36
Max length in column 'Application Date': 10
Max length in column 'Country': 51
Max length in column 'YOE': 2
Max length in column 'Seniority': 9
Max length in column 'Technology': 39
Max length in column 'Code Challenge Score': 2
Max length in column 'Technical Interview Score': 2


Renaming the columns of the dataset so that they match the column names of the database table

In [10]:
# CSV header -> lowercase, space-free column name.
column_names = {
    "First Name": "name",
    "Last Name": "lastname",
    "Email": "email",
    "Application Date": "applicationdate",
    "Country": "country",
    "YOE": "yoe",
    "Seniority": "seniority",
    "Technology": "technology",
    "Code Challenge Score": "codescore",
    "Technical Interview Score": "interviewscore",
}
dataset.rename(columns=column_names, inplace=True)

In [12]:
# Pull the connection settings out of the "database" section of the config.
config = load_config()
db_config = config["database"]
db_user, db_password, db_host, db_port, db_name = (
    db_config[key] for key in ("user", "password", "host", "port", "name")
)

# Connect to the "postgres" database (not db_name): this connection is the one
# later used to issue CREATE DATABASE for db_name.
conn = psycopg2.connect(
    host=db_host,
    port=db_port,
    user=db_user,
    password=db_password,
    dbname="postgres",
)
# PostgreSQL refuses CREATE DATABASE inside a transaction block, hence autocommit.
conn.autocommit = True

In [13]:
# Create the target database; an already existing one is reported, not treated
# as an error. The connection is closed in every case.
try:
    with conn.cursor() as cur:
        # sql.Identifier quotes the name safely instead of string formatting.
        create_db = sql.SQL("CREATE DATABASE {}").format(sql.Identifier(db_name))
        cur.execute(create_db)
except psycopg2.errors.DuplicateDatabase:
    print(f"La base de datos '{db_name}' ya existe.")
else:
    print(f"Base de datos '{db_name}' creada exitosamente")
finally:
    conn.close()

La base de datos 'candidates_etl_2' ya existe.


Creating the table in the database

In [14]:
# Build the URL from its parts instead of an f-string: URL.create escapes
# characters such as "@", ":" or "/" in the user name or password, which would
# otherwise corrupt the connection string.
db_url = URL.create(
    drivername="postgresql",
    username=db_user,
    password=db_password,
    host=db_host,
    port=db_port,
    database=db_name,
)
engine = create_engine(db_url)

# Create the raw candidates table (no-op when it already exists).
with engine.connect() as conn:
    conn.execute(text("""
        CREATE TABLE IF NOT EXISTS candidates_etl_2 (
            id SERIAL PRIMARY KEY,
            name VARCHAR(50),
            lastname VARCHAR(50),
            email VARCHAR(50),
            applicationdate DATE,
            country VARCHAR(100),
            yoe INT,
            seniority VARCHAR(20),
            technology VARCHAR(50),
            codescore INT,
            interviewscore INT
        );
    """))
    # The connection does not autocommit; without this the DDL is rolled back
    # when the connection is closed.
    conn.commit()
    print(f"{db_name} creada exitosamente")

candidates_etl_2 creada exitosamente


Loading the CSV data into the PostgreSQL database

In [15]:
# to_sql receives the engine and manages its own connection, so no extra
# connection is opened here.
# NOTE: if_exists="append" adds to whatever is already stored, so re-running
# this cell inserts the same rows a second time.
dataset.to_sql("candidates_etl_2", con=engine, if_exists="append", index=False)
print("Base de datos montada a sql")

Base de datos montada a sql


Reading the first 10 rows from the database

In [18]:
# Number of rows to preview.
preview_rows = 10

# Let the database return only the rows that are shown instead of fetching the
# whole table; ORDER BY id makes "first" well defined.
with engine.connect() as conn:
    result = conn.execute(
        text("SELECT * FROM candidates_etl_2 ORDER BY id LIMIT :limit"),
        {"limit": preview_rows},
    )
    rows = result.fetchall()

print("Datos en 'candidates_etl_2' :")
for row in rows:
    print(row)

Datos en 'candidates_etl_2' :
(1, 'Bernadette', 'Langworth', 'leonard91@yahoo.com', datetime.date(2021, 2, 26), 'Norway', 2, 'Intern', 'Data Engineer', 3, 3)
(2, 'Camryn', 'Reynolds', 'zelda56@hotmail.com', datetime.date(2021, 9, 9), 'Panama', 10, 'Intern', 'Data Engineer', 2, 10)
(3, 'Larue', 'Spinka', 'okey_schultz41@gmail.com', datetime.date(2020, 4, 14), 'Belarus', 4, 'Mid-Level', 'Client Success', 10, 9)
(4, 'Arch', 'Spinka', 'elvera_kulas@yahoo.com', datetime.date(2020, 10, 1), 'Eritrea', 25, 'Trainee', 'QA Manual', 7, 1)
(5, 'Larue', 'Altenwerth', 'minnie.gislason@gmail.com', datetime.date(2020, 5, 20), 'Myanmar', 13, 'Mid-Level', 'Social Media Community Management', 9, 7)
(6, 'Alec', 'Abbott', 'juanita_hansen@gmail.com', datetime.date(2019, 8, 17), 'Zimbabwe', 8, 'Junior', 'Adobe Experience Manager', 2, 9)
(7, 'Allison', 'Jacobs', 'alba_rolfson27@yahoo.com', datetime.date(2018, 5, 18), 'Wallis and Futuna', 19, 'Trainee', 'Sales', 2, 9)
(8, 'Nya', 'Skiles', 'madisen.zulauf@gmail

In [16]:
# Read the complete raw table back into a DataFrame.
raw_query = "SELECT * FROM candidates_etl_2"
with engine.connect() as conn:
    raw_df = pd.read_sql(raw_query, con=conn)

In [17]:
# Display the first five rows of the frame read from the database.
raw_df.head(n=5)

Unnamed: 0,id,name,lastname,email,applicationdate,country,yoe,seniority,technology,codescore,interviewscore
0,1,Bernadette,Langworth,leonard91@yahoo.com,2021-02-26,Norway,2,Intern,Data Engineer,3,3
1,2,Camryn,Reynolds,zelda56@hotmail.com,2021-09-09,Panama,10,Intern,Data Engineer,2,10
2,3,Larue,Spinka,okey_schultz41@gmail.com,2020-04-14,Belarus,4,Mid-Level,Client Success,10,9
3,4,Arch,Spinka,elvera_kulas@yahoo.com,2020-10-01,Eritrea,25,Trainee,QA Manual,7,1
4,5,Larue,Altenwerth,minnie.gislason@gmail.com,2020-05-20,Myanmar,13,Mid-Level,Social Media Community Management,9,7


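Creating the hired column: a candidate is marked as hired when both the code challenge score and the technical interview score are at least 7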

In [61]:
# Minimum score required in BOTH evaluations for a candidate to count as hired.
passing_score = 7

# Work on a copy so raw_df stays untouched.
df_transform = raw_df.copy()

# Vectorized comparison instead of a row-wise apply: same YES/NO labels, but
# evaluated column-wise and also well defined for an empty frame.
passed_both = (
    (df_transform["codescore"] >= passing_score)
    & (df_transform["interviewscore"] >= passing_score)
)
df_transform["hired"] = passed_both.map({True: "YES", False: "NO"})

df_transform.head()

Unnamed: 0,id,name,lastname,email,applicationdate,country,yoe,seniority,technology,codescore,interviewscore,hired
0,1,Bernadette,Langworth,leonard91@yahoo.com,2021-02-26,Norway,2,Intern,Data Engineer,3,3,NO
1,2,Camryn,Reynolds,zelda56@hotmail.com,2021-09-09,Panama,10,Intern,Data Engineer,2,10,NO
2,3,Larue,Spinka,okey_schultz41@gmail.com,2020-04-14,Belarus,4,Mid-Level,Client Success,10,9,YES
3,4,Arch,Spinka,elvera_kulas@yahoo.com,2020-10-01,Eritrea,25,Trainee,QA Manual,7,1,NO
4,5,Larue,Altenwerth,minnie.gislason@gmail.com,2020-05-20,Myanmar,13,Mid-Level,Social Media Community Management,9,7,YES


Creating the new table with the hired column

In [62]:
# Create the table for the transformed data (no-op when it already exists).
with engine.connect() as conn:
    conn.execute(text("""
        CREATE TABLE IF NOT EXISTS candidates_etl_transformed (
            id SERIAL PRIMARY KEY,
            name VARCHAR(50),
            lastname VARCHAR(50),
            email VARCHAR(50),
            applicationdate DATE,
            country VARCHAR(100),
            yoe INT,
            seniority VARCHAR(20),
            technology VARCHAR(50),
            codescore INT,
            interviewscore INT,
            hired VARCHAR(5)
        );
    """))
    # Without an explicit commit the CREATE TABLE is rolled back when the
    # connection is closed, and the table defined above never exists.
    conn.commit()

Loading the transformed data into the new table

In [63]:
# to_sql receives the engine and manages its own connection, so no extra
# connection is opened here.
# NOTE: df_transform carries the id column read from the raw table and
# if_exists="append" adds to the rows already stored, so re-running this cell
# inserts the same rows again (or fails on id if the table enforces the
# primary key).
df_transform.to_sql("candidates_etl_transformed", con=engine, if_exists="append", index=False)

print("Transformed data stored successfully in 'candidates_etl_transformed'.")

Transformed data stored successfully in 'candidates_etl_transformed'.


In [64]:
# Read the transformed table back to verify what was stored.
transformed_query = "SELECT * FROM candidates_etl_transformed"
with engine.connect() as conn:
    db_transformed_df = pd.read_sql(transformed_query, con=conn)

db_transformed_df

Unnamed: 0,id,name,lastname,email,applicationdate,country,yoe,seniority,technology,codescore,interviewscore,hired
0,1,Bernadette,Langworth,leonard91@yahoo.com,2021-02-26,Norway,2,Intern,Data Engineer,3,3,NO
1,2,Camryn,Reynolds,zelda56@hotmail.com,2021-09-09,Panama,10,Intern,Data Engineer,2,10,NO
2,3,Larue,Spinka,okey_schultz41@gmail.com,2020-04-14,Belarus,4,Mid-Level,Client Success,10,9,YES
3,4,Arch,Spinka,elvera_kulas@yahoo.com,2020-10-01,Eritrea,25,Trainee,QA Manual,7,1,NO
4,5,Larue,Altenwerth,minnie.gislason@gmail.com,2020-05-20,Myanmar,13,Mid-Level,Social Media Community Management,9,7,YES
...,...,...,...,...,...,...,...,...,...,...,...,...
49995,49996,Bethany,Shields,rocky_mitchell@hotmail.com,2022-01-09,Dominican Republic,27,Trainee,Security,2,1,NO
49996,49997,Era,Swaniawski,dolores.roob@hotmail.com,2020-06-02,Morocco,21,Lead,Game Development,1,2,NO
49997,49998,Martin,Lakin,savanah.stracke@gmail.com,2018-12-15,Uganda,20,Trainee,System Administration,6,1,NO
49998,49999,Aliya,Abernathy,vivienne.fritsch@yahoo.com,2020-05-30,Czech Republic,20,Senior,Database Administration,0,0,NO
