# Sample data from Kaggle
- link: https://www.kaggle.com/datasets/tawfikelmetwally/employee-dataset/

## Import the required modules

In [1]:
import os
import logging
import configparser

# Read the config file

In [2]:
# Load the project configuration from <parent of cwd>/configs/config.ini.
config = configparser.ConfigParser()
root_path = os.path.abspath(os.path.join(os.getcwd(), ".."))
config_file = os.path.join(root_path, "configs", "config.ini")

# ConfigParser.read() silently skips files it cannot open and returns only the
# ones it parsed, so an empty result means the configuration was not loaded.
# Fail here with the offending path instead of a later KeyError on a section.
loaded_files = config.read(config_file)
if not loaded_files:
    raise FileNotFoundError(f"Config file not found or unreadable: {config_file}")

# Keep the list of loaded files as the cell's displayed value.
loaded_files

['/opt/spark/work-dir/configs/config.ini']

# Configure logging

In [4]:
# Configure the notebook's logger: INFO level, writing to
# <log_path>/employee_info.log where log_path comes from the [LOGS] config section.
logger = logging.getLogger("EMP_DELTA_UPLOAD")

# Set the level of the logger
logger.setLevel(logging.INFO)

# Resolve the log file path and make sure its directory exists
log_file_path = config['LOGS']['log_path']
file_path = os.path.join(log_file_path, "employee_info.log")
log_dir = os.path.dirname(file_path)
if log_dir:
    # os.makedirs("") raises, so only create the directory when there is one.
    os.makedirs(log_dir, exist_ok=True)

# logging.getLogger returns the same logger object on every call, so re-running
# this cell would otherwise attach one more FileHandler each time and every
# message would be written to the file repeatedly. Drop (and close) any handler
# already pointing at this file before attaching a fresh one.
target_log_file = os.path.abspath(file_path)
for stale_handler in list(logger.handlers):
    if (
        isinstance(stale_handler, logging.FileHandler)
        and stale_handler.baseFilename == target_log_file
    ):
        logger.removeHandler(stale_handler)
        stale_handler.close()

# Create a FileHandler object for the log file
file_handler = logging.FileHandler(file_path)

# Set the format for the log messages
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
file_handler.setFormatter(formatter)

# Logger handler
logger.addHandler(file_handler)


# Read from the data source [Flat file]

In [5]:
# Read the employee CSV into a Spark DataFrame (`empdf`), treating the first
# row as the header and letting Spark infer the column types.
# NOTE(review): `spark` is not created in this notebook; presumably the kernel
# provides the session — confirm.
try:
    logger.info("start reading the raw data")
    emp_file = os.path.join(config['DATA']['data_path'], "Datq", "datq_employees.csv")
    empdf = (spark
             .read.option("header", True)
             .option("inferSchema", True)
             .csv(emp_file)
            )
    logger.info("end reading the raw data")
except Exception:
    # Record the full traceback, then re-raise: if the read failed, `empdf`
    # is not defined and every later cell would fail with a misleading NameError.
    logger.exception("cannot read raw data")
    raise

INFO:EMP_DELTA_UPLOAD:end reading the raw data


In [8]:
# Preview a single record, one column per line.
empdf.show(n=1, truncate=True, vertical=True)

-RECORD 0-----------------------------------
 Age                      | 41              
 Attrition                | 1               
 BusinessTravel           | Travel_Rarely   
 DailyRate                | 1102            
 Department               | Sales           
 DistanceFromHome         | 1               
 Education                | 2               
 EducationField           | Life Sciences   
 EmployeeCount            | 1               
 EmployeeNumber           | 1               
 EnvironmentSatisfaction  | 2               
 Gender                   | Female          
 HourlyRate               | 94              
 JobInvolvement           | 3               
 JobLevel                 | 2               
 JobRole                  | Sales Executive 
 JobSatisfaction          | 4               
 MaritalStatus            | Single          
 MonthlyIncome            | 5993            
 MonthlyRate              | 19479           
 NumCompaniesWorked       | 8               
 Over18   

In [9]:
# Print the column names and the types of the loaded DataFrame.
empdf.printSchema()

root
 |-- Age: integer (nullable = true)
 |-- Attrition: integer (nullable = true)
 |-- BusinessTravel: string (nullable = true)
 |-- DailyRate: integer (nullable = true)
 |-- Department: string (nullable = true)
 |-- DistanceFromHome: integer (nullable = true)
 |-- Education: integer (nullable = true)
 |-- EducationField: string (nullable = true)
 |-- EmployeeCount: integer (nullable = true)
 |-- EmployeeNumber: integer (nullable = true)
 |-- EnvironmentSatisfaction: integer (nullable = true)
 |-- Gender: string (nullable = true)
 |-- HourlyRate: integer (nullable = true)
 |-- JobInvolvement: integer (nullable = true)
 |-- JobLevel: integer (nullable = true)
 |-- JobRole: string (nullable = true)
 |-- JobSatisfaction: integer (nullable = true)
 |-- MaritalStatus: string (nullable = true)
 |-- MonthlyIncome: integer (nullable = true)
 |-- MonthlyRate: integer (nullable = true)
 |-- NumCompaniesWorked: integer (nullable = true)
 |-- Over18: string (nullable = true)
 |-- OverTime: string

# Save the data as a Delta table

In [10]:
# Write `empdf` in Delta format to <delta_table_path>/datq/employee,
# replacing whatever is already stored at that location (mode "overwrite").
try:
    logger.info("start writing data to the delta table")
    emp_save_path = os.path.join(
        config["DELTA"]["delta_table_path"], "datq", "employee"
    )
    (empdf
      .write
      .format("delta")
      .mode("overwrite")
      .save(emp_save_path)
    )
    logger.info("end writing data to the delta table")
except Exception:
    # Log the traceback and re-raise so a failed write stops the notebook
    # instead of being silently swallowed.
    logger.exception("cannot write the data as delta table")
    raise

INFO:EMP_DELTA_UPLOAD:start writing data to the delta table
INFO:EMP_DELTA_UPLOAD:end writing data to the delta table                       


In [11]:
# Stop Spark once the write is done.
# NOTE(review): `spark` is not defined in this notebook; presumably supplied by the kernel — confirm.
spark.stop()