
EXTRACTION 

In [16]:
import glob # for path of file
import pandas as pd
import xml.etree.ElementTree as ET
from datetime import datetime

In [1]:
log_file = "log_file"
target_file = "transformed_data.csv" 

In [5]:
# extract from CSV file 
def extract_from_csv(file_to_process):
    df = pd.read_csv(file_to_process)
    return df

In [6]:
# extract from JSON file 
def extract_from_json(file_to_process):
    df = pd.read_json(file_to_process, lines=True)
    return df

In [10]:
import pandas as pd

# Create DataFrames
df1 = pd.DataFrame({'A': [1, 2], 'B': [3, 4]})
df2 = pd.DataFrame({'A': [5], 'B': [6]})

# Use concat to add data
result = pd.concat([df1, df2], ignore_index=True)
print(result)

   A  B
0  1  3
1  2  4
2  5  6


In [14]:
# extract from XML file 

def extract_from_xml(file_to_process): 
    dataframe = pd.DataFrame(columns=["name", "height", "weight"]) 
    tree = ET.parse(file_to_process) 
    root = tree.getroot() 
    for person in root: 
        name = person.find("name").text 
        height = float(person.find("height").text) 
        weight = float(person.find("weight").text) 
        dataframe = pd.concat([dataframe, pd.DataFrame([{"name":name, "height":height, "weight":weight}])], ignore_index=True) 
    return dataframe 

In [15]:
def extract(): 
    extracted_data = pd.DataFrame(columns=['name','height','weight']) # create an empty data frame to hold extracted data 
     
    # process all csv files, except the target file
    for csvfile in glob.glob("*.csv"): 
        if csvfile != target_file:  # check if the file is not the target file
            extracted_data = pd.concat([extracted_data, pd.DataFrame(extract_from_csv(csvfile))], ignore_index=True) 
         
    # process all json files 
    for jsonfile in glob.glob("*.json"): 
        extracted_data = pd.concat([extracted_data, pd.DataFrame(extract_from_json(jsonfile))], ignore_index=True) 
     
    # process all xml files 
    for xmlfile in glob.glob("*.xml"): 
        extracted_data = pd.concat([extracted_data, pd.DataFrame(extract_from_xml(xmlfile))], ignore_index=True) 
         
    return extracted_data 

TRANSFORMATION 

In [17]:
def transform(data): 
    data['height'] = round(data.height * 0.0254, 2)
    data['weight'] = round(data.weight * 0.45359237, 2)

    return data
    

LOADING AND LOGGING

In [18]:
def load_data(target_file, transformed_data): 
    transformed_data.to_csv(target_file)

In [19]:
def log_progress(message): 
    timestamp_format = '%Y-%h-%d-%H:%M:%S'
    now = datetime.now() # get current timestamp 
    timestamp = now.strftime(timestamp_format) 
    with open(log_file,"a") as f: 
        f.write(timestamp + ',' + message + '\n') 

TESTING AND LOGGING 

In [22]:
# Log the initialization of the ETL process 
log_progress("ETL Job Started") 
 

 


In [23]:
# Log the beginning of the Extraction process 
log_progress("Extract phase Started") 
extracted_data = extract() 

  extracted_data = pd.concat([extracted_data, pd.DataFrame(extract_from_csv(csvfile))], ignore_index=True)
  dataframe = pd.concat([dataframe, pd.DataFrame([{"name":name, "height":height, "weight":weight}])], ignore_index=True)


In [24]:
# Log the completion of the Extraction process 
log_progress("Extract phase Ended") 
 

In [25]:
# Log the beginning of the Transformation process 
log_progress("Transform phase Started") 
transformed_data = transform(extracted_data) 
print("Transformed Data") 
print(transformed_data) 
 

Transformed Data
     name  height  weight
0    alex    1.67   51.25
1    ajay    1.82   61.91
2   alice    1.76   69.41
3    ravi    1.73   64.56
4     joe    1.72   65.45
5    jack    1.74   55.93
6     tom    1.77   64.18
7   tracy    1.78   61.90
8    john    1.72   50.97
9   simon    1.72   50.97
10  jacob    1.70   54.73
11  cindy    1.69   57.81
12   ivan    1.72   51.77


In [26]:
# Log the completion of the Transformation process 
log_progress("Transform phase Ended") 
 

In [27]:
# Log the beginning of the Loading process 
log_progress("Load phase Started") 
load_data(target_file,transformed_data) 
 

In [28]:
 #Log the completion of the Loading process 
log_progress("Load phase Ended") 
 

In [29]:
# Log the completion of the ETL process 
log_progress("ETL Job Ended") 