In [10]:
import pandas as pd
import glob
from datetime import datetime
import xml.etree.ElementTree as ET

In [26]:
# Text file that log() appends timestamped progress messages to.
logfile = "../All Data_Sets/logfile.txt"
# CSV file that the load step writes the transformed data to.
targetfile = "../All Data_Sets/targetfile.csv"

In [29]:
def extract_from_csv(file_to_process):
    """Read one CSV file into a DataFrame.

    Parameters
    ----------
    file_to_process : str or path-like
        Location of the CSV file; it is handed to ``pd.read_csv`` with
        default parsing options.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the file.
    """
    return pd.read_csv(file_to_process)


def extract_from_json(file_to_process):
    """Read one line-delimited JSON file into a DataFrame.

    Parameters
    ----------
    file_to_process : str or path-like
        Location of the file. It is parsed with ``lines=True``, i.e. each
        line of the file is expected to hold one JSON object (one record).

    Returns
    -------
    pandas.DataFrame
        One row per JSON line.
    """
    return pd.read_json(file_to_process, lines=True)


def extract_from_xml(file_to_process):
    """Read one XML file of car records into a DataFrame.

    Every direct child of the XML root is treated as one record and must
    contain ``car_model``, ``year_of_manufacture``, ``price`` and ``fuel``
    child elements.

    Parameters
    ----------
    file_to_process : str or path-like
        Location of the XML file.

    Returns
    -------
    pandas.DataFrame
        One row per record with columns ``car_model`` (str),
        ``year_of_manufacture`` (int), ``price`` (float) and ``fuel`` (str).
        A root without children yields an empty frame with these columns.

    Raises
    ------
    xml.etree.ElementTree.ParseError
        If the file is not well-formed XML.
    AttributeError
        If a record lacks one of the four expected child elements.
    ValueError
        If the year or price text cannot be converted to a number.
    """
    columns = ['car_model', 'year_of_manufacture', 'price', 'fuel']
    root = ET.parse(file_to_process).getroot()

    # Collect plain dicts and build the frame once. Concatenating a Series
    # onto a DataFrame row by row would put the values into a stray column
    # named 0 instead of adding a row, and costs quadratic time.
    records = [
        {
            "car_model": record.find("car_model").text,
            "year_of_manufacture": int(record.find("year_of_manufacture").text),
            "price": float(record.find("price").text),
            "fuel": record.find("fuel").text,
        }
        for record in root
    ]
    return pd.DataFrame(records, columns=columns)

In [24]:
def extract(data_dir="../All Data_Sets/Data For Extraction"):
    """Read every CSV, JSON and XML file in ``data_dir`` into one DataFrame.

    Files are processed by type (CSV, then JSON, then XML) and, within each
    type, in sorted path order so that the row order of the result does not
    depend on the order in which the filesystem lists the files.

    Parameters
    ----------
    data_dir : str, optional
        Directory holding the source files. Defaults to the notebook's
        extraction folder.

    Returns
    -------
    pandas.DataFrame
        All rows from all files with a fresh 0..n-1 index. The columns
        ``car_model``, ``year_of_manufacture``, ``price`` and ``fuel`` come
        first; any additional columns found in the files follow. If no
        files match, an empty frame with the four expected columns is
        returned.
    """
    expected_columns = ['car_model', 'year_of_manufacture', 'price', 'fuel']
    readers = (
        ("csv", extract_from_csv),
        ("json", extract_from_json),
        ("xml", extract_from_xml),
    )

    # Gather the per-file frames first and concatenate once; concatenating
    # inside the loop re-copies all accumulated rows on every iteration.
    frames = []
    for extension, reader in readers:
        for path in sorted(glob.glob(data_dir + "/*." + extension)):
            frames.append(reader(path))

    if not frames:
        return pd.DataFrame(columns=expected_columns)

    combined = pd.concat(frames, ignore_index=True)
    # Keep the expected columns in front regardless of which file came first.
    extra_columns = [col for col in combined.columns if col not in expected_columns]
    return combined.reindex(columns=expected_columns + extra_columns)

In [13]:
def transform(data):
    """Round the ``price`` column to two decimal places.

    The rounding uses Python's built-in ``round`` on each value. The column
    is replaced on the DataFrame that was passed in, and that same object is
    returned.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a numeric ``price`` column.

    Returns
    -------
    pandas.DataFrame
        The input frame (same object) with ``price`` rounded.
    """
    def _to_two_decimals(value):
        return round(value, 2)

    rounded_prices = data['price'].apply(_to_two_decimals)
    data['price'] = rounded_prices
    return data

In [22]:
def load(targetfile, data_to_load):
    """Write a DataFrame to a CSV file.

    Uses ``DataFrame.to_csv`` with its default options, so the frame's index
    is written as the first (unnamed) column.

    Parameters
    ----------
    targetfile : str or path-like
        Destination path of the CSV file.
    data_to_load : pandas.DataFrame
        The data to write.
    """
    data_to_load.to_csv(path_or_buf=targetfile)
    

def log(message, log_path=None):
    """Append a timestamped message to the log file.

    Each call writes one line of the form
    ``HH:MM:SS-Mon-DD-YYYY : message``.

    Parameters
    ----------
    message : str
        Text to record.
    log_path : str or path-like, optional
        File to append to. When omitted, the module-level ``logfile`` is
        used.
    """
    # %b is the documented directive for the abbreviated month name; %h is a
    # platform-specific alias that is not available everywhere.
    timestamp_format = '%H:%M:%S-%b-%d-%Y'
    timestamp = datetime.now().strftime(timestamp_format)
    destination = logfile if log_path is None else log_path
    with open(destination, "a") as f:
        f.write(timestamp + ' : ' + message + '\n')

In [30]:
# Run the ETL job: extract -> transform -> load.
# log() is called before and after every phase, so the log file records
# how far a run got.
log("ETL Job Started")
# Extract: gather the source files into one DataFrame.
log("Extract phase Started")
extracted_data = extract()
log("Extract phase Ended")
# Transform: pass the extracted frame through transform().
log("Transform phase Started")
transformed_data = transform(extracted_data)
log("Transform phase Ended")
# Load: write the transformed frame to `targetfile`.
log("Load phase Started")
load(targetfile,transformed_data)
log("Load phase Ended")
log("ETL Job Ended")