## Setup

In [1]:
import os
import glob                         # this module helps in selecting files 
import pandas as pd                 # this module helps in processing CSV files
import xml.etree.ElementTree as ET  # this module helps in processing XML files.
from datetime import datetime

In [2]:
DATA_ROOT = "./data"
TMP_FILE = "./data/temp/tmp" # file used to store all extracted data
LOG_FILE = "log.txt" # all event logs will be stored in this 
TARGET_FILE = "./data/transformed_data.csv" # file where transformed data is stored

In [4]:
def log(message):
    timestamp_format = '%Y-%h-%d-%H:%M:%S' # Year-Monthname-Day-Hour-Minute-Second
    now = datetime.now() # get current timestamp
    timestamp = now.strftime(timestamp_format)
    with open(LOG_FILE, "a") as f:
        f.write(timestamp + ',' + message + '\n')

## Extract

In [5]:
def extract_from_csv(file_to_process):
    dataframe = pd.read_csv(file_to_process)
    return dataframe

In [6]:
def extract_from_json(file_to_process):
    dataframe = pd.read_json(file_to_process, lines=True)
    return dataframe

In [7]:
def extract_from_xml(file_to_process):
    dataframe = pd.DataFrame(columns=['car_model','year_of_manufacture','price', 'fuel'])
    tree = ET.parse(file_to_process)
    root = tree.getroot()
    for person in root:
        car_model = person.find("car_model").text
        year_of_manufacture = int(person.find("year_of_manufacture").text)
        price = float(person.find("price").text)
        fuel = person.find("fuel").text
        dataframe = dataframe.append({"car_model":car_model, "year_of_manufacture":year_of_manufacture, "price":price, "fuel":fuel}, ignore_index=True)
    return dataframe

In [15]:
def extract():
    extracted_data = pd.DataFrame(columns=['car_model','year_of_manufacture','price', 'fuel']) # create an empty data frame to hold extracted data
    
    #process all csv files
    for csvfile in glob.glob(os.path.join(DATA_ROOT, "*.csv")):
        extracted_data = extracted_data.append(extract_from_csv(csvfile), ignore_index=True)
        
    #process all json files
    for jsonfile in glob.glob(os.path.join(DATA_ROOT, "*.json")):
        extracted_data = extracted_data.append(extract_from_json(jsonfile), ignore_index=True)
    
    #process all xml files
    for xmlfile in glob.glob(os.path.join(DATA_ROOT, "*.xml")):
        extracted_data = extracted_data.append(extract_from_xml(xmlfile), ignore_index=True)
        
    return extracted_data

## Transform

The transform function does the following tasks.

1.  Round the `price` columns to 2 decimal places

In [9]:
def transform(data):
    data['price'] = round(data.price, 2)
    return data

## Load

In [10]:
def load(targetfile, data_to_load):
    data_to_load.to_csv(targetfile) 

## Running ETL Process

In [19]:
log("ETL Job Started")

In [20]:
log("Extract phase Started")
extracted_data = extract()
log("Extract phase Ended")
extracted_data

Unnamed: 0.1,car_model,year_of_manufacture,price,fuel,Unnamed: 0
0,ritz,2014,5000.000000,Petrol,
1,sx4,2013,7089.552239,Diesel,
2,ciaz,2017,10820.895522,Petrol,
3,wagon r,2011,4253.731343,Petrol,
4,swift,2014,6865.671642,Diesel,
...,...,...,...,...,...
175,etios liva,2014,7089.552239,Diesel,
176,innova,2017,29477.611940,Petrol,
177,fortuner,2010,13805.970149,Diesel,
178,corolla altis,2011,6492.537313,Petrol,


In [21]:
log("Transform phase Started")
transformed_data = transform(extracted_data)
log("Transform phase Ended")
transformed_data 

Unnamed: 0.1,car_model,year_of_manufacture,price,fuel,Unnamed: 0
0,ritz,2014,5000.00,Petrol,
1,sx4,2013,7089.55,Diesel,
2,ciaz,2017,10820.90,Petrol,
3,wagon r,2011,4253.73,Petrol,
4,swift,2014,6865.67,Diesel,
...,...,...,...,...,...
175,etios liva,2014,7089.55,Diesel,
176,innova,2017,29477.61,Petrol,
177,fortuner,2010,13805.97,Diesel,
178,corolla altis,2011,6492.54,Petrol,


In [22]:
log("Load phase Started")
load(TARGET_FILE, transformed_data)
log("Load phase Ended")

In [None]:
log("ETL Job Ended")