# EXTRACT

In [3]:
import pandas as pd
import numpy as np
import xml.etree.ElementTree as ET
import re
import os

# Directory (relative to the working directory) that is scanned for the source CSV/JSON/XML files.
DATA_BASE_DIR = "data_source"

def extract_file_to_dataframe(file_name: str):
    """
    Extract one CSV, JSON, or XML file into a Pandas DataFrame.

    The format is chosen from the extension reported by ``get_file_extension``:

    * ``csv``  -- read with ``pd.read_csv``.
    * ``json`` -- read with ``pd.read_json(..., lines=True)``.
    * ``xml``  -- every child of the root element becomes one row; the text of
      its ``car_model``, ``year_of_manufacture``, ``price`` and ``fuel``
      children is stored as-is (strings, no numeric conversion).

    Parameters
    ----------
    file_name : str
        Path of the file to read.

    Returns
    -------
    pandas.DataFrame or None
        The extracted rows, or ``None`` when the extension is not one of the
        three supported formats.

    Raises
    ------
    AttributeError
        If an XML record lacks one of the four expected child elements.
    """
    xml_columns = ["car_model", "year_of_manufacture", "price", "fuel"]
    file_extension = get_file_extension(file_name)

    if file_extension == "csv":
        return pd.read_csv(file_name)

    if file_extension == "json":
        return pd.read_json(file_name, lines=True)

    if file_extension == "xml":
        root = ET.parse(file_name).getroot()
        # Collect plain dicts and build the frame once: concatenating a
        # one-row DataFrame per record re-copies the whole frame on every
        # iteration and leaves every row with index label 0.
        records = [
            {column: car.find(column).text for column in xml_columns}
            for car in root
        ]
        # Passing `columns` keeps the four expected columns even when the
        # file contains no records.
        return pd.DataFrame(records, columns=xml_columns)

    return None

def combine_extractions_to_dataframe():
    """
    Extract every supported file in ``DATA_BASE_DIR`` and stack the results.

    Only the top level of the directory is listed (no recursion). Files whose
    extension is not ``csv``, ``json`` or ``xml`` are skipped. File names are
    processed in sorted order so the row order of the result does not depend
    on the order in which the operating system lists the directory.

    Returns
    -------
    pandas.DataFrame
        All extracted rows with a fresh 0..n-1 index. When no supported file
        is found, an empty frame with the columns ``car_model``,
        ``year_of_manufacture``, ``price`` and ``fuel`` is returned.
    """
    supported_extensions = ("csv", "json", "xml")
    # os.path.join picks the right separator for the current platform; the
    # previous hand-built "\\" path only resolved on Windows.
    data_dir = os.path.join(os.getcwd(), DATA_BASE_DIR)

    frames = []
    for file_name in sorted(os.listdir(data_dir)):
        if get_file_extension(file_name) not in supported_extensions:
            continue
        frames.append(extract_file_to_dataframe(os.path.join(data_dir, file_name)))

    if not frames:
        return pd.DataFrame(columns=["car_model", "year_of_manufacture", "price", "fuel"])

    # One concat over all frames instead of re-copying the accumulated frame
    # once per file.
    return pd.concat(frames, ignore_index=True)


## Helper method
def get_file_extension(file_name: str):
    """
    Return the extension of a file name or path, without the leading dot.

    Only the final path component is inspected, and the extension is the text
    after its LAST dot. Dots in directory names (``C:\\Users\\john.doe\\...``)
    or earlier in the file name (``archive.tar.gz`` -> ``gz``) therefore do
    not leak into the result.

    Parameters
    ----------
    file_name : str
        A bare file name or a path using ``/`` or ``\\`` separators.

    Returns
    -------
    str or None
        The extension exactly as written (case is not changed), an empty
        string when the name ends with a dot, or ``None`` when the final path
        component contains no dot at all.
    """
    # Split on both separator styles so Windows paths are handled on any platform.
    base_name = re.split(r"[\\/]", file_name)[-1]
    _stem, dot, extension = base_name.rpartition(".")
    return extension if dot else None

In [4]:
# Run the extract step over every supported file in the data directory and show the result.
used_car_prices_df = combine_extractions_to_dataframe()
used_car_prices_df

Unnamed: 0,car_model,year_of_manufacture,price,fuel
0,ritz,2014,5000.0,Petrol
1,sx4,2013,7089.552239,Diesel
2,ciaz,2017,10820.895522,Petrol
3,wagon r,2011,4253.731343,Petrol
4,swift,2014,6865.671642,Diesel
...,...,...,...,...
85,camry,2006,3731.3432835820895,Petrol
86,land cruiser,2010,52238.80597014925,Diesel
87,corolla altis,2012,8805.970149253732,Petrol
88,etios liva,2013,5149.253731343284,Petrol


In [8]:
# Debug aid: show the working directory that combine_extractions_to_dataframe builds its file paths from.
os.getcwd()

'c:\\Users\\prisc\\Documents\\2_MOOC-COURSES\\Python Project for Data Engineering (Coursera)\\ETL_Lab'

In [6]:
# Sanity check: list the files present in the data directory.
# `os` is already imported in the first cell, and the directory comes from
# DATA_BASE_DIR so this listing cannot drift from the one the extract step uses.
print(os.listdir(f"./{DATA_BASE_DIR}"))

['datasource.zip', 'used_car_prices1.csv', 'used_car_prices1.json', 'used_car_prices1.xml', 'used_car_prices2.csv', 'used_car_prices2.json', 'used_car_prices2.xml', 'used_car_prices3.csv', 'used_car_prices3.json', 'used_car_prices3.xml']


# TRANSFORM

In [8]:
def round_prices_to_two_decimals(df=None):
    """
    Convert the ``price`` column to float and round it to two decimals, in place.

    Parameters
    ----------
    df : pandas.DataFrame, optional
        Frame to update. When omitted, the notebook-level
        ``used_car_prices_df`` is used, which keeps the original
        zero-argument call working unchanged.

    Returns
    -------
    pandas.DataFrame
        The same frame object that was modified.
    """
    if df is None:
        # Fall back to the notebook-level frame for backward compatibility.
        df = used_car_prices_df
    # Cast first: the price column may hold strings alongside numbers.
    df["price"] = np.round(df["price"].astype(float), 2)
    return df

# Round the prices of the notebook-level frame in place, then display the updated frame.
round_prices_to_two_decimals()
used_car_prices_df

Unnamed: 0,car_model,year_of_manufacture,price,fuel
0,ritz,2014,5000.00,Petrol
1,sx4,2013,7089.55,Diesel
2,ciaz,2017,10820.90,Petrol
3,wagon r,2011,4253.73,Petrol
4,swift,2014,6865.67,Diesel
...,...,...,...,...
85,camry,2006,3731.34,Petrol
86,land cruiser,2010,52238.81,Diesel
87,corolla altis,2012,8805.97,Petrol
88,etios liva,2013,5149.25,Petrol


# LOAD

In [10]:
def convert_dataframe_to_csv(df=None, output_path="combined_used_car_prices.csv"):
    """
    Write a DataFrame to a CSV file without the index column.

    Parameters
    ----------
    df : pandas.DataFrame, optional
        Frame to write. When omitted, the notebook-level
        ``used_car_prices_df`` is used, which keeps the original
        zero-argument call working unchanged.
    output_path : str, optional
        Destination file. Defaults to ``combined_used_car_prices.csv``; a
        relative path is resolved against the current working directory.

    Returns
    -------
    None
    """
    if df is None:
        # Fall back to the notebook-level frame for backward compatibility.
        df = used_car_prices_df
    df.to_csv(output_path, index=False)

# Load step: write the notebook-level frame to combined_used_car_prices.csv.
convert_dataframe_to_csv()