# ETL

In [None]:
import glob                          # this module helps in selecting files
import pandas as pd                  # this module helps in processing CSV files
import xml.etree.ElementTree as Et   # this module helps in processing XML files.
import urllib.request                # to download the data set
from datetime import datetime


In [131]:
# Fetch the lab's source archive and store it in the working directory as source.zip.
# NOTE(review): nothing in this notebook unpacks the archive, yet later cells read
# from a source/ folder - presumably it was extracted by hand; confirm.
source_url = (
    'https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/'
    'IBMDeveloperSkillsNetwork-PY0221EN-SkillsNetwork/labs/module%206/'
    'Lab%20-%20Extract%20Transform%20Load/data/source.zip'
)
urllib.request.urlretrieve(source_url, 'source.zip')

('source.zip', <http.client.HTTPMessage at 0x23eaf62b220>)

A TMP file is a temporary backup, cache, or other data file created automatically by a software program. It is sometimes created as an invisible file and is often deleted when the program is closed. TMP files may also be created to contain information temporarily while a new file is being generated.

In [132]:
# Working files used by the ETL run
tmpfile, logfile, targetfile = (
    "temp.tmp",              # file used to store all extracted data
    "logfile.txt",           # all event logs will be stored in this file
    "transformed_data.csv",  # file where transformed data is stored
)


### Extract

#### CSV Extract

In [133]:
def extract_from_csv(file_to_process):
    """Read one CSV source into a DataFrame.

    Parameters
    ----------
    file_to_process : path or file-like object
        Passed straight to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed file, with the columns the file itself declares.
    """
    return pd.read_csv(file_to_process)


#### JSON Extract

In [134]:
def extract_from_json(file_to_process):
    """Read one line-delimited JSON source into a DataFrame.

    Parameters
    ----------
    file_to_process : path or file-like object
        Passed to ``pd.read_json`` with ``lines=True``, i.e. the file is read
        as one JSON object per line.

    Returns
    -------
    pandas.DataFrame
        One row per JSON line.
    """
    return pd.read_json(file_to_process, lines=True)

#### XML Extract

In [135]:
def extract_from_xml(file_to_process):
    """Parse an XML source into a DataFrame of name / height / weight.

    Every direct child of the document root is read as one record: the text
    of its ``name`` child is kept as-is, and the text of its ``height`` and
    ``weight`` children is converted to ``float``.

    Parameters
    ----------
    file_to_process : path or file object
        Anything accepted by ``xml.etree.ElementTree.parse``.

    Returns
    -------
    pandas.DataFrame
        Columns ``name``, ``height``, ``weight``; one row per child of the
        root. A root without children yields an empty frame with the same
        three columns.

    Raises
    ------
    AttributeError
        If a record lacks a ``name``, ``height`` or ``weight`` child.
    ValueError
        If a ``height`` or ``weight`` text cannot be parsed as a float.
    """
    columns = ['name', 'height', 'weight']
    root = Et.parse(file_to_process).getroot()

    # Collect plain records and build the frame once: DataFrame.append was
    # removed in pandas 2.0 and re-copied the whole frame on every call.
    records = [
        (
            person.find("name").text,
            float(person.find("height").text),
            float(person.find("weight").text),
        )
        for person in root  # each child node of the root is one person
    ]
    return pd.DataFrame(records, columns=columns)



#### Preparation (same goes for xml & json files)

In [136]:
# Build a datetime for the current local moment and show it
x = datetime.now()
print(f"{x}")

2022-03-15 16:39:58.810562


In [137]:
#display the name of month
x = datetime(year=2020, month=8, day=1)

# "%B" formats the full month name; see https://www.w3schools.com/python/python_datetime.asp
print(f"{x:%B}")

August


In [138]:
# Collect the paths of all CSV files inside the source folder
list_csv = list(glob.iglob("source/*.csv"))
list_csv

['source\\source1.csv', 'source\\source2.csv', 'source\\source3.csv']

In [139]:
#iterating
csv_pattern = 'source/*.csv'
for csvfile in glob.iglob(csv_pattern):  # one path per matching CSV file
    print(csvfile)

source\source1.csv
source\source2.csv
source\source3.csv


In [140]:
#appending all the csvs
# Read every CSV through extract_from_csv and concatenate once. DataFrame.append
# was removed in pandas 2.0 and re-copied the accumulated frame for every file.
csv_frames = [extract_from_csv(csvfile) for csvfile in glob.glob('source/*.csv')]
if csv_frames:
    extracted_dt = pd.concat(csv_frames, ignore_index=True)
else:
    # no CSV found: fall back to an empty frame with the three expected columns
    extracted_dt = pd.DataFrame(columns=['name','height','weight'])
extracted_dt

Unnamed: 0,name,height,weight
0,akshay,65.78,112.99
1,ajay,71.52,136.49
2,alice,69.4,153.03
3,ravi,68.22,142.34
4,joe,67.79,144.3
5,akshay,65.78,112.99
6,ajay,71.52,136.49
7,alice,69.4,153.03
8,ravi,68.22,142.34
9,joe,67.79,144.3


In [141]:
#alternative (appending all csvs)
# Same result as going through extract_from_csv, but calling pd.read_csv directly.
# The frames are concatenated once; DataFrame.append was removed in pandas 2.0.
csv_frames = [pd.read_csv(csvfile) for csvfile in glob.glob('source/*.csv')]
if csv_frames:
    extracted_dt = pd.concat(csv_frames, ignore_index=True)
else:
    # no CSV found: fall back to an empty frame with the three expected columns
    extracted_dt = pd.DataFrame(columns=['name','height','weight'])
extracted_dt

Unnamed: 0,name,height,weight
0,akshay,65.78,112.99
1,ajay,71.52,136.49
2,alice,69.4,153.03
3,ravi,68.22,142.34
4,joe,67.79,144.3
5,akshay,65.78,112.99
6,ajay,71.52,136.49
7,alice,69.4,153.03
8,ravi,68.22,142.34
9,joe,67.79,144.3


#### Extract/composite functions

In [142]:
def extract():
    """Gather every CSV, JSON and XML file under ``source/`` into one DataFrame.

    Files are read in the order CSV, JSON, XML, each with its matching
    ``extract_from_*`` helper, and stacked with a fresh 0..n-1 index.

    Returns
    -------
    pandas.DataFrame
        Columns ``name``, ``height``, ``weight`` first, followed by any other
        column a source file happened to contain. When no file matches, an
        empty frame with the three expected columns is returned.
    """
    columns = ['name', 'height', 'weight']
    readers = (
        ("source/*.csv", extract_from_csv),
        ("source/*.json", extract_from_json),
        ("source/*.xml", extract_from_xml),
    )

    # Read everything first and concatenate once: DataFrame.append was removed
    # in pandas 2.0 and re-copied the accumulated frame for every file.
    frames = [
        reader(path)
        for pattern, reader in readers
        for path in glob.glob(pattern)
    ]
    if not frames:
        return pd.DataFrame(columns=columns)

    combined = pd.concat(frames, ignore_index=True)
    # Keep the expected columns in front (and present even if no file had them),
    # as they were when rows were appended to a pre-declared empty frame.
    extras = [col for col in combined.columns if col not in columns]
    return combined.reindex(columns=columns + extras)
            

#### Transform

The transform function does the following tasks.

1.  Convert height, which is in inches, to meters
2.  Convert weight, which is in pounds, to kilograms

In [143]:
def transform(data):
    """Convert heights to meters and weights to kilograms.

    The input frame is left untouched and a converted copy is returned, so
    running the transform cell again does not convert the data a second time.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``height`` (inches) and ``weight`` (pounds) columns whose
        values can be cast to ``float``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``data`` with ``height`` in meters and ``weight`` in kilograms,
        both rounded to two decimals. Other columns are carried over unchanged.
    """
    inches_to_meters = 0.0254          # one inch is 0.0254 meters
    pounds_to_kilograms = 0.45359237   # one pound is 0.45359237 kilograms

    # assign() builds a new frame instead of writing into the caller's one
    return data.assign(
        height=(data['height'].astype(float) * inches_to_meters).round(2),
        weight=(data['weight'].astype(float) * pounds_to_kilograms).round(2),
    )

#### Loading

In [144]:
# Persist the transformed data: the frame arrives through data_to_load
def load(targetfile, data_to_load):
    """Write ``data_to_load`` to ``targetfile`` in CSV format.

    The frame's index is written as the first, unnamed column (``to_csv``
    default).
    """
    data_to_load.to_csv(path_or_buf=targetfile)

#### Logging

In [145]:
def log(message, logfile="logfile.txt"):
    """Append one timestamped line to the log file.

    Each line has the form ``<timestamp>,<message>``, where the timestamp is
    the current local time formatted as Year-Monthname-Day-Hour:Minute:Second.

    Parameters
    ----------
    message : str
        Text to record.
    logfile : str, optional
        Path of the log file; defaults to ``"logfile.txt"``. The file is opened
        in append mode, so earlier entries are kept.
    """
    # %b (abbreviated month name) is the documented equivalent of the
    # platform-specific %h directive.
    timestamp_format = '%Y-%b-%d-%H:%M:%S'
    timestamp = datetime.now().strftime(timestamp_format)
    with open(logfile, "a") as f:
        f.write(timestamp + ',' + message + '\n')

#### Running ETL Process

In [146]:
# Record the start of the whole ETL run in the log file
log("ETL Job Started")

In [147]:
#Extract Phase process
# Log, gather all source files into one DataFrame via extract(), then log again.
# NOTE(review): the opening message reads "Extracted" while the closing one reads "Extract".
log("Extracted Phase Started")
extracted_data = extract()
log("Extract Phase Ended")



In [148]:
#Transform Phase
# Log, convert the extracted data with transform(), then log again.
log("Transform Phase Started")
transformed_data = transform(extracted_data)
log("Transform Phase Ended")


In [149]:
#loading phase
# Log, write the transformed data to the CSV named by targetfile, then log again.
log("Load phase Started")
load(targetfile,transformed_data)
log("Load phase Ended")

In [150]:
# Record the end of the whole ETL run in the log file
log("ETL Job Ended")

In [151]:
# Read the transformed_data file back to check what the load step wrote.
# index_col=[0] uses the first CSV column (the index saved by to_csv) as the
# index, so no extra unnamed column appears.
df = pd.read_csv("transformed_data.csv", index_col=[0])
print(df) 


      name  height  weight
0   akshay    1.67   51.25
1     ajay    1.82   61.91
2    alice    1.76   69.41
3     ravi    1.73   64.56
4      joe    1.72   65.45
5   akshay    1.67   51.25
6     ajay    1.82   61.91
7    alice    1.76   69.41
8     ravi    1.73   64.56
9      joe    1.72   65.45
10  akshay    1.67   51.25
11    ajay    1.82   61.91
12   alice    1.76   69.41
13    ravi    1.73   64.56
14     joe    1.72   65.45
15    jack    1.74   55.93
16     tom    1.77   64.18
17   tracy    1.78   61.90
18    john    1.72   50.97
19    jack    1.74   55.93
20     tom    1.77   64.18
21   tracy    1.78   61.90
22    john    1.72   50.97
23    jack    1.74   55.93
24     tom    1.77   64.18
25   tracy    1.78   61.90
26    john    1.72   50.97
27   simon    1.72   50.97
28   jacob    1.70   54.73
29   cindy    1.69   57.81
30    ivan    1.72   51.77
31   simon    1.72   50.97
32   jacob    1.70   54.73
33   cindy    1.69   57.81
34    ivan    1.72   51.77
35   simon    1.72   50.97
3