In this lab, used-car price data from a car dealership, stored in several file formats (CSV, JSON and XML), is extracted, transformed and loaded into a MySQL database. Basic analysis is then performed on the data in the MySQL database.

In [21]:
import glob                         # this module helps in selecting files 
import json
import os
import urllib.parse
import urllib.request
import xml.etree.ElementTree as ET  # this module helps in processing XML files.
from datetime import datetime

import boto3
import mysql.connector as sql
import pandas as pd                 # this module helps in processing CSV files
import pymysql
import wget
from mysql.connector import Error
from sqlalchemy import create_engine

In [None]:
# Download the zipped dealership data from S3 into data/ using the AWS CLI
!aws s3 cp s3://wysde-datasets/cars/datasource.zip data/datasource.zip

In [3]:
# Unpack the archive into data/dealership_data. -o overwrites files from a
# previous run without prompting, so the cell can be re-run non-interactively.
!cd data && unzip -o datasource.zip -d dealership_data

Archive:  datasource.zip
  inflating: dealership_data/used_car_prices1.csv  
  inflating: dealership_data/used_car_prices2.csv  
  inflating: dealership_data/used_car_prices3.csv  
  inflating: dealership_data/used_car_prices1.json  
  inflating: dealership_data/used_car_prices2.json  
  inflating: dealership_data/used_car_prices3.json  
  inflating: dealership_data/used_car_prices1.xml  
  inflating: dealership_data/used_car_prices2.xml  
  inflating: dealership_data/used_car_prices3.xml  


In [20]:
# Directory the archive is unzipped into; extract() globs it for *.csv, *.json and *.xml files
DATA_PATH = "./data/dealership_data"

In [24]:
#### CSV extract function
def extract_from_csv(file_to_process):
    """Read one CSV file into a DataFrame using pandas' default parsing options."""
    return pd.read_csv(file_to_process)

#### JSON extract function
def extract_from_json(file_to_process):
    """Read one JSON Lines file (one JSON object per line) into a DataFrame."""
    return pd.read_json(file_to_process, lines=True)

#### XML extract function
def extract_from_xml(file_to_process):
    """Parse one XML file of car records into a DataFrame.

    Each direct child of the root element is read as one car and must contain
    ``car_model``, ``year_of_manufacture``, ``price`` and ``fuel`` child
    elements.

    Args:
        file_to_process: Path or file object accepted by ``ET.parse``.

    Returns:
        DataFrame with the columns car_model, year_of_manufacture (converted
        with ``int``), price (converted with ``float``) and fuel, one row per
        record. Empty, but with those four columns, if the root has no children.

    Raises:
        AttributeError: If a record lacks one of the four child elements.
        ValueError / TypeError: If the year or price text cannot be converted.
    """
    columns = ["car_model", "year_of_manufacture", "price", "fuel"]
    root = ET.parse(file_to_process).getroot()

    # Build plain records and create the frame once: DataFrame.append was
    # removed in pandas 2.0, and appending row by row copies the frame each time.
    records = [
        {
            "car_model": car.find("car_model").text,
            "year_of_manufacture": int(car.find("year_of_manufacture").text),
            "price": float(car.find("price").text),
            "fuel": car.find("fuel").text,
        }
        for car in root
    ]
    return pd.DataFrame(records, columns=columns)

#### Extraction and Joining of data into one file
def extract():
    """Read every CSV, JSON and XML file under ``DATA_PATH`` into one DataFrame.

    Files are visited per format (CSV, then JSON, then XML) and in sorted
    filename order within a format, so the row order does not depend on the
    order in which the filesystem happens to list them.

    Returns:
        A DataFrame with a fresh 0..n-1 index whose leading columns are
        car_model, year_of_manufacture, price and fuel; any other columns
        found in the files follow. Empty (columns only) when no file matches.
    """
    expected_columns = ['car_model', 'year_of_manufacture', 'price', 'fuel']
    extractors = (
        ("*.csv", extract_from_csv),
        ("*.json", extract_from_json),
        ("*.xml", extract_from_xml),
    )

    # Collect the per-file frames and concatenate once: DataFrame.append was
    # removed in pandas 2.0, and appending in a loop re-copies all rows each time.
    frames = [
        extractor(path)
        for pattern, extractor in extractors
        for path in sorted(glob.glob(os.path.join(DATA_PATH, pattern)))
    ]
    if not frames:
        return pd.DataFrame(columns=expected_columns)

    combined = pd.concat(frames, ignore_index=True)
    # Keep the four expected columns first (filled with NaN if no file had
    # them) so the schema is stable regardless of the files' column order.
    extra_columns = [col for col in combined.columns if col not in expected_columns]
    return combined.reindex(columns=expected_columns + extra_columns)

In [16]:
def transform(data):
    """Return a copy of ``data`` with the ``price`` column rounded to 2 decimals.

    The input frame is left untouched, so the extracted data still holds the
    raw prices after this runs and the cell can be re-run safely.

    Args:
        data: DataFrame with a numeric ``price`` column.

    Returns:
        A new DataFrame with the same columns and index as ``data``.
    """
    return data.assign(price=data["price"].round(2))

In [5]:
def get_secret(secret_name, region_name="us-east-1"):
    """Fetch a secret from AWS Secrets Manager and return it parsed from JSON.

    Args:
        secret_name: Passed as ``SecretId`` to ``get_secret_value``.
        region_name: AWS region of the Secrets Manager client.

    Returns:
        The result of ``json.loads`` on the response's ``SecretString``.
    """
    secrets_client = boto3.session.Session().client(
        service_name='secretsmanager',
        region_name=region_name,
    )
    response = secrets_client.get_secret_value(SecretId=secret_name)
    return json.loads(response['SecretString'])

In [12]:
# Database credentials come from the "wysde" secret rather than being hard-coded.
creds = get_secret("wysde")
USERNAME = creds["RDS_MYSQL_USERNAME"]
PASSWORD = creds["RDS_MYSQL_PASSWORD"]
HOST = creds["RDS_MYSQL_HOST"]
DATABASE = "sparsh"

port=3306
# URL-escape the user and password: characters such as '@', ':' or '/' in a
# password would otherwise be parsed as URL delimiters by SQLAlchemy.
connection_string = "mysql+pymysql://%s:%s@%s:%s/%s" % (
    urllib.parse.quote_plus(USERNAME),
    urllib.parse.quote_plus(PASSWORD),
    HOST,
    port,
    DATABASE,
)

In [27]:
def connect_to_db():
    """Drop and recreate the empty ``used_car_prices`` table.

    Despite its name, this function does not return a connection: it opens a
    short-lived one to ``DATABASE`` on ``HOST``, resets the table, and closes
    the connection again. Any rows already in ``used_car_prices`` are lost.
    """
    conn = sql.connect(host=HOST, user=USERNAME, passwd=PASSWORD, db=DATABASE, charset='utf8mb4')
    try:
        con_cursor = conn.cursor()
        try:
            con_cursor.execute(f'use {DATABASE}')
            con_cursor.execute('drop table if exists used_car_prices')
            con_cursor.execute('''create table used_car_prices(car_model varchar(255) null,year_of_manufacture int null,
                            price double null,fuel varchar(255) null)
                            ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci ROW_FORMAT=COMPRESSED''')
        finally:
            con_cursor.close()
    finally:
        # Release the connection even when one of the statements fails.
        conn.close()

def load(data_to_load):
    """Replace the contents of ``used_car_prices`` with ``data_to_load``.

    The table is dropped and recreated first (via ``connect_to_db``), then the
    frame's rows are appended through a SQLAlchemy engine; the DataFrame index
    is not written.

    Args:
        data_to_load: DataFrame whose columns match the ``used_car_prices`` table.
    """
    connect_to_db()
    # Creating an engine to quickly push data to database
    engine = create_engine(connection_string)
    try:
        data_to_load.to_sql(name='used_car_prices',con=engine,if_exists='append',index=False)
    finally:
        # Close the engine's pooled connections instead of leaking them on every call.
        engine.dispose()

In [25]:
# Run the extract and transform stages, logging the start and end of each phase.
# The load stage is run in a later cell.
print("ETL Job Started")
print("Extract phase Started")
extracted_data = extract()
print("Extract phase Ended")
print("Transform phase Started")
transformed_data = transform(extracted_data)

ETL Job Started
Extract phase Started
Extract phase Ended
Transform phase Started


In [26]:
# Preview the first rows of the transformed data before loading it into MySQL
transformed_data.head()

Unnamed: 0,car_model,year_of_manufacture,price,fuel
0,ritz,2014,5000.0,Petrol
1,sx4,2013,7089.55,Diesel
2,ciaz,2017,10820.9,Petrol
3,wagon r,2011,4253.73,Petrol
4,swift,2014,6865.67,Diesel


In [28]:
# Close out the transform phase in the log, then load the transformed data.
# load() drops and recreates used_car_prices first, so re-running this cell
# replaces the table's rows rather than duplicating them.
print("Transform phase Ended")
print("Load phase Started")
load(transformed_data)
print("Load phase Ended")

Transform phase Ended
Load phase Started
Load phase Ended


In [29]:
# Open a dedicated connection for the analysis queries that follow
conn = sql.connect(
    host=HOST,
    user=USERNAME,
    passwd=PASSWORD,
    db=DATABASE,
    charset='utf8mb4',
)

In [30]:
# Average price per year of manufacture, rounded to 2 decimal places, highest first
pd.read_sql_query("""select year_of_manufacture,round(avg(price),2) yearly_avg_price 
                  from used_car_prices group by 1 order by round(avg(price),2) desc""",conn)

Unnamed: 0,year_of_manufacture,yearly_avg_price
0,2017,20315.09
1,2010,14639.3
2,2018,13805.97
3,2015,12126.87
4,2016,12113.8
5,2012,11835.82
6,2014,9667.91
7,2013,8922.06
8,2011,5046.64
9,2005,4656.72


In [31]:
# Top 5 most expensive car models by average price
pd.read_sql_query("""select car_model,round(avg(price) ,2) average_price
                    from used_car_prices group by 1
                  order by round(avg(price) ,2) desc limit 5""",conn)

Unnamed: 0,car_model,average_price
0,land cruiser,52238.81
1,fortuner,28671.64
2,innova,19773.63
3,vitara brezza,13805.97
4,ciaz,11152.57


In [32]:
# Average car price per fuel type, rounded to 2 decimal places, highest first.
# No LIMIT here: every fuel type should be listed, however many there are.
pd.read_sql_query("""select fuel,round(avg(price) ,2) average_price
                    from used_car_prices group by 1
                  order by round(avg(price) ,2) desc""",conn)

Unnamed: 0,fuel,average_price
0,Diesel,16826.7
1,Petrol,6760.91
2,CNG,4626.87


In [None]:
# Close the MySQL connection used for the analysis queries
conn.close()