# ETL

## Imports

In [2]:
import os
import re

from sqlalchemy import create_engine
import pandas as pd

## Environment variables

In [3]:
# The connection string is read from the DATABASE_URI environment variable.
# Example ways to set it before starting the kernel (POSIX shell / Windows cmd):
#   export DATABASE_URI='mysql+pymysql://root:root@0.0.0.0:3306/BIKES'
#   set DATABASE_URI='mysql+pymysql://root:root@0.0.0.0:3306/BIKES'
DATABASE_URI = os.getenv('DATABASE_URI')

# A missing or empty value stops the notebook here.
if not DATABASE_URI:
    raise ValueError('No DATABASE_URI variable was set')

## Database connection

In [13]:
engine = create_engine(DATABASE_URI)

# Open one connection so that a bad URI or an unreachable server fails here,
# then close it straight away: a bare `engine.connect()` returns a connection
# that stays checked out because nothing ever closes it.
with engine.connect():
    pass

<sqlalchemy.engine.base.Connection at 0x7feee06e2760>

## Functions

In [4]:
def to_upper(text: str) -> str:
    """Return ``text`` converted to upper case.

    ``None`` is passed through: the function returns ``None`` for it.
    """
    if text is None:
        return None
    return text.upper()

In [5]:
def to_lower(text: str) -> str:
    """Return ``text`` converted to lower case.

    ``None`` is passed through: the function returns ``None`` for it.
    """
    if text is None:
        return None
    return text.lower()

In [6]:
def id_to_captilize(text: str) -> str:
    """Rewrite the last ``ID`` found on the first line of ``text`` as ``Id``.

    The greedy ``.*`` prefix pushes the match to the right-most ``ID``, so a
    name such as ``BusinessEntityID`` becomes ``BusinessEntityId``. Text
    without ``ID`` is returned unchanged, and ``None`` yields ``None``.
    """
    if text is None:
        return None
    # The pattern is anchored at the start of the string, so at most one
    # substitution can happen.
    return re.sub(r'(^.*)ID', r'\1Id', text)

In [7]:
def to_snake_case(text: str) -> str:
    """Insert ``_`` before every upper-case ASCII letter except a leading one.

    Letter case is left untouched (``FirstName`` -> ``First_Name``), so each
    capital in a run gets its own underscore (``ID`` -> ``I_D``).
    ``None`` yields ``None``.
    """
    if text is None:
        return None
    # Zero-width match: any position that is not the start of the string and
    # is immediately followed by A-Z.
    return re.sub(r'(?<!^)(?=[A-Z])', '_', text)

# Person

### Loading dataset

In [6]:
# Read the semicolon-separated Person CSV and preview its first rows.
dataframe = pd.read_csv(filepath_or_buffer='../dataset/Person.Person.csv', sep=';')
dataframe.head()

Unnamed: 0,BusinessEntityID,PersonType,NameStyle,Title,FirstName,MiddleName,LastName,Suffix,EmailPromotion,AdditionalContactInfo,Demographics,rowguid,ModifiedDate
0,1,EM,0,,Ken,J,Sánchez,,0,,"<IndividualSurvey xmlns=""http://schemas.micros...",92C4279F-1207-48A3-8448-4636514EB7E2,2009-01-07 00:00:00.000
1,2,EM,0,,Terri,Lee,Duffy,,1,,"<IndividualSurvey xmlns=""http://schemas.micros...",D8763459-8AA8-47CC-AFF7-C9079AF79033,2008-01-24 00:00:00.000
2,3,EM,0,,Roberto,,Tamburello,,0,,"<IndividualSurvey xmlns=""http://schemas.micros...",E1A2555E-0828-434B-A33B-6F38136A37DE,2007-11-04 00:00:00.000
3,4,EM,0,,Rob,,Walters,,0,,"<IndividualSurvey xmlns=""http://schemas.micros...",F2D7CE06-38B3-4357-805B-F4B6B71C01FF,2007-11-28 00:00:00.000
4,5,EM,0,Ms.,Gail,A,Erickson,,0,,"<IndividualSurvey xmlns=""http://schemas.micros...",F3A3F6B4-AE3B-430C-A754-9F2231BA6FEF,2007-12-30 00:00:00.000


### Rename dataframe header

In [7]:
# Show the header names exactly as they were read from the CSV.
dataframe.columns

Index(['BusinessEntityID', 'PersonType', 'NameStyle', 'Title', 'FirstName',
       'MiddleName', 'LastName', 'Suffix', 'EmailPromotion',
       'AdditionalContactInfo', 'Demographics', 'rowguid', 'ModifiedDate'],
      dtype='object')

In [8]:
# Normalise every header in three steps: the last 'ID' becomes 'Id', an
# underscore goes before each inner capital letter, then the name is upper-cased.
dataframe.columns = [
    to_upper(to_snake_case(id_to_captilize(column_name)))
    for column_name in dataframe.columns
]

In [9]:
# Show the header names after the normalisation step.
dataframe.columns

Index(['BUSINESS_ENTITY_ID', 'PERSON_TYPE', 'NAME_STYLE', 'TITLE',
       'FIRST_NAME', 'MIDDLE_NAME', 'LAST_NAME', 'SUFFIX', 'EMAIL_PROMOTION',
       'ADDITIONAL_CONTACT_INFO', 'DEMOGRAPHICS', 'ROWGUID', 'MODIFIED_DATE'],
      dtype='object')

In [10]:
# Rename the key column BUSINESS_ENTITY_ID to PERSON_ID.
# The result is reassigned instead of using `inplace=True`, and `columns=`
# states the axis explicitly.
dataframe = dataframe.rename(columns={'BUSINESS_ENTITY_ID': 'PERSON_ID'})

### Fix TITLE column

In [11]:
# Count how often each TITLE value occurs, to spot inconsistent spellings.
dataframe['TITLE'].value_counts()

Mr.     577
Ms.     415
Sr.      11
Sra.      3
Mrs.      2
Ms        1
Name: TITLE, dtype: int64

In [12]:
# Normalise the 'Ms' spelling (no trailing period) to 'Ms.'.
# The result is assigned back to the column: calling
# `replace(..., inplace=True)` on `dataframe.TITLE` is a chained in-place
# update, which leaves `dataframe` unchanged when pandas Copy-on-Write is on.
dataframe['TITLE'] = dataframe['TITLE'].replace({'Ms': 'Ms.'})

In [13]:
# Count the TITLE values again to check the result of the replacement.
dataframe['TITLE'].value_counts()

Mr.     577
Ms.     416
Sr.      11
Sra.      3
Mrs.      2
Name: TITLE, dtype: int64

### Populate

In [None]:
# Append the rows to the PERSON table; the DataFrame index is not written.
dataframe.to_sql(name='PERSON', con=engine, if_exists='append', index=False)

### Select

In [15]:
# Sample query: at most three rows from PERSON. There is no ORDER BY, so
# which rows come back is up to the database.
query = """
SELECT *
FROM PERSON
LIMIT 3
"""

In [17]:
# Run the query through the engine and display the result as a DataFrame.
pd.read_sql(sql=query, con=engine)

Unnamed: 0,PERSON_ID,PERSON_TYPE,NAME_STYLE,TITLE,FIRST_NAME,MIDDLE_NAME,LAST_NAME,SUFFIX,EMAIL_PROMOTION,ADDITIONAL_CONTACT_INFO,DEMOGRAPHICS,ROWGUID,MODIFIED_DATE
0,1,EM,0,,Ken,J,Sánchez,,0,,"<IndividualSurvey xmlns=""http://schemas.micros...",92C4279F-1207-48A3-8448-4636514EB7E2,2009-01-07
1,2,EM,0,,Terri,Lee,Duffy,,1,,"<IndividualSurvey xmlns=""http://schemas.micros...",D8763459-8AA8-47CC-AFF7-C9079AF79033,2008-01-24
2,3,EM,0,,Roberto,,Tamburello,,0,,"<IndividualSurvey xmlns=""http://schemas.micros...",E1A2555E-0828-434B-A33B-6F38136A37DE,2007-11-04


# Customer

In [8]:
# Read the semicolon-separated Customer CSV and preview its first rows.
# Note: this rebinds `dataframe`, which held the Person data until now.
dataframe = pd.read_csv(filepath_or_buffer='../dataset/Sales.Customer.csv', sep=';')
dataframe.head()

Unnamed: 0,CustomerID,PersonID,StoreID,TerritoryID,AccountNumber,rowguid,ModifiedDate
0,1,,934.0,1,AW00000001,3F5AE95E-B87D-4AED-95B4-C3797AFCB74F,2014-09-12 11:15:07.263
1,2,,1028.0,1,AW00000002,E552F657-A9AF-4A7D-A645-C429D6E02491,2014-09-12 11:15:07.263
2,3,,642.0,4,AW00000003,130774B1-DB21-4EF3-98C8-C104BCD6ED6D,2014-09-12 11:15:07.263
3,4,,932.0,4,AW00000004,FF862851-1DAA-4044-BE7C-3E85583C054D,2014-09-12 11:15:07.263
4,5,,1026.0,4,AW00000005,83905BDC-6F5E-4F71-B162-C98DA069F38A,2014-09-12 11:15:07.263


### Rename dataframe header

In [9]:
# Show the header names exactly as they were read from the CSV.
dataframe.columns

Index(['CustomerID', 'PersonID', 'StoreID', 'TerritoryID', 'AccountNumber',
       'rowguid', 'ModifiedDate'],
      dtype='object')

In [10]:
# Normalise every header in three steps: the last 'ID' becomes 'Id', an
# underscore goes before each inner capital letter, then the name is upper-cased.
dataframe.columns = [
    to_upper(to_snake_case(id_to_captilize(column_name)))
    for column_name in dataframe.columns
]

In [11]:
# Show the header names after the normalisation step.
dataframe.columns

Index(['CUSTOMER_ID', 'PERSON_ID', 'STORE_ID', 'TERRITORY_ID',
       'ACCOUNT_NUMBER', 'ROWGUID', 'MODIFIED_DATE'],
      dtype='object')

### Populate

In [14]:
# Append the rows to the CUSTOMER table; the DataFrame index is not written.
dataframe.to_sql(name='CUSTOMER', con=engine, if_exists='append', index=False)

In [18]:
# Sample query: at most three rows from CUSTOMER. There is no ORDER BY, so
# which rows come back is up to the database.
query = """
SELECT *
FROM CUSTOMER
LIMIT 3
"""

In [19]:
# Run the query through the engine and display the result as a DataFrame.
pd.read_sql(sql=query, con=engine)

Unnamed: 0,CUSTOMER_ID,PERSON_ID,STORE_ID,TERRITORY_ID,ACCOUNT_NUMBER,ROWGUID,MODIFIED_DATE
0,1,,934,1,AW00000001,3F5AE95E-B87D-4AED-95B4-C3797AFCB74F,2014-09-12 11:15:07
1,2,,1028,1,AW00000002,E552F657-A9AF-4A7D-A645-C429D6E02491,2014-09-12 11:15:07
2,3,,642,4,AW00000003,130774B1-DB21-4EF3-98C8-C104BCD6ED6D,2014-09-12 11:15:07
