# ETL

## Imports

In [1]:
import re

from sqlalchemy import create_engine
import pandas as pd

## Functions

In [2]:
def to_upper(text: str) -> str:
    """To upper case non null text"""
    if text is not None:
        return text.upper()

In [3]:
def to_lower(text: str) -> str:
    """To lower case non null text"""
    if text is not None:
        return text.lower()

In [4]:
def id_to_captilize(text: str) -> str:
    """Replace string non null text that ends with ID to Id"""
    if text is not None:
        return re.sub(r'(^.*)ID', r'\1Id', text)

In [5]:
def to_snake_case(text: str) -> str:
    """To snake case non null text"""
    if text is not None:
        return re.sub(r'(?<!^)(?=[A-Z])', '_', text)

# Person

## Loading dataset

In [6]:
dataframe = pd.read_csv('../dataset/Person.Person.csv', sep=';')
dataframe.head()

Unnamed: 0,BusinessEntityID,PersonType,NameStyle,Title,FirstName,MiddleName,LastName,Suffix,EmailPromotion,AdditionalContactInfo,Demographics,rowguid,ModifiedDate
0,1,EM,0,,Ken,J,Sánchez,,0,,"<IndividualSurvey xmlns=""http://schemas.micros...",92C4279F-1207-48A3-8448-4636514EB7E2,2009-01-07 00:00:00.000
1,2,EM,0,,Terri,Lee,Duffy,,1,,"<IndividualSurvey xmlns=""http://schemas.micros...",D8763459-8AA8-47CC-AFF7-C9079AF79033,2008-01-24 00:00:00.000
2,3,EM,0,,Roberto,,Tamburello,,0,,"<IndividualSurvey xmlns=""http://schemas.micros...",E1A2555E-0828-434B-A33B-6F38136A37DE,2007-11-04 00:00:00.000
3,4,EM,0,,Rob,,Walters,,0,,"<IndividualSurvey xmlns=""http://schemas.micros...",F2D7CE06-38B3-4357-805B-F4B6B71C01FF,2007-11-28 00:00:00.000
4,5,EM,0,Ms.,Gail,A,Erickson,,0,,"<IndividualSurvey xmlns=""http://schemas.micros...",F3A3F6B4-AE3B-430C-A754-9F2231BA6FEF,2007-12-30 00:00:00.000


## Rename dataframe header

In [7]:
dataframe.columns

Index(['BusinessEntityID', 'PersonType', 'NameStyle', 'Title', 'FirstName',
       'MiddleName', 'LastName', 'Suffix', 'EmailPromotion',
       'AdditionalContactInfo', 'Demographics', 'rowguid', 'ModifiedDate'],
      dtype='object')

In [8]:
dataframe.columns = [to_upper((to_snake_case(id_to_captilize(column)))) for column in dataframe.columns]

In [9]:
dataframe.columns

Index(['BUSINESS_ENTITY_ID', 'PERSON_TYPE', 'NAME_STYLE', 'TITLE',
       'FIRST_NAME', 'MIDDLE_NAME', 'LAST_NAME', 'SUFFIX', 'EMAIL_PROMOTION',
       'ADDITIONAL_CONTACT_INFO', 'DEMOGRAPHICS', 'ROWGUID', 'MODIFIED_DATE'],
      dtype='object')

In [10]:
dataframe.rename({'BUSINESS_ENTITY_ID': 'PERSON_ID'}, axis=1, inplace=True)

## Fix TITLE column

In [11]:
dataframe.TITLE.value_counts()

Mr.     577
Ms.     415
Sr.      11
Sra.      3
Mrs.      2
Ms        1
Name: TITLE, dtype: int64

In [12]:
dataframe.TITLE.replace({'Ms': 'Ms.'}, inplace=True)

In [13]:
dataframe.TITLE.value_counts()

Mr.     577
Ms.     416
Sr.      11
Sra.      3
Mrs.      2
Name: TITLE, dtype: int64

## Database connection

In [14]:
engine = create_engine('mysql+pymysql://root:root@0.0.0.0:3306/BIKES')
engine.connect()

<sqlalchemy.engine.base.Connection at 0x7f3dba94d310>

## Populate

In [15]:
dataframe.to_sql('PERSON', engine, if_exists='append', index=False)

## Select

In [16]:
query = """
SELECT *
FROM PERSON
"""

In [17]:
pd.read_sql(query, engine).head(3)

Unnamed: 0,PERSON_ID,PERSON_TYPE,NAME_STYLE,TITLE,FIRST_NAME,MIDDLE_NAME,LAST_NAME,SUFFIX,EMAIL_PROMOTION,ADDITIONAL_CONTACT_INFO,DEMOGRAPHICS,ROWGUID,MODIFIED_DATE
0,1,EM,0,,Ken,J,Sánchez,,0,,"<IndividualSurvey xmlns=""http://schemas.micros...",92C4279F-1207-48A3-8448-4636514EB7E2,2009-01-07
1,2,EM,0,,Terri,Lee,Duffy,,1,,"<IndividualSurvey xmlns=""http://schemas.micros...",D8763459-8AA8-47CC-AFF7-C9079AF79033,2008-01-24
2,3,EM,0,,Roberto,,Tamburello,,0,,"<IndividualSurvey xmlns=""http://schemas.micros...",E1A2555E-0828-434B-A33B-6F38136A37DE,2007-11-04


# Customer