In [1]:
from urllib.parse import quote

import numpy as np
import pandas as pd
import yaml
from sqlalchemy import create_engine
from sqlalchemy import inspect

from data_cleaning import DataCleaning

class DataExtractor:
    '''Helper that connects to a PostgreSQL database and reads tables into pandas.

    The SQLAlchemy engine is stored on the instance: it starts as None and is set
    by `init_db_engine`, so that method must be called before `list_db_tables`
    or `read_rds_table`.
    '''

    def __init__(self):
        # Set by init_db_engine(); None until a connection has been configured.
        self.engine = None

    def read_db_creds(self, filename: str) -> dict:
        '''Read database credentials from a YAML file.

        Parameters
        ----------
        filename : str
            Path of the YAML file holding the database credentials.

        Returns
        -------
            Whatever `yaml.safe_load` parses from the file (expected to be a
            dictionary of credentials).

        '''
        with open(filename, 'r') as f:
            creds = yaml.safe_load(f)

        return creds

    def init_db_engine(self, creds: dict):
        '''Create a PostgreSQL engine from the given credentials and store it on the instance.

        Parameters
        ----------
        creds : dict
            Connection settings. The following keys are read:
            `RDS_USER`, `RDS_PASSWORD`, `RDS_HOST`, `RDS_PORT`, `RDS_DATABASE`.

        Returns
        -------
            The SQLAlchemy engine created for the resulting URL (also kept in
            `self.engine`).

        Raises
        ------
        KeyError
            If any of the keys listed above is missing from `creds`.

        '''
        # Percent-encode user and password so characters such as '@', ':', '/',
        # '+' or spaces cannot be mistaken for URL delimiters. str() covers
        # values that YAML parsed as numbers. Plain alphanumeric credentials
        # are left unchanged by the encoding.
        user = quote(str(creds['RDS_USER']), safe='')
        password = quote(str(creds['RDS_PASSWORD']), safe='')
        url = f"postgresql://{user}:{password}@{creds['RDS_HOST']}:{creds['RDS_PORT']}/{creds['RDS_DATABASE']}"
        self.engine = create_engine(url)

        return self.engine

    def list_db_tables(self) -> list:
        '''Print and return the table names visible through the current engine.

        Returns
        -------
            The list produced by the inspector's `get_table_names()`. It is
            printed as before and additionally returned so callers can keep it.

        '''
        inspector = inspect(self.engine)
        table_names = inspector.get_table_names()
        print(table_names)

        return table_names

    def read_rds_table(self, table_name: str) -> pd.DataFrame:
        '''Read a whole database table into a pandas DataFrame.

        Parameters
        ----------
        table_name : str
            The name of the table in the database that you want to read.

        Returns
        -------
            A pandas DataFrame with the contents of the requested table.

        '''
        # The connection is opened only for the duration of the read and is
        # closed again when the with-block exits.
        with self.engine.connect() as conn:
            df = pd.read_sql_table(table_name, con=conn)

        return df

# Notebook entry point: load credentials, connect, and pull the raw users table.
# The names bound here (database_extractor, creds_dict, db_engine, user_table)
# become notebook globals; later cells read and rebind user_table.
if __name__ == '__main__':
    database_extractor = DataExtractor()
    # Credentials are read from a YAML file in the working directory.
    creds_dict = database_extractor.read_db_creds("db_creds.yaml")
    db_engine = database_extractor.init_db_engine(creds_dict)
    # Optional: print the tables available in the database.
    #table_list = database_extractor.list_db_tables()
    user_table = database_extractor.read_rds_table("legacy_users")
    # Show which columns the freshly loaded table has.
    print(user_table.columns)

Index(['index', 'first_name', 'last_name', 'date_of_birth', 'company',
       'email_address', 'address', 'country', 'country_code', 'phone_number',
       'join_date', 'user_uuid'],
      dtype='object')


In [6]:
# Summarise the frame: index range, per-column non-null counts and dtypes.
# NOTE(review): the saved output lists 11 columns with no `index` column, whereas
# the load cell printed 12 — this cell appears to have been run after the cleaning
# cell (execution counts are out of order); confirm with Restart & Run All.
user_table.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 15320 entries, 0 to 1249
Data columns (total 11 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   first_name     15320 non-null  object
 1   last_name      15320 non-null  object
 2   date_of_birth  15320 non-null  object
 3   company        15320 non-null  object
 4   email_address  15320 non-null  object
 5   address        15320 non-null  object
 6   country        15320 non-null  object
 7   country_code   15320 non-null  object
 8   phone_number   15320 non-null  object
 9   join_date      15320 non-null  object
 10  user_uuid      15320 non-null  object
dtypes: object(11)
memory usage: 1.4+ MB


In [2]:
# Run the project's DataCleaning helper over the users table.
# NOTE(review): `user_table` is rebound to the cleaned result, so re-running this
# cell passes already-cleaned data back in — confirm clean_user_data tolerates that.
data_cleaner = DataCleaning()
user_table = data_cleaner.clean_user_data(user_table)

In [5]:
# List the distinct join_date values to check how dates are formatted.
user_table["join_date"].unique()

array(['2018-10-10', '2001-12-20', '2016-12-16', ..., '2016-04-15',
       '2021-03-07', '2015-08-28'], dtype=object)

In [16]:
# Eyeball a random handful of rows; no random_state is given, so the rows shown
# differ from run to run.
#user_table.set_index("index", inplace=True)
user_table.sample(n=8)

Unnamed: 0_level_0,first_name,last_name,date_of_birth,company,email_address,address,country,country_code,phone_number,join_date,user_uuid
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
3405,Heinz-Willi,Schinke,2004-01-01,Gieß AG & Co. KGaA,karl-augustmatthaei@schueler.de,"Schuchhardtplatz 3/9, 58699 Bogen",Germany,DE,49136585690,2002-04-01,5a57b0c9-7411-4ad6-806e-becbb1cc8318
6445,Edeltraut,Riehl,1979-08-13,Hartung Atzler GmbH,korbinian84@pruschke.de,"Gretel-Hentschel-Platz 3/0, 23554 Wernigerode",Germany,DE,497590950165,2018-02-15,3ad79aa6-8c5f-493c-be47-0595eec2695a
7563,Birte,Höfig,1995-01-15,Schäfer GmbH & Co. KGaA,amielcarek@thies.net,"Gutknechtallee 532, 78970 Osterburg",Germany,DE,493265418030,1996-02-09,8ed35d68-04b7-4b64-bb10-2f4d34aa8469
7888,Cathrin,Stiebitz,1983-04-13,Hiller,christlknappe@heidrich.net,"Laszlo-Etzold-Ring 5/5, 72362 Schongau",Germany,DE,49998302017,2016-09-11,c7cd819a-bbff-49cd-80c9-483feb6041f1
6677,Joyce,Walsh,1973-09-07,Pearce-Wells,woodfiona@lee.org,"Flat 9, Miah summit, West Hayley, S4 3XW",United Kingdom,GB,441914960737,2008-11-10,7bf9b15d-c8fd-470b-86be-2d7487e29c44
3404,Emilie,Stahr,1941-07-14,Holt,jdoehn@schweitzer.de,"Stjepan-Preiß-Allee 0/5, 60361 Gerolzhofen",Germany,DE,499691167228,2005-10-20,5dd255d1-82bc-4628-8e8e-79ad1351dae5
14674,Debra,Townsend,1989-02-03,Wall-James,naomi47@brown.com,"5 Ian spur, Suttonton, HU9W 0RS",United Kingdom,GB,442920180156,1999-02-07,d83a65aa-3e5e-448e-8d62-63068a24a307
10433,Larissa,Kraushaar,1962-12-09,Schlosser KG,gspiess@mueller.de,"Thiesstr. 15, 97488 Bützow",Germany,DE,49366998195,2010-12-11,f7c5e6b6-43c8-40ea-8cbe-4e68c2d83f3f


In [9]:
# GB users whose phone number still contains a lowercase letter (e.g. an "x" extension).
# The country test must use country_code: join_date holds date strings and can
# never equal "GB", which made the original filter always return an empty frame.
user_table[user_table["phone_number"].str.contains("[a-z]", regex=True) & (user_table["country_code"] == "GB")]

Unnamed: 0_level_0,first_name,last_name,date_of_birth,company,email_address,address,country,country_code,phone_number,join_date,user_uuid
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1


In [10]:
# Inspect a single record. iloc is positional: this is the row at position 3187,
# not the row labelled 3187 (the saved output shows its index label as 3200).
user_table.iloc[3187]

first_name                                             Chelsea
last_name                                               Murphy
date_of_birth                                       1940-04-28
company                                           Read-Harding
email_address                              jadepowell@bird.com
address          Flat 3, Preston curve, North Lindsey, ML9 6PU
country                                         United Kingdom
country_code                                                GB
phone_number                                    00449098790034
join_date                                           2017-07-17
user_uuid                 ad783032-0a1f-4632-8f33-b808d5918f88
Name: 3200, dtype: object

In [6]:
# Scan phone numbers for characters that should not appear in a clean number.
# Only the FIRST suspicious character of each number is recorded (the scan stops
# at the first hit), so `char` can omit characters that only ever occur later in
# a number.
# Raw string: the original literal contained "\]", an invalid escape sequence in a
# normal string; the set of characters checked is unchanged.
suspect_chars = r"qwertyuioplkjhgfdsazxcvbnm!#$%&'()*.-+,/:;?@[\]^_`{|}~"
values = []   # phone numbers containing at least one suspicious character
char = set()  # the first suspicious character found in each of those numbers
for number in user_table["phone_number"]:
    first_hit = next((symbol for symbol in number if symbol in suspect_chars), None)
    if first_hit is not None:
        values.append(number)
        char.add(first_hit)
print(char)
# Possible follow-ups, left disabled.
# NOTE(review): the second line filters on first_name — presumably phone_number
# was intended; confirm before enabling.
#user_table[user_table['phone_number'].isin(values)]
#indices = user_table[user_table['first_name'].isin(values)].index
#user_table.drop(indices, inplace=True)

{'+', 'x', '('}


In [7]:
# Count missing values in every column.
user_table.isna().sum()

first_name       0
last_name        0
date_of_birth    0
company          0
email_address    0
address          0
country          0
country_code     0
phone_number     0
join_date        0
user_uuid        0
dtype: int64