In [1]:
import yaml
from sqlalchemy import create_engine
from sqlalchemy import inspect
import pandas as pd
import numpy as np
from data_cleaning import DataCleaning

class DataExtractor:
    def __init__(self):
        self.engine = None
    
    def read_db_creds(self, filename: str) -> dict:
        '''This function reads a YAML file containing database credentials and returns them as a dictionary.
        
        Parameters
        ----------
        filename : str
            The filename parameter is a string that represents the name of the file that contains the database
        credentials.
        
        Returns
        -------
            A dictionary containing the database credentials read from the specified file.
        
        '''
        with open(filename, 'r') as f:
            creds = yaml.safe_load(f)
            
        return creds
    
    def init_db_engine(self, creds: dict):
        '''This function initializes a PostgreSQL database engine using the provided credentials.
        
        Parameters
        ----------
        creds : dict
            The `creds` parameter is a dictionary that contains the credentials needed to connect to a
        PostgreSQL database. It should have the following keys:
        
        Returns
        -------
            The function `init_db_engine` returns a SQLAlchemy engine object that connects to a PostgreSQL
        database using the credentials provided in the `creds` dictionary.
        
        '''
        url = f"postgresql://{creds['RDS_USER']}:{creds['RDS_PASSWORD']}@{creds['RDS_HOST']}:{creds['RDS_PORT']}/{creds['RDS_DATABASE']}"
        self.engine = create_engine(url)
        
        return self.engine
    
    def list_db_tables(self):
        '''This function inspects a database engine and prints out a list of table names.
        
        '''
        inspector = inspect(self.engine)
        table_names = inspector.get_table_names()
        print(table_names)
    
    def read_rds_table(self, table_name: str) -> pd.DataFrame:
        '''This function reads a table from a database using the specified table name and returns it as a
        pandas DataFrame.
        
        Parameters
        ----------
        table_name : str
            The name of the table in the database that you want to read.
        
        Returns
        -------
            The function `read_rds_table` returns a pandas DataFrame containing the data from the specified
        table in the database.
        
        '''
        with database_extractor.engine.connect() as conn:
            df = pd.read_sql_table(table_name, con=conn)
            
        return df

if __name__ == '__main__':
    database_extractor = DataExtractor()
    creds_dict = database_extractor.read_db_creds("db_creds.yaml")
    db_engine = database_extractor.init_db_engine(creds_dict)
    #table_list = database_extractor.list_db_tables()
    user_table = database_extractor.read_rds_table("legacy_users")
    print(user_table.columns)

Index(['index', 'first_name', 'last_name', 'date_of_birth', 'company',
       'email_address', 'address', 'country', 'country_code', 'phone_number',
       'join_date', 'user_uuid'],
      dtype='object')


In [6]:
user_table.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 15320 entries, 0 to 1249
Data columns (total 11 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   first_name     15320 non-null  object
 1   last_name      15320 non-null  object
 2   date_of_birth  15320 non-null  object
 3   company        15320 non-null  object
 4   email_address  15320 non-null  object
 5   address        15320 non-null  object
 6   country        15320 non-null  object
 7   country_code   15320 non-null  object
 8   phone_number   15320 non-null  object
 9   join_date      15320 non-null  object
 10  user_uuid      15320 non-null  object
dtypes: object(11)
memory usage: 1.4+ MB


In [2]:
data_cleaner = DataCleaning()
user_table = data_cleaner.clean_user_data(user_table)

In [4]:
user_table.country_code.unique()

array(['DE', 'GB', 'US'], dtype=object)

In [28]:
#user_table.set_index("index", inplace=True)
user_table.sample(8)

Unnamed: 0_level_0,first_name,last_name,date_of_birth,company,email_address,address,country,country_code,phone_number,join_date,user_uuid
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
10218,Tyler,Hughes,1990-12-27,"Brown, Lyons and Scott",thomas10@brown.org,"716 Stout Ferry, Port Maurice, VA 58120",United States,US,13080319991,2017-04-20,ee6fb2de-8ef1-4865-86e6-a3a9b9a214ff
312,Cameron,Fisher,1987-04-06,"Henderson, Roberts and Williams",alexcollier@young-douglas.info,"6 Reece points, Trevorhaven, N72 3EJ",United Kingdom,GB,442074960534,1994-01-15,b069aa6d-0957-4539-99f7-5462023351a2
2956,Dan,Scheuermann,1943-08-10,Häring Käster AG,mira34@thanel.com,"Peter-Scheuermann-Platz 4/3, 05849 Schwabmünchen",Germany,DE,49984323613,2001-05-22,24f2b7db-d9c4-49db-8ee8-3d814a1fe041
4552,Lorraine,Sykes,1939-01-19,White Ltd,kennethtaylor@davies.com,"Flat 81, Lawson shoal, Loweland, W3E 4YE",United Kingdom,GB,442074960228,2007-04-02,8c5eae60-f534-4643-afb5-703aa4bc6b64
3891,Marcus,Stephens,1942-07-02,"Bradshaw, Stephens and Morris",dean08@duffy.biz,"5 Thomson landing, North Seanland, L1A 2QX",United Kingdom,GB,441144960720,2003-07-01,c6bbdd14-7fa6-4472-866e-48c2422d36a7
6839,Geraldine,Parker,1974-01-05,"Clark, Brown and Pearson",wilkinsfrederick@baxter.org,"Flat 75, Aaron bridge, Robertstown, KY52 5UL",United Kingdom,GB,443069990413,2019-05-08,889e252f-da21-4e41-9d9f-bb44955d1740
9547,Jimmy,Fox,1952-09-07,Wilson-Malone,john24@hernandez.com,"0008 James Spurs, Michaelbury, FM 18735",United States,US,11068423457,2006-10-13,59766cb4-71f7-41c0-bb9b-d716c51760a8
13516,Peter,Watson,1977-09-23,Newman and Sons,turnerross@williams.com,"471 Frank neck, West Lesleyhaven, M04 9BB",United Kingdom,GB,441164960065,2002-12-07,0d11f1f7-c36f-4503-ada9-f6f70dd6904a


In [6]:
user_table[user_table["phone_number"].str.contains("904", regex=True) & (user_table["country_code"] == "GB")]

Unnamed: 0,index,first_name,last_name,date_of_birth,company,email_address,address,country,country_code,phone_number,join_date,user_uuid
19,19,Jay,Miah,2003-06-10,Taylor and Sons,qcampbell@campbell.biz,78 Elliot flats\nNew Jenna\nS98 4TZ,United Kingdom,GB,+44306 9990447,2007-07-07,649b54e4-011e-43e1-b0f7-e90f355f806e
174,174,Harriet,Gilbert,1965-07-05,Bibi-Smith,geraldinekemp@atkinson.net,4 Webster corner\nWest Annetteburgh\nB7S 8GE,United Kingdom,GB,+44(0)9098790419,2015-04-26,c0442326-009d-4a06-9bb9-ffc5cedf2894
690,690,Joanne,Stewart,1972-02-15,McDonald Ltd,cbarnes@briggs.com,33 Toby prairie\nNorth Dawn\nSR0 6GZ,United Kingdom,GB,+44(0)306 9990466,1994-02-11,d9a39eb6-457e-4ed5-9efb-f6206d5ed65a
862,863,Neil,Clark,1985-07-29,Marshall-Moss,sian75@fleming.co.uk,33 Harvey ramp\nLake Damienstad\nB6 4NX,United Kingdom,GB,+44306 9990496,2006-06-09,e8926c4d-e573-4acd-ab88-c9d4ecd61646
1485,1487,Lydia,Turner,1973-01-16,Hanson Group,adamslee@fleming-smith.info,0 Gardner knoll\nPort Bryan\nHA4 4GX,United Kingdom,GB,+44(0)306 9990453,2009-10-04,648adb4c-10d4-4974-b3f9-79576f1683a1
...,...,...,...,...,...,...,...,...,...,...,...,...
14830,14857,Garry,Jones,1973-11-01,King Inc,goodwinkatie@hall-murphy.com,4 Frank key\nSouth Damien\nEH2 1PL,United Kingdom,GB,+441174960904,2009-06-07,7a180251-3b4a-472f-aa83-0ee625f2c8a1
14852,14879,Pamela,Lucas,2005-11-25,Grant-Hyde,lambroger@jackson.com,8 Rachael neck\nHoughtonstad\nUB0A 3JE,United Kingdom,GB,01632 960 904,2001-10-03,2a85b9bb-1e8c-4081-b372-778c7316962a
14863,14890,Abigail,Turner,1977-06-01,Young-Smith,wallacemaureen@bartlett.com,55 Katie trace\nEast Callumtown\nE9G 8YB,United Kingdom,GB,+44117 496 0904,2018-12-28,c64d1f26-7ff0-41d9-9d62-74f57ccde5da
15054,15084,Kieran,Knight,1991-03-29,Palmer LLC,klittle@dixon.org,938 Dennis lodge\nEast Tinatown\nM8 0PB,United Kingdom,GB,+44(0)9098790463,2000-11-20,2bec22f6-4500-40c6-8bd4-8f20804dd97f


In [9]:
user_table.iloc[3187]

index                                                         3190
first_name                                                   Kevin
last_name                                                  O'Brien
date_of_birth                                           1955-03-22
company                                 Phillips, Oliver and Begum
email_address                               caroleball@cameron.com
address          Studio 5\nKatie ville\nPort Nathanchester\nKT5...
country                                             United Kingdom
country_code                                                    GB
phone_number                                          020 74960904
join_date                                               2006-07-12
user_uuid                     ef52d477-2383-45dc-bcea-d102c540ab71
Name: 3187, dtype: object

In [6]:
values = []
char = set()
for number in user_table["phone_number"]:
    for chars in number:
        if chars in "qwertyuioplkjhgfdsazxcvbnm!#$%&'()*.-+,/:;?@[\]^_`{|}~":
            values.append(number)
            char.add(chars)
            break
print(char)
#user_table[user_table['phone_number'].isin(values)]
#indices = user_table[user_table['first_name'].isin(values)].index
#user_table.drop(indices, inplace=True)

{'+', 'x', '('}


In [7]:
# check for nulls
user_table.isnull().sum()

first_name       0
last_name        0
date_of_birth    0
company          0
email_address    0
address          0
country          0
country_code     0
phone_number     0
join_date        0
user_uuid        0
dtype: int64