In [4]:
import psycopg2
from sqlalchemy import create_engine, inspect, text
from database_utils import DatabaseConnector

connector = DatabaseConnector('db_creds.yaml')


### Task One


In [6]:
alter_command = """
ALTER TABLE orders_table
    ALTER COLUMN date_uuid TYPE UUID USING (date_uuid::UUID),
    ALTER COLUMN user_uuid TYPE UUID USING (user_uuid::UUID),
    ALTER COLUMN card_number TYPE VARCHAR(255),
    ALTER COLUMN store_code TYPE VARCHAR(255),
    ALTER COLUMN product_code TYPE VARCHAR(255),
    ALTER COLUMN product_quantity TYPE SMALLINT USING (product_quantity::SMALLINT);
"""

connector.local_connection.execute(text(alter_command))

print("Column data types changed successfully.")

Column data types changed successfully.


### Task Two

In [7]:
alter_command = """
UPDATE dim_users
SET user_uuid = NULL
WHERE user_uuid::text !~ '-';

DELETE FROM dim_users WHERE user_uuid IS NULL;

ALTER TABLE dim_users
    ALTER COLUMN first_name TYPE VARCHAR(255),
    ALTER COLUMN last_name TYPE VARCHAR(255),
    ALTER COLUMN date_of_birth TYPE DATE USING (date_of_birth::DATE),
    ALTER COLUMN country_code TYPE VARCHAR(16), 
    ALTER COLUMN user_uuid TYPE UUID USING (user_uuid::UUID),
    ALTER COLUMN join_date TYPE DATE USING (join_date::DATE);


"""

connector.local_connection.execute(text(alter_command))

print("Column data types changed successfully.")

Column data types changed successfully.


### Task Three

In [8]:
alter_command = """ 
UPDATE dim_store_details SET latitude = COALESCE(latitude, lat);
ALTER TABLE dim_store_details DROP COLUMN lat;
"""

connector.local_connection.execute(text(alter_command))
connector.local_connection.commit()
print("Latitudes successfuly merged.")

Latitudes successfuly merged.


In [9]:
alter_command = """ 
UPDATE dim_store_details
SET staff_numbers = NULL
WHERE staff_numbers ~ '[^0-9]';

UPDATE dim_store_details
SET longitude = NULL
WHERE longitude !~ '^[-]?[0-9]+(\.[0-9]+)?$';

UPDATE dim_store_details
SET latitude = NULL
WHERE latitude !~ '^[-]?[0-9]+(\.[0-9]+)?$';

UPDATE dim_store_details
SET opening_date = NULL
WHERE opening_date !~ '^[0-9]{4}-[0-9]{2}-[0-9]{2}$';


"""

connector.local_connection.execute(text(alter_command))
connector.local_connection.commit()
print("Alphanumeric successfully removed")

Alphanumeric successfully removed


In [10]:
alter_command = """ 
ALTER TABLE dim_store_details
    ALTER COLUMN longitude TYPE FLOAT USING longitude::FLOAT,
    ALTER COLUMN locality TYPE VARCHAR(64),
    ALTER COLUMN store_code TYPE VARCHAR(255), 
    ALTER COLUMN staff_numbers TYPE SMALLINT USING staff_numbers::SMALLINT,
    ALTER COLUMN opening_date TYPE DATE USING opening_date::DATE,
    ALTER COLUMN store_type TYPE VARCHAR(255),
    ALTER COLUMN latitude TYPE FLOAT USING latitude::FLOAT,
    ALTER COLUMN country_code TYPE VARCHAR(255),
    ALTER COLUMN continent TYPE VARCHAR(255);

UPDATE dim_store_details
SET locality = 'N/A'
WHERE locality IS NULL;
"""

connector.local_connection.execute(text(alter_command))
connector.local_connection.commit()
print("Type successfully updated.")

Type successfully updated.


### TASK FOUR

In [11]:
alter_command = """
UPDATE dim_products
SET product_price = REPLACE(product_price, '£', '');

"""
connector.local_connection.execute(text(alter_command))
connector.local_connection.commit()
print("£ successfully removed")

£ successfully removed


In [12]:

alter_command = """
ALTER TABLE dim_products
ADD COLUMN weight_class VARCHAR(20);

UPDATE dim_products
SET weight_class = CASE
    WHEN weight_kg < 2 THEN 'Light'
    WHEN weight_kg >= 2 AND weight_kg < 40 THEN 'Mid_Sized'
    WHEN weight_kg >= 40 AND weight_kg < 140 THEN 'Heavy'
    WHEN weight_kg >= 140 THEN 'Truck_Required'
    ELSE 'Unknown'  -- Optional, to handle any cases not covered above
END;

"""
connector.local_connection.execute(text(alter_command))
connector.local_connection.commit()
print("categories successfully added")

categories successfully added


In [13]:

alter_command = """

ALTER TABLE dim_products
RENAME COLUMN removed TO still_available;

UPDATE dim_products
SET still_available = CASE
    WHEN still_available = 'Still_avaliable' THEN 'true'
    ELSE 'false'
END;


"""
connector.local_connection.execute(text(alter_command))
connector.local_connection.commit()
print("Categories successfully added")

Categories successfully added


In [14]:
alter_command = """
UPDATE dim_products
SET product_price = NULL
WHERE product_price !~ '^[-]?[0-9]+(\.[0-9]+)?$';


UPDATE dim_products
SET date_added = NULL
WHERE date_added !~ '^[0-9]{4}-[0-9]{2}-[0-9]{2}$';

UPDATE dim_products
SET uuid = NULL
WHERE uuid !~ '-';

"""
connector.local_connection.execute(text(alter_command))
connector.local_connection.commit()
print("Categories successfully added")

Categories successfully added


In [15]:
# ALTER COLUMN weight_kg TYPE FLOAT USING weight_kg::FLOAT,
alter_command = """

ALTER TABLE dim_products
    ALTER COLUMN product_price TYPE FLOAT USING product_price::FLOAT,
    ALTER COLUMN \"EAN\" TYPE VARCHAR(20),  
    ALTER COLUMN product_code TYPE VARCHAR(20),  
    ALTER COLUMN date_added TYPE DATE USING date_added::DATE,
    ALTER COLUMN uuid TYPE UUID USING uuid::UUID,
    ALTER COLUMN still_available TYPE BOOL USING still_available::BOOL,
    ALTER COLUMN weight_class TYPE VARCHAR(20);  

"""
connector.local_connection.execute(text(alter_command))
connector.local_connection.commit()
print("Categories successfully added") 

Categories successfully added


### TASK FIVE

In [16]:
alter_command = """
UPDATE dim_date_times
SET month = NULL
WHERE month !~ '^[-]?[0-9]+(\.[0-9]+)?$';

UPDATE dim_date_times
SET year = NULL
WHERE year !~ '^[-]?[0-9]+(\.[0-9]+)?$';

UPDATE dim_date_times
SET day = NULL
WHERE day !~ '^[-]?[0-9]+(\.[0-9]+)?$';

UPDATE dim_date_times
SET timestamp = NULL
WHERE timestamp !~ ':';

UPDATE dim_date_times
SET date_uuid = NULL
WHERE date_uuid !~ '-';

UPDATE dim_date_times
SET time_period = NULL
WHERE time_period !~ '^[A-Za-z]+$';

"""
connector.local_connection.execute(text(alter_command))
connector.local_connection.commit()
print("removed unwanted items")

removed unwanted items


In [17]:
# ALTER COLUMN weight_kg TYPE FLOAT USING weight_kg::FLOAT,
alter_command = """

ALTER TABLE dim_date_times
    ALTER COLUMN month TYPE VARCHAR(20),  
    ALTER COLUMN year TYPE VARCHAR(10),   
    ALTER COLUMN day TYPE VARCHAR(10),    
    ALTER COLUMN time_period TYPE VARCHAR(20),  
    ALTER COLUMN date_uuid TYPE UUID USING date_uuid::uuid;


"""
connector.local_connection.execute(text(alter_command))
connector.local_connection.commit()
print("Casting done successfully ") 

Casting done successfully 


### TASK SIX

In [18]:
alter_command = """
UPDATE dim_card_details
SET card_number = NULL
WHERE card_number !~ '^[-]?[0-9]+(\.[0-9]+)?$';



UPDATE dim_card_details
SET expiry_date = NULL
WHERE expiry_date !~ '/';

"""
connector.local_connection.execute(text(alter_command))
connector.local_connection.commit()
print("removed unwanted items")

removed unwanted items


In [19]:
# ALTER COLUMN weight_kg TYPE FLOAT USING weight_kg::FLOAT,
alter_command = """

ALTER TABLE dim_card_details
    ALTER COLUMN card_number TYPE VARCHAR(32),
    ALTER COLUMN expiry_date TYPE VARCHAR(8),
    ALTER COLUMN date_payment_confirmed TYPE DATE;



"""
connector.local_connection.execute(text(alter_command))
connector.local_connection.commit()
print("Casting done successfully ") 

Casting done successfully 


### TASK 7

In [20]:
alter_command = """
DELETE FROM dim_card_details WHERE card_number IS NULL;


ALTER TABLE dim_card_details
ADD PRIMARY KEY (card_number);

"""
connector.local_connection.execute(text(alter_command))
connector.local_connection.commit()
print("Primary key set successfully ") 


Primary key set successfully 


In [21]:
alter_command = """
DELETE FROM dim_date_times WHERE date_uuid IS NULL;


ALTER TABLE dim_date_times
ADD PRIMARY KEY (date_uuid);

"""
connector.local_connection.execute(text(alter_command))
connector.local_connection.commit()
print("Primary key set successfully ") 


Primary key set successfully 


In [22]:
alter_command = """
DELETE FROM dim_users WHERE user_uuid IS NULL;


ALTER TABLE dim_users
ADD PRIMARY KEY (user_uuid);

"""
connector.local_connection.execute(text(alter_command))
connector.local_connection.commit()
print("Primary key set successfully ") 


Primary key set successfully 


In [23]:
alter_command = """
DELETE FROM dim_store_details WHERE store_code IS NULL;
DELETE FROM dim_store_details WHERE store_code = 'NULL';

ALTER TABLE dim_store_details
ADD PRIMARY KEY (store_code);

"""
connector.local_connection.execute(text(alter_command))
connector.local_connection.commit()
print("Primary key set successfully ") 


Primary key set successfully 


In [24]:
alter_command = """
DELETE FROM dim_products WHERE product_code IS NULL;
DELETE FROM dim_products WHERE product_code = 'NULL';

ALTER TABLE dim_products
ADD PRIMARY KEY (product_code);

"""
connector.local_connection.execute(text(alter_command))
connector.local_connection.commit()
print("Primary key set successfully ") 


Primary key set successfully 


### TASK EIGHT

In [25]:

alter_command = """
ALTER TABLE orders_table 
    ALTER COLUMN date_uuid TYPE UUID USING date_uuid::uuid,
    ALTER COLUMN card_number TYPE VARCHAR(32);

ALTER TABLE orders_table
ADD CONSTRAINT fk_orders_users
FOREIGN KEY (user_uuid) REFERENCES dim_users(user_uuid),
ADD CONSTRAINT fk_orders_products
FOREIGN KEY (product_code) REFERENCES dim_products(product_code),
ADD CONSTRAINT fk_orders_stores
FOREIGN KEY (store_code) REFERENCES dim_store_details(store_code),
ADD CONSTRAINT fk_orders_dates
FOREIGN KEY (date_uuid) REFERENCES dim_date_times(date_uuid),
ADD CONSTRAINT fk_orders_cards
FOREIGN KEY (card_number) REFERENCES dim_card_details(card_number);


"""
connector.local_connection.execute(text(alter_command))
connector.local_connection.commit()
print("Primary key set successfully ") 


Primary key set successfully 


In [90]:
!pip install tabula-py

Collecting tabula-py


[notice] A new release of pip available: 22.3.1 -> 23.3.2
[notice] To update, run: python.exe -m pip install --upgrade pip



  Using cached tabula_py-2.9.0-py3-none-any.whl (12.0 MB)
Collecting distro
  Using cached distro-1.9.0-py3-none-any.whl (20 kB)
Installing collected packages: distro, tabula-py
Successfully installed distro-1.9.0 tabula-py-2.9.0


In [1]:
from data_extraction import DataExtractor
from data_cleaning import DataCleaning
from database_utils import DatabaseConnector

db_connector = DatabaseConnector('db_creds.yaml')
extractor = DataExtractor(db_connector)

In [2]:

#### extract the card data
card_data_df = extractor.retrieve_pdf_data('https://data-handling-public.s3.eu-west-1.amazonaws.com/card_details.pdf')

#### clean the cards data
data_cleaner = DataCleaning(card_data_df)
cleaned_data = data_cleaner.clean_card_data()
print(cleaned_data)



Error importing jpype dependencies. Fallback to subprocess.
No module named 'jpype'
Got stderr: Jan 30, 2024 10:34:44 AM org.apache.pdfbox.rendering.PDFRenderer suggestKCMS
INFO: Your current java version is: 1.8.0_111
Jan 30, 2024 10:34:44 AM org.apache.pdfbox.rendering.PDFRenderer suggestKCMS
INFO: To get higher rendering speed on old java 1.8 or 9 versions,
Jan 30, 2024 10:34:44 AM org.apache.pdfbox.rendering.PDFRenderer suggestKCMS
INFO:   update to the latest 1.8 or 9 version (>= 1.8.0_191 or >= 9.0.4),
Jan 30, 2024 10:34:44 AM org.apache.pdfbox.rendering.PDFRenderer suggestKCMS
INFO:   or
Jan 30, 2024 10:34:44 AM org.apache.pdfbox.rendering.PDFRenderer suggestKCMS
INFO:   use the option -Dsun.java2d.cmm=sun.java2d.cmm.kcms.KcmsServiceProvider
Jan 30, 2024 10:34:44 AM org.apache.pdfbox.rendering.PDFRenderer suggestKCMS
INFO:   or call System.setProperty("sun.java2d.cmm", "sun.java2d.cmm.kcms.KcmsServiceProvider")



removing regex
            card_number expiry_date                card_provider  \
0        30060773296197       09/26  Diners Club / Carte Blanche   
1       349624180933183       10/23             American Express   
2      3529023891650490       06/23                 JCB 16 digit   
3       213142929492281       09/27                 JCB 15 digit   
4          502067329974       10/25                      Maestro   
...                 ...         ...                          ...   
15304   180036921556789       12/28                 JCB 15 digit   
15305   180018030448512       11/24                 JCB 15 digit   
15306  3569953313547220       04/24                 JCB 16 digit   
15307  4444521712606810       06/27                VISA 16 digit   
15308   372031786522735       02/30             American Express   

      date_payment_confirmed  
0                 2015-11-25  
1                 2001-06-18  
2                 2000-12-26  
3                 2011-02-12  
4            

In [3]:

# Finding rows where card_number contains '3554954842403828' (not an exact match)
contains_rows = card_data_df[card_data_df['card_number'].str.contains('6011036876440620', na=False)]

contains_rows



Unnamed: 0.1,card_number,expiry_date,card_provider,date_payment_confirmed,card_number expiry_date,Unnamed: 0


In [86]:
import pandas as pd

def clean_card_data(data):
    # Handle NULL values
    data = data[['card_number', 
                            'expiry_date', 
                            'card_provider',
                            'date_payment_confirmed']
                            ] 
    data['card_number'] = data['card_number'].astype('str')
    
    data = data.dropna(subset=['card_number'])
    data['card_number'] = data['card_number'].str.replace('^\\?+', '', regex=True)
    
    data['date_payment_confirmed'] = pd.to_datetime(data['date_payment_confirmed'], errors='coerce')


    return data

# Sample data
data = {
    'card_number': ['1234567890123456', '???9876543210987', None, '5555888899990000'],
    'expiry_date': ['12/22', '01/24', '03/23', '11/25'],
    'card_provider': ['Visa', 'MasterCard', 'Visa', 'American Express'],
    'date_payment_confirmed': ['2021-08-15', '2021-07-24', None, 'invalid_date']
}

# Creating DataFrame
dim_card_details = pd.DataFrame(data)

# Cleaning the data
cleaned_data = clean_card_data(card_data_df)
print(cleaned_data)


         card_number expiry_date                card_provider  \
0     30060773296197       09/26  Diners Club / Carte Blanche   
1    349624180933183       10/23             American Express   
2   3529023891650490       06/23                 JCB 16 digit   
3    213142929492281       09/27                 JCB 15 digit   
4       502067329974       10/25                      Maestro   
..               ...         ...                          ...   
14   180036921556789       12/28                 JCB 15 digit   
15   180018030448512       11/24                 JCB 15 digit   
16  3569953313547220       04/24                 JCB 16 digit   
17  4444521712606810       06/27                VISA 16 digit   
18   372031786522735       02/30             American Express   

   date_payment_confirmed  
0              2015-11-25  
1              2001-06-18  
2              2000-12-26  
3              2011-02-12  
4              1997-03-13  
..                    ...  
14             1997-06-

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['card_number'] = data['card_number'].astype('str')


In [None]:
import re

test_string = '??42258299852'
new_string = re.sub('^\\?\\?', '', test_string)

print(new_string)


42258299852
