In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns
import tabula

First, extract the card_details data from its source using the following code:

In [96]:
def retrieve_pdf_data(pdf_link):
    """Extract every table from a PDF and return them as one DataFrame.

    Parameters
    ----------
    pdf_link
        Location of the PDF, passed straight through to ``tabula.read_pdf``.

    Returns
    -------
    pandas.DataFrame or None
        All extracted tables stacked row-wise with a fresh 0..n-1 index.
        If anything raises during extraction or concatenation, the error
        is printed and ``None`` is returned instead.
    """
    try:
        # Ask tabula for the tables found on every page of the document
        extracted_tables = tabula.read_pdf(pdf_link, pages='all', multiple_tables=True)

        # Stack the per-table frames, discarding their individual indexes
        combined = pd.concat(extracted_tables, ignore_index=True)
    except Exception as e:
        # Best-effort: report the problem and signal failure with None
        print(f"Error extracting data from PDF: {e}")
        return None

    return combined

# Load the card details PDF from its public S3 URL; the result is None
# if retrieve_pdf_data could not extract it
card_data = retrieve_pdf_data(
    'https://data-handling-public.s3.eu-west-1.amazonaws.com/card_details.pdf'
)

In [90]:
# Preview the first five rows to see the columns and some sample values
card_data.head(n=5)

Unnamed: 0,card_number,expiry_date,card_provider,date_payment_confirmed
0,30060773296197,09/26,Diners Club / Carte Blanche,2015-11-25
1,349624180933183,10/23,American Express,2001-06-18
2,3529023891650490,06/23,JCB 16 digit,2000-12-26
3,213142929492281,09/27,JCB 15 digit,2011-02-12
4,502067329974,10/25,Maestro,1997-03-13


`.head()` gives a quick look at the column names and the first few rows of data.

In [91]:
# Summary statistics for every column, not just the numeric ones
card_data.describe(include='all')

Unnamed: 0,card_number,expiry_date,card_provider,date_payment_confirmed
count,15309.0,15309,15309,15309.0
unique,15299.0,136,25,8270.0
top,,03/28,VISA 16 digit,
freq,11.0,163,2426,11.0


In [92]:
# Report each column's dtype and non-null count
card_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15309 entries, 0 to 15308
Data columns (total 4 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   card_number             15309 non-null  object
 1   expiry_date             15309 non-null  object
 2   card_provider           15309 non-null  object
 3   date_payment_confirmed  15309 non-null  object
dtypes: object(4)
memory usage: 478.5+ KB


In [93]:
# Coerce card_number to numbers; values that cannot be parsed become NaN,
# so isna() is True exactly for the non-numeric entries
non_numeric_mask = pd.to_numeric(card_data['card_number'], errors='coerce').isna()

# Pull out the flagged rows for inspection
rows_with_non_numeric_values = card_data.loc[non_numeric_mask]

# Show the rows whose card_number is not numeric
print(rows_with_non_numeric_values)

                  card_number expiry_date                card_provider  \
149         ?4971858637664481       04/24                VISA 16 digit   
157       ???3554954842403828       06/29                 JCB 16 digit   
377                      NULL        NULL                         NULL   
827                VAB9DSB8ZM  NWS3P2W38H                   NB71VBAHJE   
847                      NULL        NULL                         NULL   
884                      NULL        NULL                         NULL   
1443               MOZOT5Q95V  8YJ3TYH6Z5                   WJVMUO4QX6   
1777       ??4654492346226715       03/23                VISA 16 digit   
2418                     NULL        NULL                         NULL   
2489                     NULL        NULL                         NULL   
2830                     NULL        NULL                         NULL   
3143        ?3544855866042397       10/28                 JCB 16 digit   
3694               K0084A9R99  ACT9K6E

In [102]:
# astype returns a new Series, so the result must be assigned back for the
# cast to take effect. Casting to str (rather than object, which the column
# already is) makes every value a string, as the .str accessor requires.
card_data["card_number"] = card_data["card_number"].astype(str)

# Confirm the column dtypes and non-null counts after the cast
card_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15309 entries, 0 to 15308
Data columns (total 4 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   card_number             15309 non-null  object
 1   expiry_date             15309 non-null  object
 2   card_provider           15309 non-null  object
 3   date_payment_confirmed  15309 non-null  object
dtypes: object(4)
memory usage: 478.5+ KB


In [105]:
# True wherever card_number fails numeric parsing (coercion turns it into NaN)
non_numeric_mask = pd.to_numeric(card_data['card_number'], errors='coerce').isna()

# Select just the rows flagged by the mask
rows_with_non_numeric_values = card_data.loc[non_numeric_mask]

# Print the rows that still hold a non-numeric card_number
print(rows_with_non_numeric_values)


      card_number expiry_date card_provider date_payment_confirmed
377          NULL        NULL          NULL                   NULL
827    VAB9DSB8ZM  NWS3P2W38H    NB71VBAHJE             GTC9KBWJO9
847          NULL        NULL          NULL                   NULL
884          NULL        NULL          NULL                   NULL
1443   MOZOT5Q95V  8YJ3TYH6Z5    WJVMUO4QX6             DJIXF1AFAZ
2418         NULL        NULL          NULL                   NULL
2489         NULL        NULL          NULL                   NULL
2830         NULL        NULL          NULL                   NULL
3694   K0084A9R99  ACT9K6ECRJ    JRPRLPIBZ2             H2PCQP4W50
4196         NULL        NULL          NULL                   NULL
4208   Y8ITI33X30  WDWMN9TU45    TS8A81WFXV             XTD27ANR5Q
4916   RNSCD8OCIM  VNLNMWPJII    JCQMU8FN85             7VGB4DA1WI
5686         NULL        NULL          NULL                   NULL
6024         NULL        NULL          NULL                   

In [106]:
# Remove every literal '?' character from the card numbers
card_data['card_number'] = (
    card_data['card_number']
    .str.replace(r'\?', '', regex=True)
)

In [109]:
# Flag the rows whose card_number is the literal text "NULL"
mask = card_data["card_number"] == 'NULL'

# Keep every row that was not flagged
card_data = card_data.loc[~mask]

In [110]:
# Numeric coercion yields NaN for unparseable values; isna() marks those rows
non_numeric_mask = pd.to_numeric(card_data['card_number'], errors='coerce').isna()

# Restrict the frame to the marked rows
rows_with_non_numeric_values = card_data.loc[non_numeric_mask]

# Print the remaining rows with a non-numeric card_number
print(rows_with_non_numeric_values)

      card_number expiry_date card_provider date_payment_confirmed
827    VAB9DSB8ZM  NWS3P2W38H    NB71VBAHJE             GTC9KBWJO9
1443   MOZOT5Q95V  8YJ3TYH6Z5    WJVMUO4QX6             DJIXF1AFAZ
3694   K0084A9R99  ACT9K6ECRJ    JRPRLPIBZ2             H2PCQP4W50
4208   Y8ITI33X30  WDWMN9TU45    TS8A81WFXV             XTD27ANR5Q
4916   RNSCD8OCIM  VNLNMWPJII    JCQMU8FN85             7VGB4DA1WI
6653   MIK9G2EMM0  4FI5GTUVYG    5CJH7ABGDR             RLQYRRYHPU
7332   I4PWLWSIRJ  RF1ACW165R    DE488ORDXY             T008RE1ZR6
7493   OMZSBN2XG3  6JJKS7R0WA    OGJTXI6X1H             7FL8EU9GBF
7818   NB8JJ05D7R  XRPE6C4GS9    1M38DYQTZV             GD9PHJXQR4
10457  G0EF4TS8C8  5VN8HOLMVE    DLWF2HANZF             WCK463ZO1Z
11345  Z8855EXTJX  Q7VGWP7LH9    XGZBYBYGUW             OE3KONN2V6
11465  JQTLQAAQTD  ZBGGFGY4H0    UA07L7EILH             T995FX2C7W
11499  T23BTBBJDD  UMR9FIE22M    BU9U947ZGV             EVVMMB3QYV
14884  LSWT9DT4G4  2ANT8LW3I5    5MFWFBZRM9             UZGSD0

In [111]:
# A card_number that cannot be parsed as a number becomes NaN under coercion
non_numeric_mask = pd.to_numeric(card_data['card_number'], errors='coerce').isna()

# Retain only the rows whose card_number parsed as numeric
card_data = card_data[~non_numeric_mask]

In [112]:
# Recompute the non-numeric flag: NaN after coercion means "not a number"
non_numeric_mask = pd.to_numeric(card_data['card_number'], errors='coerce').isna()

# Any rows selected here still have a non-numeric card_number
rows_with_non_numeric_values = card_data.loc[non_numeric_mask]

# Print the selection so any leftover rows are visible
print(rows_with_non_numeric_values)

Empty DataFrame
Columns: [card_number, expiry_date, card_provider, date_payment_confirmed]
Index: []


In [113]:
# Re-check row count, dtypes and non-null counts for the current frame
card_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 15284 entries, 0 to 15308
Data columns (total 4 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   card_number             15284 non-null  object
 1   expiry_date             15284 non-null  object
 2   card_provider           15284 non-null  object
 3   date_payment_confirmed  15284 non-null  object
dtypes: object(4)
memory usage: 597.0+ KB


In [116]:
# Count the missing values in each column
card_data.isna().sum()

card_number               0
expiry_date               0
card_provider             0
date_payment_confirmed    0
dtype: int64