In [1]:
import yaml
import tabula
import requests
import calendar
import numpy as np
import re as regex
import pandas as pd
from datetime import datetime
from data_extraction import DataExtractor
from database_utils import DatabaseConnector
from data_cleaning import DataCleaning


In [2]:

# URL handed to DataExtractor.retrieve_pdf_data; the result is kept as card_df.
CARD_DETAILS_PDF_URL = 'https://data-handling-public.s3.eu-west-1.amazonaws.com/card_details.pdf'
card_df = DataExtractor.retrieve_pdf_data(CARD_DETAILS_PDF_URL)

In [3]:
# Show column names, non-null counts and dtypes of card_df before any cleaning.
card_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 15309 entries, 0 to 8
Data columns (total 4 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   card_number             15309 non-null  object
 1   expiry_date             15309 non-null  object
 2   card_provider           15309 non-null  object
 3   date_payment_confirmed  15309 non-null  object
dtypes: object(4)
memory usage: 598.0+ KB


In [4]:
# Preview the first five rows of card_df.
card_df.head(5)

Unnamed: 0,card_number,expiry_date,card_provider,date_payment_confirmed
0,30060773296197,09/26,Diners Club / Carte Blanche,2015-11-25
1,349624180933183,10/23,American Express,2001-06-18
2,3529023891650490,06/23,JCB 16 digit,2000-12-26
3,213142929492281,09/27,JCB 15 digit,2011-02-12
4,502067329974,10/25,Maestro,1997-03-13


In [5]:
# Parse the payment dates; values pandas cannot interpret become NaT (errors='coerce').
card_df['date_payment_confirmed'] = card_df['date_payment_confirmed'].pipe(pd.to_datetime, errors='coerce')

In [6]:
# Keep only rows whose payment date is present (i.e. not NaT/NaN).
# .copy() makes the filtered frame independent of the original, so the column
# assignments in later cells do not raise SettingWithCopyWarning or write to a
# temporary slice.
card_df = card_df.loc[card_df['date_payment_confirmed'].notna()].copy()

In [7]:
# Sort the expiry_date values to eyeball their range. At this point the column
# still holds 'MM/YY' strings, so the ordering is lexicographic (month first).
card_df['expiry_date'].sort_values()

6     01/23
8     01/23
43    01/23
4     01/23
35    01/23
      ...  
24    12/31
49    12/31
41    12/31
15    12/31
49    12/31
Name: expiry_date, Length: 15284, dtype: object

In [8]:
# 'MM/YY' layout used by the expiry_date strings.
EXPIRY_DATE_FORMAT = "%m/%y"
card_df['expiry_date'] = pd.to_datetime(card_df['expiry_date'], format=EXPIRY_DATE_FORMAT)

In [9]:
# Move every expiry date to the last day of its month.
# MonthEnd(0) rolls forward to the month end but leaves dates that are already on
# a month end untouched, so re-running the cell is safe. Unlike a row-wise
# calendar.monthrange lambda it is vectorised and propagates NaT instead of raising.
card_df['expiry_date'] = card_df['expiry_date'] + pd.offsets.MonthEnd(0)
# Distribution of the resulting month-end dates.
card_df['expiry_date'].value_counts()

2028-03-31    163
2029-12-31    163
2028-01-31    162
2025-05-31    152
2026-12-31    152
             ... 
2032-02-29    103
2023-02-28     94
2025-09-30     92
2032-11-30     78
2022-11-30     37
Name: expiry_date, Length: 121, dtype: int64

In [10]:
# Summarise card_number (count, number of unique values, most frequent value).
card_df['card_number'].describe()

count              15284
unique             15284
top       30060773296197
freq                   1
Name: card_number, dtype: int64

In [11]:
# Flag card numbers that contain a literal '?' anywhere in the value.
# str.contains searches the whole string; the previous regex.match only tested
# position 0 and would miss a '?' later in the number. The pattern is a raw string
# because '\?' is not a valid escape sequence in a normal string literal.
card_df['isString'] = card_df['card_number'].astype(str).str.contains(r'\?', regex=True)

In [12]:
# Number of rows flagged in the boolean isString column (True counts as 1).
int(card_df['isString'].sum())

26

In [13]:
# Flag card numbers that contain a literal '?' anywhere in the value.
# Vectorised equivalent of applying regex.search per row; the pattern is a raw
# string because '\?' is not a valid escape sequence in a normal string literal.
card_df['isString'] = card_df['card_number'].astype(str).str.contains(r'\?', regex=True)

In [14]:
# Number of rows flagged in the boolean isString column (True counts as 1).
int(card_df['isString'].sum())

26

In [15]:
# List the current columns; 'isString' is the temporary '?' flag added earlier.
card_df.columns

Index(['card_number', 'expiry_date', 'card_provider', 'date_payment_confirmed',
       'isString'],
      dtype='object')

In [17]:
# Remove every literal '?' from the card numbers (regex replacement on string values).
# The pattern is a raw string: '\?' in a normal literal is an invalid escape
# sequence and triggers a warning on recent Python versions.
card_df['card_number'] = card_df['card_number'].replace({r'\?': ''}, regex=True)

In [18]:
# Re-check the card_number summary after the '?' characters have been stripped.
card_df['card_number'].describe()

count              15284
unique             15284
top       30060773296197
freq                   1
Name: card_number, dtype: int64

In [19]:
# Convert card_number to a numeric dtype; any value that still cannot be parsed
# raises (errors='raise' is the pandas default, spelled out here).
card_df['card_number'] = pd.to_numeric(card_df['card_number'], errors='raise')

In [22]:
# Remove the temporary isString flag column.
# Rebinding instead of inplace=True avoids mutating a possibly sliced frame, and
# errors='ignore' keeps the cell re-runnable after the column is already gone.
card_df = card_df.drop(columns='isString', errors='ignore')
# Confirm the final dtypes and non-null counts.
card_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 15284 entries, 0 to 8
Data columns (total 4 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   card_number             15284 non-null  int64         
 1   expiry_date             15284 non-null  datetime64[ns]
 2   card_provider           15284 non-null  object        
 3   date_payment_confirmed  15284 non-null  datetime64[ns]
dtypes: datetime64[ns](2), int64(1), object(1)
memory usage: 597.0+ KB
