In [1]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
import pygwalker as pyg

### Read the data into a dataframe

In [2]:
# Load every worksheet of the workbook in a single call.
# sheet_name=None makes read_excel return a dict of {sheet name: DataFrame};
# header=1 takes the column names from the second row of each sheet.
df = pd.read_excel(
    'KPMG_VI_New_raw_data_update_final.xlsx',
    sheet_name=None,
    header=1,
)

In [3]:
# List the sheet names that were loaded (the keys of the df dict).
print(df.keys())

dict_keys(['Title Sheet', 'Transactions', 'NewCustomerList', 'CustomerDemographic', 'CustomerAddress'])


###  Assigning sheets to dataframe objects

In [4]:
# Bind each worksheet to its own name so the datasets can be assessed one by one.
(
    transactions_df,
    New_customers_df,
    customer_demographic_df,
    customer_address_df,
    title_docx,
) = (
    df[sheet_name]
    for sheet_name in (
        'Transactions',
        'NewCustomerList',
        'CustomerDemographic',
        'CustomerAddress',
        'Title Sheet',
    )
)

## Assess the datasets one after the other:

#### Transaction dataset assessment

In [9]:
# Peek at 10 randomly chosen rows. No random_state is passed, so a re-run shows different rows.
transactions_df.sample(10)

Unnamed: 0,transaction_id,product_id,customer_id,transaction_date,online_order,order_status,brand,product_line,product_class,product_size,list_price,standard_cost,product_first_sold_date
6950,6951,73,3242,2017-11-11,0.0,Approved,Solex,Standard,medium,medium,1945.43,333.18,41848.0
8163,8164,74,30,2017-12-19,1.0,Approved,WeareA2B,Standard,medium,medium,1228.07,400.91,33429.0
9035,9036,81,818,2017-10-03,1.0,Approved,Norco Bicycles,Standard,medium,small,586.45,521.94,33429.0
18063,18064,78,1041,2017-12-10,0.0,Approved,Giant Bicycles,Standard,medium,large,1765.3,709.48,38193.0
9135,9136,41,598,2017-04-06,0.0,Approved,Norco Bicycles,Standard,low,medium,958.74,748.9,38693.0
18113,18114,95,647,2017-06-25,0.0,Approved,Giant Bicycles,Standard,medium,large,569.56,528.43,37337.0
18234,18235,0,1674,2017-09-27,0.0,Approved,OHM Cycles,Road,high,large,12.01,7.21,39880.0
10191,10192,19,1370,2017-11-27,1.0,Approved,OHM Cycles,Road,high,large,12.01,7.21,39880.0
4145,4146,3,3268,2017-05-09,0.0,Approved,Trek Bicycles,Standard,medium,large,2091.47,388.92,41009.0
15894,15895,66,264,2017-01-26,1.0,Approved,Giant Bicycles,Road,low,small,590.26,525.33,40487.0


In [10]:
# (number of rows, number of columns) of the transactions sheet.
transactions_df.shape

(20000, 13)

There are 20000 entries in the dataset.

### Note: the data quality issues found while exploring the datasets are documented at the bottom of this notebook.

In [11]:
# Completeness check: count the missing values in every column.
transactions_df.isna().sum()

transaction_id               0
product_id                   0
customer_id                  0
transaction_date             0
online_order               360
order_status                 0
brand                      197
product_line               197
product_class              197
product_size               197
list_price                   0
standard_cost              197
product_first_sold_date    197
dtype: int64

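There are null values in the dataset (360 in online_order and 197 in each of brand, product_line, product_class, product_size, standard_cost and product_first_sold_date), which indicates a completeness issue.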

In [22]:
# Check for fully duplicated transaction rows.
# duplicated() flags every row that repeats an earlier one, so the dataset is
# duplicate-free only when NO row is flagged, hence any() rather than all().
# (all() would merely show that not every row is a duplicate and would still
# report True when some duplicates are present.)
# Result: True means there are no duplicate rows.
not transactions_df.duplicated().any()

True

No duplicate transactions.

In [23]:
# Inspect a few raw values of product_first_sold_date to see how they are stored.
transactions_df['product_first_sold_date'].sample(5)

11367    42295.0
17738    34556.0
11489    39526.0
13796    42688.0
19217    42458.0
Name: product_first_sold_date, dtype: float64

Entries in the product_first_sold_date column should be dates, but they are stored as floats (the values look like spreadsheet serial date numbers; this should be confirmed before converting). This raises a validity issue.

In [24]:
# Frequency of each distinct online_order value (missing values are not counted).
transactions_df['online_order'].value_counts()

1.0    9829
0.0    9811
Name: online_order, dtype: int64

The online_order column should be of boolean data type, but it is stored as float (1.0 / 0.0). Floating-point values carry no natural meaning for a yes/no flag such as online_order, so this raises a validity issue.

In [25]:
# Frequency of each distinct order_status value.
transactions_df['order_status'].value_counts()

Approved     19821
Cancelled      179
Name: order_status, dtype: int64

In [26]:
# List the column names of the transactions dataset.
transactions_df.columns

Index(['transaction_id', 'product_id', 'customer_id', 'transaction_date',
       'online_order', 'order_status', 'brand', 'product_line',
       'product_class', 'product_size', 'list_price', 'standard_cost',
       'product_first_sold_date'],
      dtype='object')

In [27]:
# Inspect a few transaction_date values and the column's dtype.
transactions_df['transaction_date'].sample(10)

7554    2017-10-10
4808    2017-07-18
15401   2017-05-22
5780    2017-04-09
4564    2017-08-26
6153    2017-06-24
16905   2017-08-15
10411   2017-10-06
7650    2017-08-01
6595    2017-01-27
Name: transaction_date, dtype: datetime64[ns]

In [37]:
# Time span covered by transaction_date (latest date minus earliest date),
# to see whether the data covers 3 months or less.
transactions_df['transaction_date'].max() - transactions_df['transaction_date'].min()

Timedelta('363 days 00:00:00')

As shown in the above cell, the transaction_date column spans 363 days, not 3 months. The dataset consists of about one year of transaction data, which needs to be reduced to 3 months by choosing a reference date.

In [38]:
# Check whether any customer_id value appears more than once
# (only customer_id is tested in this cell).


transactions_df.customer_id.duplicated().any()

True

The customer_id column contains repeated values. This is expected, because a single customer can make multiple transactions or purchase many products from the shop.

In [66]:
# Does any (customer_id, transaction_id) pair occur more than once?
transactions_df.duplicated(subset=['customer_id', 'transaction_id']).any()

False

In [63]:
# Show the rows whose (customer_id, transaction_id) pair repeats an earlier row.
transactions_df.loc[
    transactions_df.duplicated(subset=['customer_id', 'transaction_id'])
]

Unnamed: 0,transaction_id,product_id,customer_id,transaction_date,online_order,order_status,brand,product_line,product_class,product_size,list_price,standard_cost,product_first_sold_date


In [67]:
# Does any transaction_id occur more than once?
transactions_df.duplicated(subset=['transaction_id']).any()

False

There are no duplicated transactions in the transactions dataset.

### New customers dataset assessment

In [68]:
# Shorter alias for the new-customers sheet. Plain assignment, so both names
# refer to the same DataFrame object (no copy is made).
customers_df = New_customers_df

In [69]:
# (number of rows, number of columns) of the new-customers sheet.
customers_df.shape

(1000, 23)

In [70]:
# List the column names of the new-customers dataset.
customers_df.columns

Index(['first_name', 'last_name', 'gender',
       'past_3_years_bike_related_purchases', 'DOB', 'job_title',
       'job_industry_category', 'wealth_segment', 'deceased_indicator',
       'owns_car', 'tenure', 'address', 'postcode', 'state', 'country',
       'property_valuation', 'Unnamed: 16', 'Unnamed: 17', 'Unnamed: 18',
       'Unnamed: 19', 'Unnamed: 20', 'Rank', 'Value'],
      dtype='object')

In [71]:
# Peek at 20 randomly chosen customers. No random_state is passed, so a re-run shows different rows.
customers_df.sample(20)

Unnamed: 0,first_name,last_name,gender,past_3_years_bike_related_purchases,DOB,job_title,job_industry_category,wealth_segment,deceased_indicator,owns_car,...,state,country,property_valuation,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19,Unnamed: 20,Rank,Value
9,Barthel,Docket,Male,72,1985-08-02,Accounting Assistant IV,IT,Mass Customer,N,Yes,...,QLD,Australia,5,0.84,1.0500,1.050000,0.892500,10,10,1.640625
620,Lek,Pimblett,Male,88,1955-09-29,Product Engineer,Financial Services,Mass Customer,N,Yes,...,NSW,Australia,2,0.62,0.7750,0.968750,0.823437,620,620,0.750000
827,Cristie,Bence,Female,49,2000-04-17,Automation Specialist II,,High Net Worth,N,No,...,QLD,Australia,8,0.94,0.9400,0.940000,0.940000,828,828,0.580000
986,Consalve,Ballay,Male,41,1959-09-18,Web Developer I,IT,Mass Customer,N,Yes,...,NSW,Australia,8,0.63,0.7875,0.787500,0.669375,987,987,0.400000
806,Sloan,Pudney,Male,83,1964-11-10,Junior Executive,Financial Services,Affluent Customer,N,No,...,QLD,Australia,9,0.68,0.6800,0.850000,0.850000,804,804,0.595000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
970,Mavra,Finan,Female,30,1967-08-23,Nurse,Property,Affluent Customer,N,Yes,...,NSW,Australia,9,0.87,1.0875,1.359375,1.359375,967,967,0.435625
46,Sheena,Kybbye,Female,14,1956-07-16,Paralegal,Financial Services,Affluent Customer,N,Yes,...,QLD,Australia,1,0.91,1.1375,1.137500,1.137500,46,46,1.407812
317,Cecelia,Cisar,Female,20,1985-03-26,,Property,Mass Customer,N,Yes,...,QLD,Australia,7,0.57,0.7125,0.712500,0.605625,312,312,1.020000
72,Kevina,Ferandez,Female,73,1999-09-21,Assistant Professor,Financial Services,High Net Worth,N,No,...,QLD,Australia,9,0.97,0.9700,1.212500,1.212500,72,72,1.350000


There are 1000 customers in the customers dataset

### Data quality issues:
> Completeness:
    >1. There are null values in the online_order, brand, product_line, product_class, product_size, standard_cost, product_first_sold_date columns.
    >2
    
> Consistency:
    >1. The list_price and standard_cost columns of some products are not consistent. We can have two products of the same size, class, brand and product_line with different list_price and different standard_cost. 
    
> Accuracy:
    >1. 
    
> Currency:
    >1. The transaction_date column contains data points that are not current, i.e., they are over three (3) months old. This can be resolved by getting the last three months of transaction data.
    >2. The customers dataset contains information about customers' purchases in the last three (3) years, which is also irrelevant since we're considering the last three (3) months of data.
    
> Relevancy:
    >1. The product_class column is not relevant since there is also a product_size column. Both seem to have the same format and interpretation, therefore it is irrelevant to make use of both.
    >2. The product_first_sold_date column is not relevant in a transaction database.
    
> Validity:
    >1. The product_first_sold_date column's data points are not valid date/datetime values.
    >2. online_order is stored as float (0.0 / 1.0) rather than as a boolean.
    
> Uniqueness:
    >1. 
    
    
