In [1]:
# Importing libraries
import pandas as pd
import numpy as np
import seaborn as sns
from datetime import datetime as dt
import matplotlib.pyplot as plt
import calendar


### Completeness: Are there missing values?

### Accuracy: Are there correct values?

### Consistency: Any contradictions?

### Currency: Are all values current?

### Relevancy: Data items with value meta-data?

### Validity: Are the data allowable values?

### Uniqueness: Any duplicated values?

In [83]:
# Importing the files and creating dataframes
# The workbook path is named once, and all four sheets are requested in a
# single read_excel call so the file is opened and parsed one time instead
# of once per sheet.
RAW_DATA_FILE = 'KPMG_VI_New_raw_data.xlsx'

# Passing a list to sheet_name returns a dict of DataFrames keyed by sheet name.
raw_sheets = pd.read_excel(
    RAW_DATA_FILE,
    sheet_name=['Transactions', 'NewCustomerList', 'CustomerDemographic', 'CustomerAddress'],
)

transactions = raw_sheets['Transactions']
newcustlist = raw_sheets['NewCustomerList']
customerdemo = raw_sheets['CustomerDemographic']
custaddress = raw_sheets['CustomerAddress']



## Transactions dataset

In [3]:
# Information on data
# info() prints the row count, each column's non-null count and its dtype;
# a non-null count below the row count points to a column with missing values.
transactions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 13 columns):
 #   Column                   Non-Null Count  Dtype         
---  ------                   --------------  -----         
 0   transaction_id           20000 non-null  int64         
 1   product_id               20000 non-null  int64         
 2   customer_id              20000 non-null  int64         
 3   transaction_date         20000 non-null  datetime64[ns]
 4   online_order             19640 non-null  float64       
 5   order_status             20000 non-null  object        
 6   brand                    19803 non-null  object        
 7   product_line             19803 non-null  object        
 8   product_class            19803 non-null  object        
 9   product_size             19803 non-null  object        
 10  list_price               20000 non-null  float64       
 11  standard_cost            19803 non-null  float64       
 12  product_first_sold_date  19803 n

In [4]:
# Tally how many entries are null in each column of the transactions frame
missing_vals_count = transactions.isnull().sum(axis=0)
missing_vals_count

transaction_id               0
product_id                   0
customer_id                  0
transaction_date             0
online_order               360
order_status                 0
brand                      197
product_line               197
product_class              197
product_size               197
list_price                   0
standard_cost              197
product_first_sold_date    197
dtype: int64

In [5]:
# Express each column's null tally as a percentage of the total row count
missing_vals_percentage = missing_vals_count.div(len(transactions)).mul(100)
missing_vals_percentage

transaction_id             0.000
product_id                 0.000
customer_id                0.000
transaction_date           0.000
online_order               1.800
order_status               0.000
brand                      0.985
product_line               0.985
product_class              0.985
product_size               0.985
list_price                 0.000
standard_cost              0.985
product_first_sold_date    0.985
dtype: float64

In [6]:
# Put the counts and the percentages side by side in one labelled summary table
missing_data = pd.DataFrame({
    "total missing": missing_vals_count,
    "% missing": missing_vals_percentage,
})
missing_data

Unnamed: 0,total missing,% missing
transaction_id,0,0.0
product_id,0,0.0
customer_id,0,0.0
transaction_date,0,0.0
online_order,360,1.8
order_status,0,0.0
brand,197,0.985
product_line,197,0.985
product_class,197,0.985
product_size,197,0.985


### So how do we work with these missing data points?
    - First we need to determine if they are categorical or continuous
    - Once determined, we can fill them with the mean, median, or mode; we can
      drop the affected rows altogether if they don't affect the analysis; or we can
      drop the entire variable if the % missing is too high.



In [7]:
# Calling the data frame to investigate
# head() shows the first five rows, which gives a feel for the values held in each column.
transactions.head()

Unnamed: 0,transaction_id,product_id,customer_id,transaction_date,online_order,order_status,brand,product_line,product_class,product_size,list_price,standard_cost,product_first_sold_date
0,1,2,2950,2017-02-25,0.0,Approved,Solex,Standard,medium,medium,71.49,53.62,41245.0
1,2,3,3120,2017-05-21,1.0,Approved,Trek Bicycles,Standard,medium,large,2091.47,388.92,41701.0
2,3,37,402,2017-10-16,0.0,Approved,OHM Cycles,Standard,low,medium,1793.43,248.82,36361.0
3,4,88,3135,2017-08-31,0.0,Approved,Norco Bicycles,Standard,medium,medium,1198.46,381.1,36145.0
4,5,78,787,2017-10-01,1.0,Approved,Giant Bicycles,Standard,medium,large,1765.3,709.48,42226.0


In [43]:
# The columns with nulls are online_order (a 0/1 flag), brand, product_line,
# product_class and product_size (categorical), plus standard_cost and
# product_first_sold_date, which hold numeric values rather than categories.
# Each of these columns is missing under 2% of its values, so every row that
# contains a null is dropped (in place) instead of being imputed.
transactions.dropna(axis = 0, inplace = True)

In [45]:
# Re-count the nulls per column to confirm the drop left none behind
transactions.isnull().sum(axis=0)

transaction_id             0
product_id                 0
customer_id                0
transaction_date           0
online_order               0
order_status               0
brand                      0
product_line               0
product_class              0
product_size               0
list_price                 0
standard_cost              0
product_first_sold_date    0
dtype: int64

#### Great! All null values have been dropped

In [47]:
# Converting online_order to boolean
# An explicit astype(bool) is used instead of replace({0: False, 1: True}):
# replace() only yields a bool column through implicit dtype downcasting,
# which newer pandas releases deprecate, whereas astype states the target
# dtype directly. The column is expected to hold only 0.0 / 1.0 at this
# point -- TODO confirm.
online_flag = transactions['online_order']

# astype(bool) would silently turn NaN into True, so insist the nulls are gone first.
assert online_flag.notna().all(), "online_order still contains nulls; drop or impute them first"

transactions['online_order'] = online_flag.astype(bool)


In [49]:
# Re-inspect the schema after the online_order conversion to check its dtype
transactions.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 19445 entries, 0 to 19999
Data columns (total 13 columns):
 #   Column                   Non-Null Count  Dtype         
---  ------                   --------------  -----         
 0   transaction_id           19445 non-null  int64         
 1   product_id               19445 non-null  int64         
 2   customer_id              19445 non-null  int64         
 3   transaction_date         19445 non-null  datetime64[ns]
 4   online_order             19445 non-null  bool          
 5   order_status             19445 non-null  object        
 6   brand                    19445 non-null  object        
 7   product_line             19445 non-null  object        
 8   product_class            19445 non-null  object        
 9   product_size             19445 non-null  object        
 10  list_price               19445 non-null  float64       
 11  standard_cost            19445 non-null  float64       
 12  product_first_sold_date  19445 n

In [72]:
# product_first_sold_date is stored as a float; cast it to a whole number here.
# NOTE(review): values such as 41245 look like Excel serial day counts -- TODO confirm.
# This cell does not turn them into datetimes; that would need a separate step,
# e.g. pd.to_datetime(..., unit='D', origin='1899-12-30').
sold_date_col = 'product_first_sold_date'
transactions[sold_date_col] = transactions[sold_date_col].astype(int)


In [73]:
# Check the column dtypes again after casting product_first_sold_date
transactions.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 19445 entries, 0 to 19999
Data columns (total 13 columns):
 #   Column                   Non-Null Count  Dtype         
---  ------                   --------------  -----         
 0   transaction_id           19445 non-null  int64         
 1   product_id               19445 non-null  int64         
 2   customer_id              19445 non-null  int64         
 3   transaction_date         19445 non-null  datetime64[ns]
 4   online_order             19445 non-null  bool          
 5   order_status             19445 non-null  object        
 6   brand                    19445 non-null  object        
 7   product_line             19445 non-null  object        
 8   product_class            19445 non-null  object        
 9   product_size             19445 non-null  object        
 10  list_price               19445 non-null  float64       
 11  standard_cost            19445 non-null  float64       
 12  product_first_sold_date  19445 n

In [76]:
# Preview the first rows to see the converted online_order and product_first_sold_date values
transactions.head()

Unnamed: 0,transaction_id,product_id,customer_id,transaction_date,online_order,order_status,brand,product_line,product_class,product_size,list_price,standard_cost,product_first_sold_date
0,1,2,2950,2017-02-25,False,Approved,Solex,Standard,medium,medium,71.49,53.62,41245
1,2,3,3120,2017-05-21,True,Approved,Trek Bicycles,Standard,medium,large,2091.47,388.92,41701
2,3,37,402,2017-10-16,False,Approved,OHM Cycles,Standard,low,medium,1793.43,248.82,36361
3,4,88,3135,2017-08-31,False,Approved,Norco Bicycles,Standard,medium,medium,1198.46,381.1,36145
4,5,78,787,2017-10-01,True,Approved,Giant Bicycles,Standard,medium,large,1765.3,709.48,42226


In [52]:
# Show every row that exactly repeats an earlier row (an empty result means no duplicates)
transactions.loc[transactions.duplicated(keep='first')]

Unnamed: 0,transaction_id,product_id,customer_id,transaction_date,online_order,order_status,brand,product_line,product_class,product_size,list_price,standard_cost,product_first_sold_date


 No duplicates! Easy enough.