# Importing necessary libraries

In [1]:
import pandas as pd
import numpy as np

# Loading dataset

## Transactions Data

In [7]:
# Path to the KPMG workbook, relative to the notebook's working directory.
# A forward slash is used instead of ".\": "\K" is an invalid escape sequence
# in a regular string literal, and a backslash is only a path separator on
# Windows, whereas "./" resolves correctly on Windows, macOS and Linux.
filepath = "./KPMG_VI_New_raw_data_update_final.xlsx"

# header=1 makes pandas take the column labels from the sheet's second row
# (row index 1).
tran_df = pd.read_excel(filepath, sheet_name="Transactions", header=1)

# (rows, columns) of the loaded sheet.
tran_df.shape

(20000, 13)

In [8]:
# Preview the first five transactions to sanity-check the parsed columns.
tran_df.head(n=5)

Unnamed: 0,transaction_id,product_id,customer_id,transaction_date,online_order,order_status,brand,product_line,product_class,product_size,list_price,standard_cost,product_first_sold_date
0,1,2,2950,2017-02-25,0.0,Approved,Solex,Standard,medium,medium,71.49,53.62,41245.0
1,2,3,3120,2017-05-21,1.0,Approved,Trek Bicycles,Standard,medium,large,2091.47,388.92,41701.0
2,3,37,402,2017-10-16,0.0,Approved,OHM Cycles,Standard,low,medium,1793.43,248.82,36361.0
3,4,88,3135,2017-08-31,0.0,Approved,Norco Bicycles,Standard,medium,medium,1198.46,381.1,36145.0
4,5,78,787,2017-10-01,1.0,Approved,Giant Bicycles,Standard,medium,large,1765.3,709.48,42226.0


In [9]:
# Summarise the transactions frame: index range, per-column non-null counts
# and dtypes. Used here to spot missing values and mistyped columns.
tran_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 13 columns):
 #   Column                   Non-Null Count  Dtype         
---  ------                   --------------  -----         
 0   transaction_id           20000 non-null  int64         
 1   product_id               20000 non-null  int64         
 2   customer_id              20000 non-null  int64         
 3   transaction_date         20000 non-null  datetime64[ns]
 4   online_order             19640 non-null  float64       
 5   order_status             20000 non-null  object        
 6   brand                    19803 non-null  object        
 7   product_line             19803 non-null  object        
 8   product_class            19803 non-null  object        
 9   product_size             19803 non-null  object        
 10  list_price               20000 non-null  float64       
 11  standard_cost            19803 non-null  float64       
 12  product_first_sold_date  19803 n

The following issues are evident:
* Missing data in columns:
    online_order, brand, product_line, product_class, product_size, standard_cost, and product_first_sold_date.
* The product_first_sold_date column has the wrong data type: it is stored as a number (e.g. 41245.0, which looks like an Excel serial date) instead of a datetime.

In [10]:
# Count how many transactions fall under each order status
# (dropna=True is the default: missing values are left out of the counts).
tran_df["order_status"].value_counts(dropna=True)

Approved     19821
Cancelled      179
Name: order_status, dtype: int64

In [11]:
# Number of distinct customers that appear in the transactions
# (dropna=True is the default: missing ids are not counted).
tran_df["customer_id"].nunique(dropna=True)

3494

In [14]:
# Transactions per brand. Rows with a missing brand are excluded
# (dropna=True is the default), so the counts sum to the non-null total.
tran_df["brand"].value_counts(dropna=True)

Solex             4253
Giant Bicycles    3312
WeareA2B          3295
OHM Cycles        3043
Trek Bicycles     2990
Norco Bicycles    2910
Name: brand, dtype: int64

In [15]:
# Transactions per product line; missing values are excluded
# (dropna=True is the default).
tran_df["product_line"].value_counts(dropna=True)

Standard    14176
Road         3970
Touring      1234
Mountain      423
Name: product_line, dtype: int64

In [16]:
# Transactions per product class; missing values are excluded
# (dropna=True is the default).
tran_df["product_class"].value_counts(dropna=True)

medium    13826
high       3013
low        2964
Name: product_class, dtype: int64

In [17]:
# Transactions per product size; missing values are excluded
# (dropna=True is the default).
tran_df["product_size"].value_counts(dropna=True)

medium    12990
large      3976
small      2837
Name: product_size, dtype: int64

The labels chosen for product_class (low / medium / high) and product_size (small / medium / large) overlap on "medium", which makes the two columns easy to confuse.

Product-specific data should be in its own table.

## Customer Addresses

In [13]:
# Read the "CustomerAddress" sheet; header=1 takes the column labels from the
# sheet's second row (row index 1).
add_df = pd.read_excel(
    io=filepath,
    sheet_name="CustomerAddress",
    header=1,
)

# Preview the first five address records.
add_df.head(n=5)

Unnamed: 0,customer_id,address,postcode,state,country,property_valuation
0,1,060 Morning Avenue,2016,New South Wales,Australia,10
1,2,6 Meadow Vale Court,2153,New South Wales,Australia,10
2,4,0 Holy Cross Court,4211,QLD,Australia,9
3,5,17979 Del Mar Point,2448,New South Wales,Australia,4
4,6,9 Oakridge Court,3216,VIC,Australia,9


In [18]:
# Summarise the address frame: index range, per-column non-null counts and
# dtypes. Used here to check for missing values.
add_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3999 entries, 0 to 3998
Data columns (total 6 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   customer_id         3999 non-null   int64 
 1   address             3999 non-null   object
 2   postcode            3999 non-null   int64 
 3   state               3999 non-null   object
 4   country             3999 non-null   object
 5   property_valuation  3999 non-null   int64 
dtypes: int64(3), object(3)
memory usage: 187.6+ KB


No missing data for addresses

In [21]:
# Inspect ten randomly chosen address rows. A fixed random_state makes the
# sample reproducible, so re-running the notebook shows the same rows instead
# of a different draw each time. The seed value itself is arbitrary.
add_df.sample(n=10, random_state=42)

Unnamed: 0,customer_id,address,postcode,state,country,property_valuation
709,714,51 Londonderry Trail,2153,NSW,Australia,10
2581,2586,9106 Thompson Hill,2032,NSW,Australia,8
3225,3230,864 Scoville Plaza,3029,VIC,Australia,5
3389,3394,06 Northwestern Center,2019,NSW,Australia,10
414,419,7526 Artisan Parkway,4223,QLD,Australia,8
3540,3545,005 Bunker Hill Lane,3046,VIC,Australia,9
3822,3827,9240 Green Avenue,3810,VIC,Australia,5
2340,2345,4 Huxley Alley,4070,QLD,Australia,8
812,817,87 Twin Pines Plaza,3976,VIC,Australia,5
1010,1015,738 Division Street,3024,VIC,Australia,8


The state column is inconsistent: some values are written in full (e.g. "New South Wales") while others are abbreviations (e.g. "NSW", "QLD", "VIC").

## Customer Demographic

In [25]:
# Read the "CustomerDemographic" sheet; header=1 takes the column labels from
# the sheet's second row (row index 1).
demo_df = pd.read_excel(
    io=filepath,
    sheet_name="CustomerDemographic",
    header=1,
)

# Preview the first five customer records.
demo_df.head(n=5)

  demo_df = pd.read_excel(filepath, sheet_name="CustomerDemographic", header=1)


Unnamed: 0,customer_id,first_name,last_name,gender,past_3_years_bike_related_purchases,DOB,job_title,job_industry_category,wealth_segment,deceased_indicator,default,owns_car,tenure
0,1,Laraine,Medendorp,F,93,1953-10-12,Executive Secretary,Health,Mass Customer,N,"""'",Yes,11.0
1,2,Eli,Bockman,Male,81,1980-12-16,Administrative Officer,Financial Services,Mass Customer,N,<script>alert('hi')</script>,Yes,16.0
2,3,Arlin,Dearle,Male,61,1954-01-20,Recruiting Manager,Property,Mass Customer,N,2018-02-01 00:00:00,Yes,15.0
3,4,Talbot,,Male,33,1961-10-03,,IT,Mass Customer,N,() { _; } >_[$($())] { touch /tmp/blns.shellsh...,No,7.0
4,5,Sheila-kathryn,Calton,Female,56,1977-05-13,Senior Editor,,Affluent Customer,N,NIL,Yes,8.0


In [26]:
# Summarise the demographic frame: index range, per-column non-null counts and
# dtypes. Used here to spot columns with missing values.
demo_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4000 entries, 0 to 3999
Data columns (total 13 columns):
 #   Column                               Non-Null Count  Dtype         
---  ------                               --------------  -----         
 0   customer_id                          4000 non-null   int64         
 1   first_name                           4000 non-null   object        
 2   last_name                            3875 non-null   object        
 3   gender                               4000 non-null   object        
 4   past_3_years_bike_related_purchases  4000 non-null   int64         
 5   DOB                                  3913 non-null   datetime64[ns]
 6   job_title                            3494 non-null   object        
 7   job_industry_category                3344 non-null   object        
 8   wealth_segment                       4000 non-null   object        
 9   deceased_indicator                   4000 non-null   object        
 10  default     

Missing data in columns: last_name, DOB, job_title, job_industry_category, default, and tenure.