In [1]:
# Importing packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns

In [2]:
# Lift pandas' display limits so DataFrames are shown with every row and every column
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [None]:
# Loading dataset: read the four worksheets with a single read_excel call so the
# workbook is opened and parsed once instead of four times.
DATA_FILE = 'KPMG_VI_New_raw_data_update_final.xlsx'

# Passing a list of sheet names returns a dict of DataFrames keyed by sheet name.
# header=1 takes the column labels from the second spreadsheet row (0-indexed row 1).
sheets = pd.read_excel(
    DATA_FILE,
    sheet_name=['Transactions', 'CustomerDemographic', 'CustomerAddress', 'NewCustomerList'],
    header=1,
)

transaction = sheets['Transactions']
demographic = sheets['CustomerDemographic']
address = sheets['CustomerAddress']
New_customer = sheets['NewCustomerList']

In [None]:
# Preview the first rows of the 'Transactions' sheet
transaction.head()

In [None]:
# Preview the first rows of the 'CustomerDemographic' sheet
demographic.head()

In [None]:
# Preview the first rows of the 'CustomerAddress' sheet
address.head()

In [None]:
# Preview the first rows of the 'NewCustomerList' sheet
New_customer.head()

## Merging all 3 datasets into one

In [None]:
# Inner-join demographic -> transaction -> address on customer_id, so only
# customers that appear in all three tables are kept.
df = (
    demographic
    .merge(transaction, how='inner', on='customer_id')
    .merge(address, how='inner', on='customer_id')
)
df.head()

## Independent and dependent variables

In [None]:
# Target variables: columns of the merged frame that are absent from the
# NewCustomerList sheet (order of df.columns is preserved).
is_target = ~df.columns.isin(New_customer.columns)
target_columns = df.columns[is_target].tolist()
target_columns

In [None]:
# Independent variables: every merged column that was not classified as a target
is_independent = ~df.columns.isin(target_columns)
independent_attribute = df.columns[is_independent].tolist()
independent_attribute

## Missing values imputation

In [None]:
# Share of missing entries per independent variable, expressed as a percentage
df[independent_attribute].isna().mean().mul(100)

In [None]:
# Independent variables that contain at least one missing entry
missing_values = list(
    filter(lambda column: df[column].isna().any(), independent_attribute)
)
missing_values

The missing values in the above variables can be imputed, as they are independent variables.   
Let's look at the correlation among the independent variables that contain missing values.

In [None]:
# Correlation heatmap of the independent variables.
# Only numeric/boolean columns are correlated: since pandas 2.0, DataFrame.corr()
# no longer drops string columns silently and raises on them instead.
numeric_independent = df[independent_attribute].select_dtypes(include=['number', 'bool'])
sns.heatmap(numeric_independent.corr(), annot=True)

We were not able to find any correlation between the tenure column and the other independent columns.  
Let's extract an age feature from the DOB column to look for a possible relationship.

In [None]:
# Derive age from the date of birth, then redraw the correlation heatmap with age included
REFERENCE_YEAR = 2020  # year against which ages are computed

df['birth_year'] = pd.DatetimeIndex(df.DOB).year
df['age'] = REFERENCE_YEAR - df['birth_year']

# Guarded append: re-running this cell must not list 'age' twice
if 'age' not in independent_attribute:
    independent_attribute.append('age')

# Correlate numeric/boolean columns only; DataFrame.corr() raises on string
# columns since pandas 2.0 instead of dropping them silently.
numeric_independent = df[independent_attribute].select_dtypes(include=['number', 'bool'])
sns.heatmap(numeric_independent.corr(), annot=True)

We can see some relation between age and tenure

In [None]:
# Age vs. tenure. x/y are passed by keyword: seaborn >= 0.12 no longer accepts
# them as positional arguments.
sns.scatterplot(x='age', y='tenure', data=df)

In [None]:
# Count rows with a missing tenure, a missing age, and both missing at once
tenure_missing = df['tenure'].isna()
age_missing = df['age'].isna()

print('No of records missing in tenure', tenure_missing.sum())
print('No of records missing in age', age_missing.sum())
print('No of records missing in both age and tenure', (tenure_missing & age_missing).sum())

In [None]:
# replacing missing values in numeric columns (0 by default)
def fill_num(data, list, value=0):
    """Fill missing values in the given columns of ``data``, modifying it in place.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to modify; each listed column is overwritten with its filled version.
    list : iterable of str
        Labels of the columns to fill. The name shadows the built-in ``list``
        inside this function; it is kept so existing callers keep working.
    value : scalar, default 0
        Replacement for missing entries. The default keeps the original
        behaviour of filling with 0.

    Returns
    -------
    None
        The frame is updated in place.
    """
    for var in list:
        data[var] = data[var].fillna(value)
        
# Impute tenure and age in place.
# NOTE(review): 0 acts as a placeholder, so zeros in these columns may be imputed
# rather than observed values — keep this in mind for later statistics and plots.
fill_num(df, ['tenure', 'age'])

Since the percentage of missing data is high in the job_title and job_industry_category columns, we replace the missing values with an explicit 'missing' label so that the information carried by the missingness is preserved.

In [None]:
# replacing missing values in categorical columns ('missing' label by default)
def fill_cat(data, list, value='missing'):
    """Fill missing values in the given columns of ``data``, modifying it in place.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to modify; each listed column is overwritten with its filled version.
    list : iterable of str
        Labels of the columns to fill. The name shadows the built-in ``list``
        inside this function; it is kept so existing callers keep working.
    value : scalar, default 'missing'
        Label written in place of missing entries. The default keeps the
        original behaviour of filling with 'missing'.

    Returns
    -------
    None
        The frame is updated in place.
    """
    for var in list:
        data[var] = data[var].fillna(value)
        
# Label missing job_title / job_industry_category entries as 'missing' (modifies df in place)
fill_cat(df, ['job_title', 'job_industry_category'])

In [None]:
# Share of missing entries per target variable, expressed as a percentage
df[target_columns].isna().mean().mul(100)

Before deleting the records that have missing values, we have to drop the `default` column because:  
1. Its values are unexplainable.
2. It contains 7.27 percent missing values, which is greater than the percentage of missing data in the DOB column.

In [None]:
# Drop the 'default' column and trim the target list.
# Guarded on the column's presence so the cell is safe to re-run: the unconditional
# version raised KeyError on a second run, and the slice discarded two more target
# columns each time it was executed.
if 'default' in df.columns:
    df = df.drop(columns='default')
    # NOTE(review): removes the first two entries of target_columns by position —
    # presumably this takes 'default' out of the list; confirm against the list
    # displayed when target_columns was built.
    target_columns = target_columns[2:]
df.columns

In [None]:
# Dropping records that have a missing value in any target variable,
# then checking per column that no missing values remain
df = df.dropna(subset=target_columns)
df[target_columns].isna().any()

## Feature engineering some of the target columns

In [None]:
# Discard the first two remaining target columns (selected by position, not by name).
# NOTE(review): re-running this cell removes two more entries each time — it is only
# correct when executed exactly once per kernel session.
target_columns = target_columns[2:]
target_columns

In [None]:
# Split transaction_date into year / month / weekday features.
# The DatetimeIndex is built once and reused for all three parts.
transaction_dates = pd.DatetimeIndex(df.transaction_date)
df['year'] = transaction_dates.year
df['month'] = transaction_dates.month
df['weekday'] = transaction_dates.weekday

# Register the new features as target columns; the membership check keeps the
# list free of duplicates when this cell is re-run.
for var in ['year', 'month', 'weekday']:
    if var not in target_columns:
        target_columns.append(var)

df[target_columns].head()

In [None]:
# List the distinct values of each extracted date part
date_parts = ('year', 'month', 'weekday')
for var in date_parts:
    distinct_values = df[var].unique()
    print(var, distinct_values)

## Independent Numeric variables

In [None]:
# Independent variables whose dtype is anything other than object
num_indp_var = []
for column in independent_attribute:
    if df[column].dtypes != 'object':
        num_indp_var.append(column)
num_indp_var

In [None]:
# Distribution of numeric variables
# sns.distplot is deprecated since seaborn 0.11; histplot with a KDE overlay on a
# density scale is its closest replacement.
for var in ['past_3_years_bike_related_purchases', 'tenure', 'property_valuation', 'age']:
    fig, ax = plt.subplots()
    sns.histplot(df[var], bins=30, kde=True, stat='density', ax=ax)
    ax.set_title(var)
    plt.show()

* The property_valuation distribution is slightly negatively skewed
* We have an outlier in the age column, and its distribution is positively skewed

In [None]:
# Boxplot of numeric variables, one figure per column
numeric_columns = ['past_3_years_bike_related_purchases', 'tenure', 'property_valuation', 'age']
for var in numeric_columns:
    fig, ax = plt.subplots()
    sns.boxplot(y=df[var], ax=ax)
    ax.set_title(var)
    plt.show()

As expected, the age column is showing an outlier.  
We need to deal with this outlier.

In [None]:
# Show the rows whose computed age exceeds 100
df[df.age > 100]

In [None]:
# Mean age within each property_valuation group
df.groupby('property_valuation')['age'].mean()

In [None]:
# Rounded mean age of the property_valuation == 8 group, kept as the replacement value for the outlier.
# NOTE(review): 8 is presumably the property_valuation of the outlier record — confirm against the outlier rows.
# NOTE(review): the mean is taken over every row in the group, which may include the outlier
# itself and ages that were imputed as 0 — confirm this is intended.
age = df.groupby('property_valuation')['age'].mean()[8].round()
age

In [None]:
# Overwrite the outlying age (177) with the group mean computed above, then redraw the boxplot
is_outlier = df['age'] == 177
df['age'] = np.where(is_outlier, age, df['age'])

fig, ax = plt.subplots()
sns.boxplot(y=df['age'], ax=ax)
ax.set_title('age')
plt.show()

In [None]:
# Display the current list of target columns
target_columns

## Independent categorical variables

In [None]:
# Independent variables stored with the object dtype (treated as categorical)
cat_indp_var = []
for column in independent_attribute:
    if df[column].dtypes == 'object':
        cat_indp_var.append(column)
cat_indp_var