In [None]:
import numpy as np # linear algebra
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
pd.options.mode.chained_assignment = None  # default='warn'


# Reading Data

In [None]:
# Load the bike-buyers CSV from the Kaggle input directory into a DataFrame
df_bike_buyers = pd.read_csv('/kaggle/input/bike-buyers/bike_buyers.csv')

# Preview the first ten rows to sanity-check the load
print(df_bike_buyers.head(n = 10))

**Generic Function to calculate zero and missing Values**

In [None]:
# Generic function to calculate missing values, zero values
def calcMissingValues(df):
    '''
        Summarise zero and missing values for every column of a dataframe.

        For each column of df this computes the number of entries equal to
        zero, the number of null entries, the percentage of null entries
        relative to the number of rows, and the column dtype. The summary is
        printed and then returned.

        Parameters:
            df : the pandas DataFrame to inspect.

        Returns:
            A DataFrame indexed by the columns of df, with the columns
            'zero_vals', 'missing_vals', '%_missing_vals' and 'data_type'.
    '''
    n_rows = len(df)

    # Per-column tallies: entries equal to zero, and null entries
    zero_counts = df.eq(0.0).astype(int).sum(axis = 0)
    null_counts = df.isna().sum()

    # Share of null entries per column, expressed as a percentage
    null_percent = (null_counts / n_rows) * 100.0

    # Put the three series side by side, labelling each column up front
    summary = pd.concat(
        [zero_counts , null_counts , null_percent],
        axis = 1,
        keys = ['zero_vals' , 'missing_vals' , '%_missing_vals'],
    )
    summary['data_type'] = df.dtypes

    print(summary)

    return summary

In [None]:
# Try out calcMissingValues on the raw data: the function prints the summary
# table and the returned DataFrame is kept in missing_df
missing_df = calcMissingValues(df_bike_buyers)

In [None]:
# Describe the data: print the summary statistics that describe() computes
# for the raw DataFrame
print(df_bike_buyers.describe())

# Data Visualization and EDA

In [None]:
# Plot the number of missing values per column as a horizontal bar chart.
# grid is a boolean option, so pass True rather than the string 'True'.
df_bike_buyers.isnull().sum().plot(kind = 'barh' , figsize = (10 , 8) , grid = True , color = 'orange')
plt.title('Columns with count of missing values')
plt.show()

In [None]:
# Drop every row that contains a missing value (some answers were simply not
# recorded) and remove the ID column as well.
# Both steps are chained so the cleaned frame is built in one expression
# instead of being mutated afterwards with inplace = True.
df_bike_buyers_clean = df_bike_buyers.dropna().drop(columns = 'ID')

# Check the difference of rows dropped
print('Rows dropped: {}'.format(df_bike_buyers.shape[0] - df_bike_buyers_clean.shape[0]))
print('Clean data shape: {}'.format(df_bike_buyers_clean.shape))

# Confirm that no missing values remain in the cleaned frame
clean_df_stat = calcMissingValues(df_bike_buyers_clean)

# We must not delete the zero values as they hold information relevant to the context of the problem
# Here a value of zero for children indicates either the person is single with no kids, or married with no kids


In [None]:
# Relationship between marital status and bike purchase = yes

# Distinct marital-status labels present in the cleaned data
print(df_bike_buyers_clean['Marital Status'].unique())

# Marital status --> buying a bike (Yes)
# Keep only the rows of people who bought a bike; reset_index() moves the old
# row labels into an 'index' column
df_bike_yes = df_bike_buyers_clean.loc[df_bike_buyers_clean['Purchased Bike'] == 'Yes'].copy().reset_index()

# Number of buyers per marital status
marital_bike = df_bike_yes.groupby(['Marital Status'], as_index = False)['Purchased Bike'].count()

marital_bike.plot.bar(x = 'Marital Status' , y = 'Purchased Bike', figsize = (6 , 6) , color = 'red' , grid = True , width = 0.2 , rot = 0)
plt.tight_layout()
plt.show()

From the above plot we can see that more of the people who purchased a bike are single than married (note that these are raw counts of buyers, not purchase rates within each group). So we can consider *Marital Status* to be a key feature to predict bike purchases.

Next, let's look at the gender ratio of bike purchases

In [None]:
# Relationship between gender and bike purchase = yes

# Distinct gender labels present in the cleaned data
print(df_bike_buyers_clean['Gender'].unique())

# Number of buyers per gender
df_gender_bike = df_bike_yes.groupby(['Gender'], as_index = False)['Purchased Bike'].count()

print(df_gender_bike)

df_gender_bike.plot.bar(x = 'Gender' , y = 'Purchased Bike', figsize = (6 , 6) , color = 'blue' , grid = True , width = 0.2 , rot = 0)
plt.legend(loc = 'best')
plt.tight_layout()
plt.show()



There is only a slight difference between the genders when it comes to purchasing a bike, so we can safely consider *Gender* to be another key feature for prediction. Next, we will look at how *Income* is distributed among those who purchased bikes.

In [None]:
# Salary range and mean for the people who purchased a bike
income_yes = df_bike_yes['Income']
print('Max Salary: ${}'.format(income_yes.max()), '  ', 'Min Salary: ${}'.format(income_yes.min()))
print('Mean Salary: ${:.2f}'.format(income_yes.mean()))

# Histogram of buyer income; the dashed vertical line marks the mean
income_yes.hist(bins = 10, color = 'red', alpha = 0.5 , figsize = (10 , 8) , grid = False)
plt.axvline(income_yes.mean(), color = 'k', linestyle = 'dashed', linewidth = 2)
plt.title('Distribution of Income over bike purchases')
plt.show()

The average income is around the $55K mark, and the spread is fairly consistent, with a few values towards the right tail.

In [None]:
# Age range and mean for the people who purchased a bike
age_yes = df_bike_yes['Age']
print('Max Age: {} yrs'.format(age_yes.max()), '  ', 'Min Age: {} yrs'.format(age_yes.min()))
print('Mean Age: {:.0f} yrs'.format(age_yes.mean()))

# Histogram of buyer age; the dashed vertical line marks the mean
age_yes.hist(bins = 10, color = 'red', alpha = 0.5 , figsize = (10 , 8) , grid = False)
plt.axvline(age_yes.mean(), color = 'k', linestyle = 'dashed', linewidth = 2)
plt.title('Distribution of Age over bike purchases')
plt.show()

So people around 43 years old tend to purchase bikes — is maintaining their health or fitness a key reason? Predictably, the older age groups tend to prefer biking less, and the younger age groups (< 20) have fewer bikes.

In [None]:
# Plot distribution of commute distance across purchases

# Number of buyers per commute-distance label
commute_distance = df_bike_yes.groupby(['Commute Distance'], as_index = False)['Purchased Bike'].count()

# groupby() returns the labels in alphabetical order, which for text labels is
# not distance order (a label starting with '10' sorts before one starting
# with '2'). Re-order the rows by the first number found in each label.
# NOTE(review): assumes every label starts with its lower bound, e.g. '0-1 Miles' -
# confirm against the printed values. Labels without a number are placed last.
commute_distance = commute_distance.sort_values(
    by = 'Commute Distance',
    key = lambda labels: labels.str.extract(r'(\d+)', expand = False).astype(float),
    kind = 'mergesort',  # stable, so ties keep their alphabetical order
).reset_index(drop = True)

print(commute_distance)
commute_distance.plot(kind = 'bar' , x = 'Commute Distance' , figsize = (10 , 8), y = 'Purchased Bike', color = 'yellow' , grid = False, rot = 0)
plt.title('Distance leading to bike purchases')
plt.tight_layout()
plt.show()

# Encoding Categorical Variables

Let's look at features which are categorical and investigate ways to encode them.

In [None]:
# Display all categorical variables in the dataset, i.e. the columns that
# pandas stores with the 'object' dtype
print(df_bike_buyers_clean.select_dtypes(include = ['object']))

From the above columns, we see that *Gender*, *Marital Status*, *Home Owner* and *Purchased Bike* (the target variable) have only a few unique values (2 or 3). We can encode them by mapping each value to an integer, which is the easiest approach for this dataset.

In [None]:
# Integer codes for the categorical columns that only have a couple of labels
map_cat_cols = {'Marital Status': {'Married': 1, 'Single': 0},
               'Gender': {'Male': 0 , 'Female': 1},
                'Home Owner': {'Yes': 1, 'No': 0},
               'Purchased Bike': {'Yes': 1, 'No': 0}}

# Replace the categorical values with the mapped integer values. With a nested
# dict, replace() applies each inner mapping to the column named by its key.
# infer_objects() then converts the recoded columns to a numeric dtype
# explicitly: pandas 2.2 deprecates the silent downcasting of object columns
# inside replace(), so without it they may stay 'object'.
df_bike_buyers_clean = df_bike_buyers_clean.replace(map_cat_cols).infer_objects()

# The mapped columns should no longer show up among the object columns
print(df_bike_buyers_clean.select_dtypes(include = ['object']))

print()
print(df_bike_buyers_clean.head(5))

In [None]:
# Unique values of the remaining category columns (pooled into one array)
other_cat_cols = ['Education', 'Occupation', 'Commute Distance', 'Region']
print(pd.concat([df_bike_buyers_clean[col] for col in other_cat_cols]).unique())

We do not need the *Region* column, so it can be dropped. For the rest of the categorical columns, let's apply one-hot encoding using pandas.

In [None]:
# Remove the Region column; the result is stored under a new name
df_bike_clean = df_bike_buyers_clean.drop(columns = 'Region')

print(df_bike_clean.head(5))

# One-hot encode the remaining categorical columns with pd.get_dummies(),
# giving the generated columns a short prefix per source column
df_bike_clean = pd.get_dummies(
    df_bike_clean,
    columns = ['Education', 'Occupation', 'Commute Distance'],
    prefix = {'Education': 'edu', 'Occupation': 'occ', 'Commute Distance': 'dist'},
)

print()
print('After encoding...')
print()
print(df_bike_clean.head(5))

# Correlation Matrix

We will now plot a correlation matrix of the features together with the target variable. To plot a correlation matrix, all the features must have numerical values.

In [None]:
# Correlation of every column with the target, largest value first
print(df_bike_clean.corrwith(df_bike_clean['Purchased Bike']).sort_values(ascending = False))

corr = df_bike_clean.corr()

# Create mask for removing the upper triangle (diagonal included) from the
# correlation heatmap. The builtin bool is used as dtype because the np.bool
# alias was removed in NumPy 1.24 and raises AttributeError there.
mask_upper = np.zeros_like(corr , dtype = bool)
mask_upper[np.triu_indices_from(mask_upper)] = True

cmap = sns.diverging_palette(220, 20, as_cmap=True)

plt.figure(figsize = (14 , 9))
sns.heatmap(corr, mask = mask_upper,  linewidths = 1, cmap = cmap, center = 0)
plt.title("Heatmap of Correlation Matrix")
plt.show()


