In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

### Goal
Perform descriptive analytics to create a customer profile for each CardioGood Fitness treadmill product line.

### Data
The market research team at AdRight collected data on individuals who purchased a treadmill at a CardioGood Fitness retail store during the prior three months. The data are stored in the CardioGoodFitness.csv file.

### Data variables
* Product: product purchased, TM195, TM498, or TM798
* Gender
* Age
* Education
* MaritalStatus: single or partnered
* Income: annual household income ($)
* Usage: average number of times the customer plans to use the treadmill each week
* Miles: average number of miles the customer expects to walk/run each week
* Fitness: self-rated fitness on a 1-to-5 scale, where 1 is poor shape and 5 is excellent shape

### Importing libraries

In [None]:
import numpy as np
import pandas as pd
from pandas.api.types import CategoricalDtype
import matplotlib.pyplot as plt
import seaborn as sns; sns.set_theme()

### Loading data

In [None]:
# Location of the CardioGoodFitness CSV inside the Kaggle input directory
file = '/kaggle/input/cardiogoodfitness/CardioGoodFitness.csv'
# Read the CSV into the DataFrame that the cells below analyse
fitdata = pd.read_csv(file)

### Basic EDA

In [None]:
# Preview the first rows to check that the columns were parsed as expected
fitdata.head()

In [None]:
# (number of rows, number of columns)
fitdata.shape

In [None]:
# Number of missing values in each column
fitdata.isna().sum()

In [None]:
# Column dtypes, non-null counts and memory usage
fitdata.info()

In [None]:
# Summary statistics; include='all' also summarises the non-numeric columns
fitdata.describe(include = 'all')

In [None]:
# Add an ordered IncomeRange column that buckets Income for the later analysis
# (the AgeGroup column is added in the next cell).
# Buckets are right-inclusive: Income <= 30000 is 'Low', (30000, 50000] is
# 'Medium', (50000, 70000] is 'MediumHigh' and anything above 70000 is 'High'.
income_bins = [-np.inf, 30000, 50000, 70000, np.inf]
income_labels = ['Low', 'Medium', 'MediumHigh', 'High']

# ordered category type, so that Low < Medium < MediumHigh < High
income_level = CategoricalDtype(categories=income_labels, ordered=True)

# pd.cut assigns every Income to its bucket in one step. It replaces
# np.select(conditions, values): np.select's implicit default of 0 has no common
# dtype with string choices, which raises a TypeError on recent NumPy releases.
fitdata['IncomeRange'] = pd.cut(
    fitdata.Income, bins=income_bins, labels=income_labels, right=True
).astype(income_level)

In [None]:
# Add an ordered AgeGroup column that buckets Age into decades.
# Buckets are right-inclusive: Age <= 19 is "0-19", (19, 29] is "20-29", ...,
# and anything above 59 is "60-".
age_bins = [-np.inf, 19, 29, 39, 49, 59, np.inf]
age_labels = ["0-19", "20-29", "30-39", "40-49", "50-59", "60-"]

# ordered category type, so that the groups sort from youngest to oldest
age_group = CategoricalDtype(categories=age_labels, ordered=True)

# pd.cut assigns every Age to its bucket in one step. It replaces
# np.select(conditions, values): np.select's implicit default of 0 has no common
# dtype with string choices, which raises a TypeError on recent NumPy releases.
fitdata['AgeGroup'] = pd.cut(
    fitdata.Age, bins=age_bins, labels=age_labels, right=True
).astype(age_group)

# display updated DataFrame
fitdata.head()


### EDA: Visualise Data

#### 1. Overview of distributions

In [None]:
# One histogram per numeric column, drawn together on a 10x10-inch figure
fitdata.hist(figsize=(10,10))

#### 2. Examining each variable

In [None]:
# Gender: number of customers of each gender, one bar per product
sns.countplot(
    data=fitdata,
    x="Gender",
    hue="Product",
)

In [None]:
# Age: number of customers of each product, one horizontal bar per age group
sns.countplot(
    data=fitdata,
    y='Product',
    hue='AgeGroup',
)

In [None]:
# Marital status: number of customers per status, one bar per product
sns.countplot(
    data=fitdata,
    x='MaritalStatus',
    hue='Product',
)

TM195 seems to have a wider range of age groups than the other two products, but more data are needed to evaluate this.

In [None]:
# Education: number of customers of each product, one bar per education value
sns.catplot(
    data=fitdata,
    kind="count",
    x='Product',
    hue='Education',
)

In [None]:
# Average number of miles the customer expects to walk/run each week, for each product
# (bar height is the mean of Miles; this plot is not split by AgeGroup)
sns.barplot(x="Product", y ="Miles", data=fitdata)

In [None]:
# Income summary (box plot) for each product
sns.boxplot(
    x="Product",
    y="Income",
    data=fitdata,
)

In [None]:
# Income distribution of each product, drawn as overlaid step histograms
sns.histplot(
    x="Income",
    hue="Product",
    element="step",
    data=fitdata,
)

TM195 and TM498 are popular among middle-income customers.

#### 3. Overview of correlation between variables

In [None]:
# Matrix analysis: pairwise correlation of the numeric columns.
# Select the numeric columns explicitly before calling corr(): the frame also
# holds text columns (e.g. Product) and the categorical range columns, and
# pandas >= 2.0 raises on those instead of silently dropping them.
matrix = fitdata.select_dtypes(include="number").corr()

# Boolean mask that hides the upper triangle and the diagonal, so each pair is shown once
mask = np.triu(np.ones_like(matrix, dtype=bool))

# Custom diverging palette, used with center=0 in the heatmap below
cmap = sns.diverging_palette(240, 10, s=75, l=40, as_cmap=True, n=9, center="light")

plt.figure(figsize=(9, 6))
sns.heatmap(matrix, mask=mask, center=0, annot=True,
            fmt='.2f', square=True, cmap=cmap)

#### 4. Which group purchased more products?

In [None]:
# Cross-tabulate IncomeRange against Fitness, counting the Product entries in each cell.
# normalize='all' turns the counts into fractions of the grand total, and
# margins=True adds a "Total" row and column.
product_fitness_income_cross = pd.crosstab(
    index=fitdata.IncomeRange,
    columns=fitdata.Fitness,
    values=fitdata.Product,
    aggfunc="count",
    margins=True,
    margins_name="Total",
    normalize='all',
)
sns.heatmap(
    product_fitness_income_cross,
    annot=True,
    fmt='.2f',
    cmap='BuGn',
    cbar=False,
)

# Keep the y tick labels horizontal for visibility
plt.yticks(rotation=0)

#### 5. How do Education and Income relate to the decision of which model is bought?

In [None]:
# Education vs. Income, with colour and marker style both encoding the product
fig, ax = plt.subplots(figsize=(10,8))
sns.set_context("notebook")

# Fixed colour for each product
hue_colors = {"TM195": "#808080",
              "TM498": "#00FF00",
              "TM798": "#FF8C00"}

# y_jitter is deliberately not passed: seaborn documents it as non-functional
# for scatterplot, so it never changed this figure.
sns.scatterplot(ax=ax, x="Education", y="Income", data=fitdata,
                hue="Product", style="Product", palette=hue_colors)

In [None]:
# Age vs. Income, one panel per Education value (three panels per row);
# colour and marker style both encode the product
sns.set_context("notebook")
hue_colors = dict(TM195="#808080", TM498="#00FF00", TM798="#FF8C00")
sns.relplot(
    data=fitdata,
    kind="scatter",
    x="Age",
    y="Income",
    hue="Product",
    style="Product",
    col="Education",
    col_wrap=3,
    palette=hue_colors,
)

### Put it all together: pairplot

Customers of TM195 and TM498 appear to have similar features, with minor differences that need to be examined more closely.
TM798 is more appealing to the highly educated group, who have more disposable income and are more conscious of their fitness.

In [None]:
# Pairwise relationships between four of the numeric variables, coloured by product
sns.pairplot(
    data=fitdata,
    vars=["Income", "Education", "Fitness", "Miles"],
    hue='Product',
    palette='husl',
    plot_kws=dict(alpha=0.5),
)

In [None]:
# Scatter plot using plt, adding jitter to generate random noise.
# If we were using seaborn, we would use x_jitter or y_jitter parameters,
# but matplotlib does not have them. To solve this, the noise is drawn from
# a normal distribution with a given mean and standard deviation.

# Seeded generator, so that "Restart & Run All" reproduces the same figure
rng = np.random.default_rng(42)

# Jitter y axis: one noise value per row of fitdata
# NOTE(review): a standard deviation of 2 is very small next to the Income
# thresholds used earlier in this notebook (30000-70000), so the jitter is
# probably not visible - confirm the intended scale.
income_jitter = fitdata.Income + rng.normal(0, 2, size=len(fitdata))

# Create figure and axis objects
fig, ax = plt.subplots(figsize=(10,10))

# Create a scatterplot (markers only, no connecting line)
ax.plot(fitdata.Age, income_jitter, marker="o", linestyle="", markersize=1.2, alpha=0.9)

# Labeling
ax.set(xlabel="Age", ylabel="Income", title="Age vs Income")

# To zoom in, call ax.axis([xmin, xmax, ymin, ymax]) with the Age limits first
# and the Income limits second.
