# PCA and K-Means Clustering

In this notebook, we'll use PCA to reduce the dimensionality of the data before clustering and (hopefully) get better clusters than by running K-Means on the full feature set alone.

## Imports and housekeeping

In [6]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import RobustScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

import warnings

# Silence every warning category for the remainder of the session
warnings.simplefilter('ignore')

# Show all columns when a DataFrame is displayed (no column truncation)
pd.options.display.max_columns = None

In [7]:
# Load the feature-engineered dataset from disk
DATA_PATH = 'data/feature_engineered_data.csv'
df = pd.read_csv(DATA_PATH)

# Preview the top rows as a quick check that the load worked
df.head()

Unnamed: 0,Income,Recency,MntWines,MntFruits,MntMeatProducts,MntFishProducts,MntSweetProducts,MntGoldProds,NumDealsPurchases,NumWebPurchases,NumCatalogPurchases,NumStorePurchases,NumWebVisitsMonth,Age,EducationOrdinalEncoded,Relationship_InRelationship,Relationship_NotInRelationship,Relationship_Undefined,ChildrenHome,RegencyInMonths,TotalSpent,DealSeeker,BuyerType_CatalogueBuyer,BuyerType_MixedBuyer,BuyerType_StoreBuyer,BuyerType_WebBuyer,VisitedWebsiteRecently,AcceptedOffer
0,58138,58,635,88,546,172,88,88,3,8,10,4,7,57,2,0,1,0,0,32,1617,1,1,0,0,0,1,1
1,46344,38,11,1,6,2,1,6,2,1,1,2,5,60,2,0,1,0,1,5,27,0,0,0,1,0,1,0
2,71613,26,426,49,127,111,21,42,1,8,2,10,4,49,2,1,0,0,0,16,776,0,0,0,1,0,1,0
3,26646,26,11,4,20,10,3,5,2,2,0,4,6,30,2,1,0,0,1,3,53,0,0,0,1,0,1,0
4,58293,94,173,43,118,46,27,15,5,5,3,6,5,33,5,1,0,0,1,11,422,1,0,0,1,0,1,0


In [8]:
# Work on an independent (deep) copy so the loaded frame stays untouched
df_use = df.copy(deep=True)

## Standardise

In [9]:
# Columns treated as continuous: these are the only ones passed through the scaler.
continuous_features = [
    'Income',
    'Recency',
    'MntWines',
    'MntFruits',
    'MntMeatProducts',
    'MntFishProducts',
    'MntSweetProducts',
    'MntGoldProds',
    'NumDealsPurchases',
    'NumWebPurchases',
    'NumCatalogPurchases',
    'NumStorePurchases',
    'NumWebVisitsMonth',
    'Age',
    'RegencyInMonths',
    'TotalSpent',
]

# Split the working copy into the continuous block and the remaining columns
# (everything not listed above), which are carried over unscaled.
# Both halves are taken from df_use so their rows are guaranteed to line up.
continuous_data = df_use[continuous_features]
binary_ord_data = df_use.drop(columns=continuous_features)

# Scale the continuous block with RobustScaler (default settings); the fitted
# scaler is kept so the same transform can be re-applied or inverted later.
scaler_continuous = RobustScaler()
scaled_continuous_data = scaler_continuous.fit_transform(continuous_data)

# Combined feature matrix: scaled continuous columns first, then the unscaled
# remaining columns, in their original order.
final_scaled_data = np.concatenate(
    [scaled_continuous_data, binary_ord_data.to_numpy()],
    axis=1,
)

# Column labels matching final_scaled_data's column order (the ndarray itself
# carries no names), useful when interpreting downstream results.
final_feature_names = continuous_features + list(binary_ord_data.columns)

# Sanity check: every row kept, one output column per labelled feature.
assert final_scaled_data.shape == (len(df_use), len(final_feature_names))

In [10]:
# Display the combined feature matrix as a visual check (NumPy abbreviates the repr of large arrays)
final_scaled_data

array([[ 0.20272663,  0.18      ,  0.96045786, ...,  0.        ,
         1.        ,  1.        ],
       [-0.15738451, -0.22      , -0.33818939, ...,  0.        ,
         1.        ,  0.        ],
       [ 0.61416445, -0.46      ,  0.52549428, ...,  0.        ,
         1.        ,  0.        ],
       ...,
       [ 0.16739947,  0.84      ,  1.52861602, ...,  0.        ,
         1.        ,  1.        ],
       [ 0.54186132, -0.82      ,  0.52965661, ...,  0.        ,
         1.        ,  0.        ],
       [ 0.04184605, -0.18      , -0.18626431, ...,  0.        ,
         1.        ,  1.        ]])

## PCA

In [11]:
# TODO: Research and implement PCA