In [None]:
# Load the customer records from the CSV file into a DataFrame
import pandas as pd

# File name kept in one place so it is easy to point the notebook at another export
CUSTOMER_CSV = 'customerdata4.csv'
customer_data = pd.read_csv(CUSTOMER_CSV)

In [None]:
# Preview the first five rows to see which columns and value formats we can work with
customer_data.head(n=5)

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) per column.
# include=None is the pandas default: when the frame has numeric columns, only
# those are summarised, so a text column will not show up in this table.
customer_data.describe(include=None)

In [None]:
# Import seaborn for plotting
import seaborn as sns

# Age distribution grouped by gender.
# seaborn returns the Axes it drew on, so the title is set through that object
# instead of matplotlib.pyplot, which this notebook only imports in a later cell.
age_ax = sns.boxplot(data=customer_data, x="Age", y="Gender")
# The trailing ';' keeps the Text repr out of the cell output
age_ax.set_title("Age distribution by gender");

In [None]:
# Annual income distribution grouped by gender.
# The Axes returned by seaborn is used to add a title so the figure can be
# understood on its own when the notebook is skimmed.
income_ax = sns.boxplot(data=customer_data, x="Annual Income (10kkr)", y="Gender")
# The trailing ';' keeps the Text repr out of the cell output
income_ax.set_title("Annual income distribution by gender");

In [None]:
# Spending score distribution grouped by gender.
# The Axes returned by seaborn is used to add a title so the figure can be
# understood on its own when the notebook is skimmed.
score_ax = sns.boxplot(data=customer_data, x="Spending Score (1-100)", y="Gender")
# The trailing ';' keeps the Text repr out of the cell output
score_ax.set_title("Spending score distribution by gender");

In [None]:
# Import matplotlib for plotting and scipy to generate dendrogram
import matplotlib.pyplot as plt
%matplotlib inline
import scipy.cluster.hierarchy as shc

# Create new data sets with only the data that we will analyze
data_age_score = customer_data.iloc[:, [2,4]].values
data_income_score = customer_data.iloc[:, [3,4]].values

# Dendrogram 1 - Age & Spending Score
plt.title("Customer dendogram with Age and Spending Score")
dend = shc.dendrogram(shc.linkage(data_age_score, method='ward'))

In [None]:
# Dendrogram 2 - Income & Spending Score
# Drawn on an explicit Axes so that title and labels are attached to this
# figure and not to whatever pyplot currently considers "active".
fig, ax = plt.subplots()
ax.set_title("Customer dendrogram with Income and Spending Score")
ax.set_xlabel("Customers (row index)")
ax.set_ylabel("Ward linkage distance")
dend = shc.dendrogram(shc.linkage(data_income_score, method='ward'), ax=ax)

In [None]:
# Import sklearn for clustering
from sklearn.cluster import KMeans

# Candidate cluster counts to evaluate (1 through 10)
k_list = list(range(1, 11))

# Sum of squared errors (KMeans.inertia_) recorded for each candidate k
sse = []

for k in k_list:
    # random_state fixes the centroid initialisation so the curve is identical
    # on every "Restart & Run All"; n_init is pinned explicitly because its
    # default differs between scikit-learn releases.
    km_model = KMeans(n_clusters=k, n_init=10, random_state=42)

    # Fit on the Age / Spending Score features and keep the resulting inertia
    km_model.fit(data_age_score)
    sse.append(km_model.inertia_)

# Plot SSE against k; the "elbow" where the curve starts to flatten marks a
# reasonable number of clusters.
plt.plot(k_list, sse, '-o')
plt.title('Find an optimal k-value with the elbow method\nData: Age and Spending Score')
# matplotlib does not interpret Markdown, so *k* would be drawn with literal
# asterisks; mathtext ($k$) gives the intended italic k.
plt.xlabel(r'Number of clusters $k$')
# The trailing ';' keeps the Text repr out of the cell output
plt.ylabel('Sum of squared distance');

In [None]:
# Candidate cluster counts to evaluate (1 through 10)
k_list = list(range(1, 11))

# Sum of squared errors (KMeans.inertia_) recorded for each candidate k
sse = []

for k in k_list:
    # random_state fixes the centroid initialisation so the curve is identical
    # on every "Restart & Run All"; n_init is pinned explicitly because its
    # default differs between scikit-learn releases.
    km_model = KMeans(n_clusters=k, n_init=10, random_state=42)

    # Fit on the Income / Spending Score features and keep the resulting inertia
    km_model.fit(data_income_score)
    sse.append(km_model.inertia_)

# Plot SSE against k; the "elbow" where the curve starts to flatten marks a
# reasonable number of clusters.
plt.plot(k_list, sse, '-o')
plt.title('Find an optimal k-value with the elbow method\nData: Income and Spending Score')
# matplotlib does not interpret Markdown, so *k* would be drawn with literal
# asterisks; mathtext ($k$) gives the intended italic k.
plt.xlabel(r'Number of clusters $k$')
# The trailing ';' keeps the Text repr out of the cell output
plt.ylabel('Sum of squared distance');

In [None]:
# Set the number of clusters
# NOTE(review): presumably read off the Age / Spending Score elbow plot - confirm.
k = 4

# Run clustering on "Age & Spending Score" data set.
# random_state makes the cluster assignment (and therefore the colours in the
# plot) reproducible across runs; n_init is pinned explicitly because its
# default differs between scikit-learn releases.
kmeans = KMeans(n_clusters=k, init='k-means++', n_init=10, random_state=42).fit(data_age_score)
centroids = kmeans.cluster_centers_

# Scatter plot of the clustering result: column 1 (spending score) on the
# x-axis, column 0 (age) on the y-axis, one colour per cluster label.
plt.scatter(data_age_score[:, 1], data_age_score[:, 0], c=kmeans.labels_, s=50, alpha=0.5)
# Cluster centres, using the same column-to-axis mapping as the points
plt.scatter(centroids[:, 1], centroids[:, 0], c='red', s=40, marker='x')

plt.title(f'K-means clusters (k={k}): Age and Spending Score')
plt.ylabel('Age')
plt.xlabel('Spending Score (1-100)')

plt.show()

In [None]:
# Set the number of clusters k
# NOTE(review): presumably read off the Income / Spending Score elbow plot - confirm.
k = 5

# Run clustering on "Income & Spending Score" data set.
# random_state makes the cluster assignment (and therefore the colours in the
# plot) reproducible across runs; n_init is pinned explicitly because its
# default differs between scikit-learn releases.
kmeans = KMeans(n_clusters=k, init='k-means++', n_init=10, random_state=42).fit(data_income_score)
centroids = kmeans.cluster_centers_

# Scatter plot of the clustering result: column 1 (spending score) on the
# x-axis, column 0 (annual income) on the y-axis, one colour per cluster label.
plt.scatter(data_income_score[:, 1], data_income_score[:, 0], c=kmeans.labels_, s=50, alpha=0.5)
# Cluster centres, using the same column-to-axis mapping as the points
plt.scatter(centroids[:, 1], centroids[:, 0], c='red', s=40, marker='x')

# Title and labels for the two axes
plt.title(f'K-means clusters (k={k}): Income and Spending Score')
plt.ylabel('Annual Income (10kkr)')
plt.xlabel('Spending Score (1-100)')

plt.show()