In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from matplotlib import style
# Apply matplotlib's "ggplot" style sheet to the figures drawn by the cells below.
style.use("ggplot")

In [None]:
# Load the mall-customer CSV into `df`.
# NOTE(review): the relative path presumably targets a Kaggle input mount — confirm before running elsewhere.
df=pd.read_csv('../input/customer-segmentation-tutorial-in-python/Mall_Customers.csv')

In [None]:
# Preview the first rows of the freshly loaded frame (original column names).
df.head()

In [None]:
# Replace the verbose CSV headers with the short lowercase names used by every later cell.
column_aliases = {
    'CustomerID': 'id',
    'Gender': 'gender',
    'Age': 'age',
    'Annual Income (k$)': 'income',
    'Spending Score (1-100)': 'score',
}
df.rename(columns=column_aliases, inplace=True)

In [None]:
# Show a single row to check the renamed column headers.
df.head(1)

In [None]:
# Label-based selection of gender and age; .loc slices include the end label,
# so this returns the rows labelled 0, 1 and 2.
df.loc[0:2,['gender','age']]

In [None]:
# Number of non-null entries in the gender and age columns.
df[['gender','age']].count()

In [None]:
df['age'].mean()  # Arithmetic mean of customer age

In [None]:
df['age'].median()  # Median customer age

In [None]:
# Most frequent age value(s); mode() returns a Series because several values can tie.
df['age'].mode()

In [None]:
df['age'].min()  # Youngest customer age

In [None]:
df['age'].max()  # Oldest customer age

In [None]:
# The two youngest customers: order by age (ascending is the default) and keep the top two rows.
df.sort_values(by='age').head(2)

In [None]:
# Percentage of customers per gender.
# The denominator is len(df) rather than a hard-coded 200, so the percentages stay
# correct if the dataset is filtered or replaced by one with a different row count.
# Only the 'id' column is counted instead of counting every column and discarding the rest.
df.groupby('gender')['id'].count()/len(df)*100 #% of Female and Male customers

In [None]:
# Use the customer id as the row index.
# Guarded so the cell can be re-run: once 'id' has become the index it is no longer
# a column, and an unguarded set_index('id') would raise KeyError.
if 'id' in df.columns:
    df.set_index('id',inplace=True)

In [None]:
# Histogram of customer ages with automatically chosen bins.
# The former label=True was dropped: `label` is the legend text for the series,
# not an on/off flag, so it only produced a legend entry literally named "True".
plt.hist(df['age'],color='red', bins='auto', rwidth=0.5, histtype='bar')
plt.xlabel('Age')
plt.ylabel('Count');  # trailing ';' hides the Text(...) repr under the figure

In [None]:
# Histogram of annual income with automatically chosen bins.
# The former label=True was dropped: `label` is the legend text for the series,
# not an on/off flag, so it only produced a legend entry literally named "True".
plt.hist(df['income'],color='green', bins='auto', rwidth=0.5, histtype='bar')
plt.xlabel('Income ($K)')
plt.ylabel('Count');  # trailing ';' hides the Text(...) repr under the figure

In [None]:
# Correlation between income and spending score (Series.corr uses Pearson unless told otherwise).
df['income'].corr(df['score']) #Correlation between income and the score is pretty low

In [None]:
# Income vs. spending score, drawn as one semi-transparent series per gender.
for gender_name, colour in (('Female', 'red'), ('Male', 'blue')):
    subset = df[df['gender'] == gender_name]
    plt.scatter(subset['income'], subset['score'], c=colour, alpha=0.5, label=gender_name)
plt.legend(shadow=True, facecolor='white', edgecolor='black')
plt.xlabel('Income ($K)')
plt.ylabel('Purchasing Score')

In [None]:
# Age vs. spending score, drawn as one semi-transparent series per gender.
for gender_name, colour in (('Female', 'red'), ('Male', 'blue')):
    subset = df[df['gender'] == gender_name]
    plt.scatter(subset['age'], subset['score'], c=colour, alpha=0.5, label=gender_name)
plt.legend(shadow=True, facecolor='white', edgecolor='black')
plt.xlabel('Age')
plt.ylabel('Purchasing Score')

In [None]:
# Quick look at the first two rows before building the clustering inputs.
df.head(2)

In [None]:
# Feature matrix for the first clustering: income and spending score only.
X=df[['income','score']]

In [None]:
# Elbow method on the income/score features: fit KMeans for k = 1..19 and record
# the within-cluster sum of squares (inertia_) of each fit.
k_values=range(1,20)  # single source of truth for both the loop and the x-axis
wcss=[]
for i in k_values:
    # n_init is pinned because its default is version-dependent in scikit-learn
    # (10 in older releases, 'auto' from 1.4), which otherwise triggers a
    # FutureWarning or silently changes the fitted result between environments.
    kmeans=KMeans(n_clusters=i,init='k-means++',n_init=10,random_state=1)
    kmeans.fit(X)
    wcss.append(kmeans.inertia_)

plt.plot(k_values,wcss)
plt.xlabel('Number of clusters (k)')
plt.ylabel('WCSS (inertia)');

In [None]:
# Final 5-cluster model on income/score; fit_predict returns one cluster label per row of X.
# NOTE(review): k=5 is presumably read off the elbow plot — confirm.
# n_init is pinned because its scikit-learn default is version-dependent
# (10 in older releases, 'auto' from 1.4), so labels stay comparable across environments.
kmeans=KMeans(n_clusters=5,init='k-means++',n_init=10,random_state=1)
y=kmeans.fit_predict(X)

In [None]:
# Attach the income/score cluster label of each row to the frame.
df['income_clusters']=y

In [None]:
# Income vs. score, one scatter series per income/score cluster label (0..4).
for cluster_id in range(5):
    members = df.loc[df['income_clusters'] == cluster_id]
    plt.scatter(members['income'], members['score'], label=f'Cluster {cluster_id}')
plt.xlabel('Income ($K)')
plt.ylabel('Score')
plt.legend(facecolor='white', shadow=True)

In [None]:
# Feature matrix for the second clustering: age and spending score only.
X1=df[['age','score']]

In [None]:
# Elbow method on the age/score features: fit KMeans for k = 1..19 and record
# the within-cluster sum of squares (inertia_) of each fit.
k_values=range(1,20)  # single source of truth for both the loop and the x-axis
wcss=[]
for i in k_values:
    # n_init is pinned because its default is version-dependent in scikit-learn
    # (10 in older releases, 'auto' from 1.4), which otherwise triggers a
    # FutureWarning or silently changes the fitted result between environments.
    kmeans=KMeans(n_clusters=i,init='k-means++',n_init=10,random_state=1)
    kmeans.fit(X1)
    wcss.append(kmeans.inertia_)

plt.plot(k_values,wcss)
plt.xlabel('Number of clusters (k)')
plt.ylabel('WCSS (inertia)');

In [None]:
# Final 4-cluster model on age/score; fit_predict returns one cluster label per row of X1.
# NOTE(review): k=4 is presumably read off the elbow plot — confirm.
# n_init is pinned because its scikit-learn default is version-dependent
# (10 in older releases, 'auto' from 1.4), so labels stay comparable across environments.
kmeans=KMeans(n_clusters=4,init='k-means++',n_init=10,random_state=1)
y=kmeans.fit_predict(X1)

In [None]:
# Attach the age/score cluster label of each row to the frame.
df['age_clusters']=y

In [None]:
# Age vs. score, one scatter series per age/score cluster label (0..3).
for cluster_id in range(4):
    members = df.loc[df['age_clusters'] == cluster_id]
    plt.scatter(members['age'], members['score'], label=f'Cluster {cluster_id}')
plt.xlabel('Age')
plt.ylabel('Score')
plt.legend(facecolor='white', shadow=True)

In [None]:
# Feature matrix for the third clustering: age, income and spending score together.
# NOTE(review): the three columns are used unscaled; confirm that is intended for a distance-based method.
X2=df[['age','income','score']]

In [None]:
# Elbow method on the age/income/score features: fit KMeans for k = 1..19 and
# record the within-cluster sum of squares (inertia_) of each fit.
k_values=range(1,20)  # single source of truth for both the loop and the x-axis
wcss=[]
for i in k_values:
    # n_init is pinned because its default is version-dependent in scikit-learn
    # (10 in older releases, 'auto' from 1.4), which otherwise triggers a
    # FutureWarning or silently changes the fitted result between environments.
    kmeans=KMeans(n_clusters=i,init='k-means++',n_init=10,random_state=1)
    kmeans.fit(X2)
    wcss.append(kmeans.inertia_)

plt.plot(k_values,wcss)
plt.xlabel('Number of clusters (k)')
plt.ylabel('WCSS (inertia)');

In [None]:
# Final 6-cluster model on age/income/score; fit_predict returns one cluster label per row of X2.
# NOTE(review): k=6 is presumably read off the elbow plot — confirm.
# n_init is pinned because its scikit-learn default is version-dependent
# (10 in older releases, 'auto' from 1.4), so labels stay comparable across environments.
kmeans=KMeans(n_clusters=6,init='k-means++',n_init=10,random_state=1)
y=kmeans.fit_predict(X2)

In [None]:
# Attach the age/income/score cluster label of each row to the frame.
df['combined_clusters']=y

In [None]:
# Age vs. income, one scatter series per combined (3-feature) cluster label (0..5).
for cluster_id in range(6):
    members = df.loc[df['combined_clusters'] == cluster_id]
    plt.scatter(members['age'], members['income'], label=f'Cluster {cluster_id}')
plt.legend(facecolor='white', edgecolor='black', shadow=True)
plt.xlabel('Age')
plt.ylabel('Income')

In [None]:
# Per-cluster maximum of every remaining column, grouped by the 3-feature cluster label.
# NOTE(review): this appears to also take max() over 'gender' and the other cluster-label
# columns, where a maximum is not meaningful — consider restricting to age/income/score.
df.groupby('combined_clusters').max()

In [None]:
# NOTE(review): the cluster numbers in this message are hard-coded; KMeans label numbering
# may differ between library versions or settings — confirm against the table before trusting it.
print('From the above Table it is clear that Cluster 1 and Cluster 5 are the most valuable. The Age vs Income distribution of the same is given in the above scatter plot')

In [None]:
# Object-oriented matplotlib version: age vs. income for all customers,
# with the marker colour encoding the spending score (red = low, green = high).
fig, ax = plt.subplots(figsize=(10, 5))
ages, incomes, scores = df['age'], df['income'], df['score']
sctr = ax.scatter(ages, incomes, c=scores, cmap='RdYlGn')
fig.colorbar(sctr, ax=ax)
ax.set_xlabel('Age')
ax.set_ylabel('Income')

In [None]:
# Age vs. income coloured by spending score, one panel per gender (Female left, Male right).
fig, [ax1,ax2] =plt.subplots(1,2,figsize=(16,5))

# Identical axis limits on both panels so they can be compared side by side.
xlim=(10,df['age'].max()+1)
ylim=(0,df['income'].max()+5)

# Filter each gender once instead of re-evaluating the same mask for every argument.
female=df[df['gender']=='Female']
male=df[df['gender']=='Male']

#For Female:
sctr1=ax1.scatter(female['age'],female['income'],c=female['score'],cmap='RdYlGn')
# The colourbar is built from this panel's own scatter (sctr1). It previously used
# `sctr`, the scatter of a different cell's figure, so the scale shown did not
# belong to the Female data and the cell failed unless that other cell had run.
plt.colorbar(sctr1,ax=ax1)
ax1.set_xlabel('Age')
ax1.set_ylabel('Income')
ax1.set_title('Female Population')
ax1.set(xlim=xlim,ylim=ylim)

#For Male:
sctr2=ax2.scatter(male['age'],male['income'],c=male['score'],cmap='RdYlGn')
plt.colorbar(sctr2,ax=ax2)
ax2.set_xlabel('Age')
ax2.set_ylabel('Income')
ax2.set_title('Male Population')
ax2.set(xlim=xlim,ylim=ylim)

print('The Scale on the right represents the Spending Score')