**General Information Of Data**
* The customer segments data set consists of 440 data points collected from clients of a wholesale distributor in Lisbon, Portugal.

 **Features**
* Fresh: annual spending (m.u.) on fresh products (Continuous);
* Milk: annual spending (m.u.) on milk products (Continuous);
* Grocery: annual spending (m.u.) on grocery products (Continuous);
* Frozen: annual spending (m.u.) on frozen products (Continuous);
* Detergents_Paper: annual spending (m.u.) on detergents and paper products (Continuous);
* Delicatessen: annual spending (m.u.) on delicatessen products (Continuous);
* Channel: {Hotel/Restaurant/Cafe - 1, Retail - 2} (Nominal)
* Region: {Lisbon - 1, Oporto - 2, or Other - 3} (Nominal)

In [None]:
#Important libraries
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
# Pretty display for notebooks
%matplotlib inline
import seaborn as sns
from sklearn.preprocessing import StandardScaler

In [None]:
# Load the customers CSV (comma-separated, UTF-8 encoded) into a DataFrame
csv_path = '../input/customer-segmentaion/customers.csv'
data = pd.read_csv(csv_path, sep=',', encoding='utf8')

In [None]:
# Show the first 5 rows of the raw data
data.head()

In [None]:
# Print column names, dtypes and non-null counts of the raw data
data.info()

All columns have an integer data type; the data contains 440 rows and 8 columns (features).

In [None]:
# Preview the raw data again
data.head()

In [None]:
# Total number of missing values over all columns (per-column counts summed)
data.isnull().sum().sum()
# Author's note on the result: the data contains no nulls

# Exploratory Data Analysis

In [None]:
# Summary statistics of the raw data, shaded with a purple gradient
data.describe().style.background_gradient(cmap='Purples')

#-The standard deviation differs from feature to feature, so the features vary on different scales

#-The mean is larger than the median for every feature, so the distributions are right-skewed, which suggests high-value outliers

#-The maximum is much larger than the mean

**Outliers**

In [None]:
# Box plots of the six spending features, used to eyeball outliers
plt.figure(figsize=(15, 8))
col_names = ["Fresh", "Milk", "Grocery", "Frozen", "Detergents_Paper", "Delicatessen"]
# Panels are laid out on a grid of 3 rows x 2 columns, one feature per panel
for panel, feature in enumerate(col_names, start=1):
    plt.subplot(3, 2, panel)
    sns.boxplot(x=data[feature], linewidth=2.5)
plt.show()

In [None]:
# Histograms of the six spending features, to inspect the shape of each distribution
plt.figure(figsize=(15, 8))
col_names = ["Fresh", "Milk", "Grocery", "Frozen", "Detergents_Paper", "Delicatessen"]
# Panels are laid out on a grid of 3 rows x 2 columns, one feature per panel
for panel, feature in enumerate(col_names, start=1):
    plt.subplot(3, 2, panel)
    sns.histplot(data[feature])
plt.show()

The histograms show that the distribution of each feature does not appear normal.

In [None]:
# Preview the raw data before the log transform
data.head()

logarithmic transformation

In [None]:
# Work on a copy so the raw frame `data` stays unchanged
log_data = data.copy()

In [None]:
# Preview the raw frame
data.head()

In [None]:
# Preview the working copy
log_data.head()

In [None]:
# Add a natural-log version of every spending column next to the original one.
# Each log column reuses the original name with a lower-case first letter
# (e.g. "Detergents_Paper" -> "detergents_Paper").
spending_cols = ["Fresh", "Milk", "Grocery", "Frozen", "Detergents_Paper", "Delicatessen"]
col_names_log = []
for original in spending_cols:
    logged = original[0].lower() + original[1:]
    log_data[logged] = np.log(log_data[original])
    # Keep raw/log neighbours adjacent so they share one row of the plot grid
    col_names_log.extend([original, logged])

# One row per feature: raw distribution on the left, log distribution on the right
plt.figure(figsize=(13, 8))
for panel, column in enumerate(col_names_log, start=1):
    plt.subplot(6, 2, panel)
    # histplot with a KDE overlay replaces the deprecated sns.distplot and matches
    # the histplot call used for the raw histograms earlier in the notebook
    sns.histplot(log_data[column], kde=True, stat="density")
    plt.title(column)
# Six stacked rows overlap badly without this
plt.tight_layout()
plt.show()

After applying a natural logarithm scaling to the data, the distribution of each feature appears much more normal

In [None]:
# Preview the raw frame (the log columns were added to log_data, not to data)
data.head()

In [None]:
# Preview the frame that now holds both the raw and the log columns
log_data.head()

In [None]:
# Keep only the log-transformed spending columns; remove the raw ones in place
raw_spending_cols = ['Fresh', 'Milk', 'Grocery', 'Frozen', 'Detergents_Paper', 'Delicatessen']
log_data.drop(columns=raw_spending_cols, inplace=True)

In [None]:
# Show the data after the log transform
log_data.head()

In [None]:
# Summary statistics of the log-transformed data, shaded with a purple gradient
log_data.describe().style.background_gradient(cmap='Purples')

The distributions look closer to normal after applying the log transform.

**Correlation Matrix**

In [None]:
# Pairwise correlation table of the log-transformed data, shaded with the 'inferno' colour map
log_data.corr().style.background_gradient(cmap='inferno')

In [None]:
# Preview the raw frame used for the "before log" correlations
data.head()

In [None]:
# Overlay two correlation heat maps on one set of axes:
# raw-data correlations in the lower triangle, log-data correlations in the upper one.
log_corr = log_data.corr()
corr = data.corr()
f = plt.figure(figsize=(16, 8))

# Mask the upper triangle (diagonal included) of the raw-data correlations
mask = np.triu(np.ones_like(corr))
with sns.axes_style("white"):
    ax1 = sns.heatmap(corr, annot=True, mask=mask,
                      cbar_kws={'label': 'Before Log Normalization'})

# Mask the lower triangle (diagonal included) of the log-data correlations
mask2 = np.tril(np.ones_like(corr))
with sns.axes_style("white"):
    ax2 = sns.heatmap(log_corr, annot=True, mask=mask2, cmap="YlGnBu",
                      cbar_kws={'label': 'After Log Normalization'})

- The correlation between grocery and milk is 0.73 before the log transform and 0.76 after it.

- The correlation between detergents_paper and milk is 0.66 before the log transform and 0.68 after it.

In [None]:
# Number of rows per Region code
log_data['Region'].value_counts()

In [None]:
# Number of rows per Channel code
log_data['Channel'].value_counts()

In [None]:
# Replace the numeric category codes with readable labels.
# The result is assigned back to the column: calling replace(..., inplace=True)
# on a column selection is chained assignment and is not guaranteed to update
# the parent DataFrame (it does not under pandas Copy-on-Write).
# Channel: {Hotel/Restaurant/Cafe - 1, Retail - 2}
log_data['Channel'] = log_data['Channel'].replace({1: "h-r-c", 2: "Retail"})
# Region: {Lisbon - 1, Oporto - 2, Other - 3}
log_data['Region'] = log_data['Region'].replace({1: "Lisbon", 2: "Oporto", 3: "Other Zone"})


In [None]:
# Number of rows per Region after relabelling
log_data['Region'].value_counts()

In [None]:
# Number of customers per region, split by sales channel
plt.figure(figsize=(18, 6))
region_order = ["Lisbon", "Oporto", "Other Zone"]
sns.countplot(data=log_data, x="Region", hue="Channel",
              palette="muted", order=region_order)
plt.title("REGİON")
plt.xlabel("Region")
plt.ylabel("Count")
plt.show()

Notice that the Hotel/Restaurant/Cafe (h-r-c) channel has more customers than the Retail channel in every region.

In [None]:
# Scatter of log detergents/paper vs. log grocery spending, coloured by region.
# x and y are passed by keyword: recent seaborn releases no longer accept them
# positionally (the first positional parameter is `data`).
sns.lmplot(x='detergents_Paper', y='grocery', data=log_data,
           hue='Region', fit_reg=False, height=4)
plt.show()

In [None]:
# Scatter of log detergents/paper vs. log grocery spending, coloured by channel.
# x and y are passed by keyword: recent seaborn releases no longer accept them
# positionally (the first positional parameter is `data`).
sns.lmplot(x='detergents_Paper', y='grocery', data=log_data,
           hue='Channel', fit_reg=False, height=5)
plt.show()

In [None]:
# Scatter of log milk vs. log grocery spending, coloured by region.
# x and y are passed by keyword: recent seaborn releases no longer accept them
# positionally (the first positional parameter is `data`).
sns.lmplot(x='milk', y='grocery', data=log_data,
           hue='Region', fit_reg=False, height=4)
plt.show()

In [None]:
# Scatter of log milk vs. log grocery spending, coloured by channel.
# x and y are passed by keyword: recent seaborn releases no longer accept them
# positionally (the first positional parameter is `data`).
sns.lmplot(x='milk', y='grocery', data=log_data,
           hue='Channel', fit_reg=False, height=4)
plt.show()

In [None]:
# Wide-form bar plot of log_data, drawn on a 20 x 8 inch figure
bar_fig, bar_ax = plt.subplots(figsize=(20, 8))
sns.barplot(data=log_data, palette="Set1", ax=bar_ax)

In [None]:
# Pairwise scatter plots of the log-transformed features.
# `height` is the current name of the per-panel size argument (formerly `size`);
# it is also the name the lmplot calls in this notebook use.
sns.pairplot(log_data, height=5)

The scatter plots of milk vs. detergents_Paper, milk vs. grocery, and detergents_Paper vs. grocery look roughly linear,

and high correlation between them

In [None]:
# Print column names, dtypes and non-null counts of the working frame
log_data.info()

In [None]:
# Preview the working frame before feature engineering
log_data.head()

In [None]:
#log_data.drop(['Region' , 'Channel'] 
 #         ,axis=1 ,inplace = True)

# Feature Engineering

In [None]:
# One-hot encode the two categorical columns.
# dtype=int forces 0/1 integer indicators: newer pandas versions return bool
# columns by default, and the numeric steps that follow (percentiles,
# melt/box plot, scaling) expect plain numbers.
channel_dummies = pd.get_dummies(log_data["Channel"], drop_first=True, dtype=int)
region_dummies = pd.get_dummies(log_data["Region"], dtype=int)
log_data = pd.concat([log_data, channel_dummies, region_dummies], axis=1)
# The encoded source columns are no longer needed
log_data.drop(columns=["Channel", "Region"], inplace=True)
log_data.head()
# drop_first=True keeps a single Channel indicator: with only two categories
# the second indicator column would carry the same information

In [None]:
# Show the last 5 rows of the encoded data
log_data.tail()

In [None]:
# Box plot of every column: melt() stacks the frame into (variable, value) pairs
log_long = log_data.melt()
plt.figure(figsize=(15, 10))
sns.boxplot(x='variable', y='value', data=log_long)
plt.show()

The box plot shows outliers only for the features from fresh to delicatessen, because the remaining columns are discrete (0/1 dummies).

In [None]:
 outliers_list = []
# For each feature find the data points with extreme high or low values
for feature in log_data.keys():
    
    # Calculate Q1 (25th percentile of the data) for the given feature
    Q1 = np.percentile(log_data[feature], 25)
    
    # Calculate Q3 (75th percentile of the data) for the given feature
    Q3 = np.percentile(log_data[feature], 75)
    
    # Use the interquartile range to calculate an outlier step (1.5 times the interquartile range)
    step = (Q3 - Q1) * 1.5
    
    # Display the outliers
    print("Data points considered outliers for the feature '{}':".format(feature))
    outliers = list(log_data[~((log_data[feature] >= Q1 - step) & (log_data[feature] <= Q3 + step))].index.values)
    display(log_data[~((log_data[feature] >= Q1 - step) & (log_data[feature] <= Q3 + step))])
    print('-=-=-=-=-=-=-=-=-=-=-=-=-------------------------------=-=-=-=-=-=-=-=-=-=-=-=')
    outliers_list.extend(outliers)
    
print("List of Outliers -> \n :{}".format(outliers_list))

In [None]:
from collections import Counter

# A row is a "common" outlier when it was flagged for at least two features.
# Counter tallies every label in one pass instead of calling list.count per element.
hit_counts = Counter(outliers_list)
duplicate_outliers_list = sorted(label for label, hits in hit_counts.items() if hits >= 2)
print("\nList of Common Outliers -> {}".format(duplicate_outliers_list))

**There were 13 data points ([65, 66, 75, 128, 154, 203, 218, 233, 264, 304, 305, 325, 338])
that were considered outliers for more than one feature. So, instead of removing all outliers (which would result in us losing a lot of information),
only outliers that occur for more than one feature should be removed.**

A row that shows up as an outlier for several features gives us more confidence that it truly is an outlier.

In [None]:
# Drop the rows flagged for several features, then renumber the remaining rows from 0
outliers = duplicate_outliers_list
rows_to_drop = log_data.index[outliers]
new_data = log_data.drop(index=rows_to_drop).reset_index(drop=True)

In [None]:
# Print column names, dtypes and non-null counts after the outlier rows were removed
new_data.info()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Box plot of every column after the outlier rows were removed;
# melt() stacks the frame into (variable, value) pairs
cleaned_long = new_data.melt()
plt.figure(figsize=(15, 10))
sns.boxplot(x='variable', y='value', data=cleaned_long)
plt.show()

In [None]:
# Preview the cleaned data
new_data.head()

# Standardizing

Whether to standardize the data prior to a PCA on the covariance matrix depends on the measurement scales of the original features. 

Since PCA yields a feature subspace that maximizes the variance along the axes, it makes sense to standardize the data.

In [None]:
# Standardise every column before PCA and clustering, so that the 0/1 dummy
# columns and the log-spending columns are expressed on a common scale.
scaler = StandardScaler()
scaler.fit(new_data)
std_data = scaler.transform(new_data)

# - Eigendecomposition - Computing Eigenvectors and Eigenvalues

The eigenvectors and eigenvalues of a covariance (or correlation) matrix represent the “core” of a PCA: The eigenvectors (principal components) determine the directions of the new feature space, and the eigenvalues determine their magnitude. In other words, the eigenvalues explain the variance of the data along the new feature axes.

# Covariance Matrix

The classic approach to PCA is to perform the eigendecomposition on the covariance matrix Σ, which is a d×d matrix where each element represents the covariance between two features. The covariance between two features is calculated as follows:


$$\sigma_{jk} = \frac{1}{n-1}\sum_{i=1}^{n}\left(x_{ij}-\bar{x}_j\right)\left(x_{ik}-\bar{x}_k\right).$$

We can summarize the calculation of the covariance matrix via the following matrix equation:

$$\Sigma = \frac{1}{n-1}\left((\mathbf{X}-\bar{\mathbf{x}})^{T}(\mathbf{X}-\bar{\mathbf{x}})\right)$$

where $\bar{\mathbf{x}}$ is the mean vector $\bar{\mathbf{x}} = \frac{1}{n}\sum_{i=1}^{n}\mathbf{x}_i$.

The mean vector is a d-dimensional vector where each value in this vector represents the sample mean of a feature column in the dataset.



In [None]:
# Covariance matrix computed by hand: centre each column, then X^T X / (n - 1)
mean_vec = np.mean(std_data, axis=0)
centered = std_data - mean_vec
n_samples = std_data.shape[0]
cov_mat = centered.T.dot(centered) / (n_samples - 1)
print('Covariance matrix \n%s' % cov_mat)

The more verbose computation above was used for explanation; we could have used NumPy's `cov` function instead:

In [None]:
# Same covariance matrix via NumPy; rowvar=False treats each column as a variable
print('NumPy covariance matrix: \n%s' % np.cov(std_data, rowvar=False))

eigendecomposition on the covariance matrix

In [None]:
# Eigendecomposition of the covariance matrix of the standardised data
# (rowvar=False: columns are the variables)
cov_mat = np.cov(std_data, rowvar=False)
eig_vals, eig_vecs = np.linalg.eig(cov_mat)

print('Eigenvectors %s' % eig_vecs)
print('\nEigenvalues \n%s' % eig_vals)

In [None]:
# Singular value decomposition of the transposed standardised data.
# np.linalg.svd returns the left singular vectors, the singular values and the
# (conjugate-)transposed right singular vectors, bound here to u, s and v.
u,s,v = np.linalg.svd(std_data.T)
# Display the left singular vectors
u

# Correlation Matrix

Eigendecomposition of the standardized data based on the correlation matrix

In [None]:
# Eigendecomposition of the correlation matrix of the standardised data
# (rowvar=False: columns are the variables)
cor_mat = np.corrcoef(std_data, rowvar=False)
eig_vals, eig_vecs = np.linalg.eig(cor_mat)

print('Eigenvectors \n%s' % eig_vecs)
print('\nEigenvalues \n%s' % eig_vals)

# Sorting Eigenpairs

The common approach is to rank the eigenvalues from highest to lowest in order to choose the top k eigenvectors.

In [None]:
# Pair every eigenvalue magnitude with its eigenvector.
# Eigenvectors are the columns of eig_vecs, i.e. the rows of its transpose.
eig_pairs = list(zip(np.abs(eig_vals), eig_vecs.T))

# Order the pairs by eigenvalue magnitude, largest first
eig_pairs.sort(key=lambda pair: pair[0], reverse=True)

# Print the magnitudes to confirm the descending order visually
print('Eigenvalues in descending order:')
for pair in eig_pairs:
    print(pair[0])

# Explained Variance
After sorting the eigenpairs, the next question is: “how many principal components are we going to choose for our new feature subspace?” A useful measure is the so-called “explained variance,” which can be calculated from the eigenvalues. The explained variance tells us how much information (variance) can be attributed to each of the principal components.

In [None]:
# Percentage of the total variance carried by each component (largest first)
# and the running total of those percentages
tot = sum(eig_vals)
ranked_vals = sorted(eig_vals, reverse=True)
var_exp = [(value / tot) * 100 for value in ranked_vals]
cum_var_exp = np.cumsum(var_exp)
cum_var_exp

In [None]:
# Preview the cleaned (pre-scaling) data
new_data.head()

In [None]:
# Scree plot: per-component and cumulative explained variance (in percent).
# Newer Matplotlib releases ship the seaborn styles as 'seaborn-v0_8-*';
# use whichever spelling this installation provides.
style_name = ('seaborn-whitegrid' if 'seaborn-whitegrid' in plt.style.available
              else 'seaborn-v0_8-whitegrid')
# One x position per component, derived from the data instead of a hard-coded 10
component_positions = range(len(var_exp))
with plt.style.context(style_name):
    plt.figure(figsize=(9, 6))

    # Bars: variance explained by each individual component
    plt.bar(component_positions, var_exp, alpha=0.5, align='center',
            label='individual explained variance')
    # Steps: running total over the components
    plt.step(component_positions, cum_var_exp, where='mid',
             label='cumulative explained variance')
    plt.ylabel('Explained variance ratio')
    plt.xlabel('Principal components')
    plt.legend(loc='best')
    plt.tight_layout()

The plot above shows that the first principal component alone explains the largest share of the variance (about 36.5%).

The second principal component still bears some information (18%) 

the third principal components still bears some information(17%)

the fourth principal components still bears some information(12%)

the Fifth principal components still bears some information(11%)

The remaining principal components can be dropped without losing much information.

**Together, the first five principal components contain about 94.5% of the information.**

PCA
we will find which compound combinations of features best describe customers.

**Implementation: PCA**

In [None]:
from sklearn.decomposition import PCA

# Project the standardised data onto four principal components.
# n_components is the number of new features to keep; the author's notes list
# 'auto', 'full' and 'arpack' as svd_solver choices.
pca_settings = dict(n_components=4, copy=True, svd_solver='full', random_state=0,
                    iterated_power='auto', whiten=False)
pca = PCA(**pca_settings)
reduced_data = pca.fit_transform(std_data)

In [None]:
# Display the PCA-transformed data
reduced_data

In [None]:
# Wrap the PCA output in a DataFrame with one 'Dimension k' column per component.
# The labels are generated from the actual number of columns, so this cell keeps
# working when n_components is changed in the PCA cell.
dimension_labels = ['Dimension {}'.format(k) for k in range(1, reduced_data.shape[1] + 1)]
reduced_data = pd.DataFrame(reduced_data, columns=dimension_labels)
                                                    

In [None]:
# Show the first rows of the final data after preprocessing and PCA
reduced_data.head()

In [None]:
# Report the variance captured by the retained components and the component vectors
pca_report = (
    ("Explained Variance  => {}\n", pca.explained_variance_),
    ("Explained Variance Ratio => {}\n", pca.explained_variance_ratio_),
    ("Explained Variance Ratio cumsum => {}\n", pca.explained_variance_ratio_.cumsum()),
    ("components_ => {}\n", pca.components_),
)
for template, value in pca_report:
    print(template.format(value))

# Clustering Model

In [None]:
from sklearn.cluster import KMeans

# Elbow method: fit k-means for k = 1 .. n-1 and record each model's inertia
# (sum of squared distances from every sample to its closest cluster centre).
ilist = []
n = 25  # exclusive upper bound on the number of clusters tried
for i in range(1, n):
    # `algorithm` is left at its default because current scikit-learn rejects 'auto'.
    # n_init is pinned so the number of restarts does not depend on the installed version.
    KMeanModel = KMeans(n_clusters=i, init='k-means++', n_init=10, random_state=33)
    KMeanModel.fit(reduced_data)  # Fitting Model
    ilist.append(KMeanModel.inertia_)

In [None]:
# Display the recorded inertias, one per cluster count tried
ilist

In [None]:
# Elbow graph: inertia against the number of clusters.
# The x axis is derived from ilist itself, so the plot stays correct even if `n`
# has been rebound since the inertias were computed.
cluster_counts = range(1, len(ilist) + 1)
plt.plot(cluster_counts, ilist)
plt.title('Elbow Graph')
plt.xlabel('Clusters')
plt.ylabel('Inertias')
plt.show()

when clusters is 1 the inertias is high 

From 8 clusters up to 24 (the largest number tried) the inertia changes only slightly.

So the number of clusters will be chosen in the range 4 to 6.

In [None]:
from sklearn.metrics import silhouette_score

# Mean silhouette coefficient for k = 2 .. 10 clusters
score = []
# The loop variable is `k`, not `n`, so the `n` set in the elbow cell is not overwritten
for k in range(2, 11):
    # `algorithm` is left at its default because current scikit-learn rejects 'auto'.
    # n_init is pinned so the number of restarts does not depend on the installed version.
    KMean = KMeans(n_clusters=k, init='k-means++', n_init=10, random_state=33)
    KMean.fit(reduced_data)
    result = KMean.labels_
    # Compute the score once and reuse it for printing and for the list
    k_score = silhouette_score(reduced_data, result)
    print(k, "    ", k_score)
    score.append(k_score)

In [None]:
# Silhouette score against the number of clusters (2 .. 10).
# Note: style.use changes the Matplotlib style for all later plots as well.
plt.style.use("fivethirtyeight")
candidate_counts = range(2, 11)
plt.plot(candidate_counts, score)
plt.title('*Elbow for # of cluster with  silhouette_score*')
plt.xlabel('Cluster')
plt.ylabel('silhouette_score')
plt.show()

The plot shows that the silhouette score is low for 2 or 6 clusters

and High with 4

So the number of clusters will be 3 or 4; the model below uses 3 and reports its silhouette score.

In [None]:
# Final model: k-means with 3 clusters on the PCA-reduced data.
# `algorithm` is left at its default because current scikit-learn rejects 'auto'.
# n_init is pinned so the number of restarts does not depend on the installed version.
KMeanModel = KMeans(n_clusters=3, init='k-means++', n_init=10, random_state=33)
# Fitting Model
KMeanModel.fit(reduced_data)
# Cluster index assigned to every row of the training data
y_predict = KMeanModel.predict(reduced_data)
# Fitted attributes kept for the report and plots below
centers = KMeanModel.cluster_centers_
labels = KMeanModel.labels_
inertial = KMeanModel.inertia_
iteration = KMeanModel.n_iter_

In [None]:
# Mean silhouette coefficient of the fitted k-means labels on the reduced data
silhouette_Score = silhouette_score(reduced_data , labels)
print('Silutescore Score for KMean :: ',silhouette_Score)

In [None]:
# Print the fitted model's centres, labels, predictions, inertia and iteration count
kmeans_report = (
    ('\n Centers of 3 clusters :: \n', centers),
    ('\n Labels is :: \n', labels),
    ('\n Y_Predictions :: \n', y_predict),
    ('\n Inertial is :: ', inertial),
    ('\n Iteration is :: ', iteration),
)
for caption, value in kmeans_report:
    print(caption, value)


# Visualising the Clusters

In [None]:
# Histogram of the predicted cluster labels, i.e. how many rows fall in each cluster
size_fig, size_ax = plt.subplots()
size_ax.hist(y_predict)
size_ax.set_title("Sales Per Cluster")
size_ax.set_xlabel("Clusters")
size_ax.set_ylabel("Sales")
plt.show()

In [None]:
# Convert to a plain ndarray so that boolean-mask plus column indexing works
reduced_data = np.array(reduced_data)

# Scatter the first two dimensions, one colour per cluster.
# To show more clusters, extend this tuple with further colours.
cluster_colours = ('red', 'blue', 'green')
for cluster_id, colour in enumerate(cluster_colours):
    members = reduced_data[y_predict == cluster_id]
    plt.scatter(members[:, 0], members[:, 1], s=20, c=colour,
                label='Cluster {}'.format(cluster_id + 1))

# Cluster centres drawn on top, larger and in yellow
centroids = KMeanModel.cluster_centers_
plt.scatter(centroids[:, 0], centroids[:, 1], s=100, c='yellow', label='Centroids')
plt.title('Clusters of Customers')

plt.legend()
plt.show()


In [None]:
#reduced_data['Cluster'] = y_predict
#reduced_data.head()
# Now we can use supervised learning