**General Information Of Data**
* The customer segments dataset contains 440 data points collected from clients of a wholesale distributor in Lisbon, Portugal.

 **Features**
* Fresh: annual spending (m.u.) on fresh products (Continuous);
* Milk: annual spending (m.u.) on milk products (Continuous);
* Grocery: annual spending (m.u.) on grocery products (Continuous);
* Frozen: annual spending (m.u.) on frozen products (Continuous);
* Detergents_Paper: annual spending (m.u.) on detergents and paper products (Continuous);
* Delicatessen: annual spending (m.u.) on delicatessen products (Continuous);
* Channel: {Hotel/Restaurant/Cafe - 1, Retail - 2} (Nominal)
* Region: {Lisbon - 1, Oporto - 2, or Other - 3} (Nominal)

In [None]:
#Important libraries
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
# Pretty display for notebooks
%matplotlib inline
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans 

In [None]:
# Load the customers table (comma-separated, UTF-8 encoded) into a DataFrame
data = pd.read_csv(
    '../input/customer-segmentaion/customers.csv',
    sep=',',
    encoding='utf8',
)

In [None]:
def preprocessing(data):
    """Turn the raw customer table into four PCA components for clustering.

    Steps, in order:
      1. natural-log transform the six spending columns (stored under
         lower-cased names; the raw columns are dropped),
      2. one-hot encode ``Channel`` and ``Region``,
      3. drop rows that fall outside the 1.5 * IQR fences of at least two
         log-spending features,
      4. standardize every remaining column and project onto 4 principal
         components.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``Fresh``, ``Milk``, ``Grocery``,
        ``Frozen``, ``Detergents_Paper``, ``Delicatessen``, ``Channel`` and
        ``Region``. The frame is not modified; all work happens on a copy,
        so the function can be called repeatedly on the same input.

    Returns
    -------
    pandas.DataFrame
        Columns ``Dimension 1`` ... ``Dimension 4``, one row per retained
        customer, with a fresh 0..n-1 index.
    """
    # Raw spending column -> name of its log-transformed replacement
    log_names = {
        'Fresh': 'fresh',
        'Milk': 'milk',
        'Grocery': 'grocery',
        'Frozen': 'frozen',
        'Detergents_Paper': 'detergents_Paper',
        'Delicatessen': 'delicatessen',
    }

    # Work on a copy so the caller's DataFrame is left untouched
    frame = data.copy()

    # Log transform, then drop the raw columns
    for raw_name, log_name in log_names.items():
        frame[log_name] = np.log(frame[raw_name])
    frame = frame.drop(columns=list(log_names))

    # Convert the numeric codes to readable labels so the dummy columns
    # get meaningful names.
    # Channel: {Hotel/Restaurant/Cafe - 1, Retail - 2}
    frame['Channel'] = frame['Channel'].replace({1: "h-r-c", 2: "Retail"})
    # Region: {Lisbon - 1, Oporto - 2, or Other - 3}
    frame['Region'] = frame['Region'].replace(
        {1: "Lisbon", 2: "Oporto", 3: "Other Zone"}
    )

    # One-hot encode the categorical columns. Channel has two levels, so a
    # single indicator column is enough (drop_first=True).
    frame = pd.concat(
        [
            frame,
            pd.get_dummies(frame["Channel"], drop_first=True),
            pd.get_dummies(frame["Region"]),
        ],
        axis=1,
    )
    frame = frame.drop(columns=["Channel", "Region"])

    # Count, per row, how many spending features fall outside the Tukey
    # fences [Q1 - 1.5*IQR, Q3 + 1.5*IQR]. Only the continuous log-spending
    # features are tested: on a 0/1 dummy column the IQR can be 0, which
    # would flag every member of a minority category as an "outlier".
    outlier_hits = np.zeros(len(frame), dtype=int)
    for feature in log_names.values():
        # Q1 / Q3 (25th / 75th percentile) of the feature
        q1, q3 = np.percentile(frame[feature], [25, 75])
        # Outlier step: 1.5 times the interquartile range
        step = (q3 - q1) * 1.5
        within_fences = frame[feature].between(q1 - step, q3 + step)
        outlier_hits += (~within_fences).to_numpy()

    # Remove only the rows that are outliers for two or more features
    new_data = frame[outlier_hits < 2].reset_index(drop=True)

    # Standardize before PCA: the dummy columns hold zeros and ones while
    # the spending columns are on a different (log) scale.
    scaler = StandardScaler()
    std_data = scaler.fit_transform(new_data)

    # Keep the 4 leading principal components.
    # svd_solver may be 'auto', 'full' or 'arpack'.
    pca = PCA(n_components=4, copy=True, svd_solver='full', random_state=0,
              iterated_power='auto', whiten=False)
    reduced_data = pca.fit_transform(std_data)

    # Wrap the reduced data in a DataFrame with one column per component
    reduced_data = pd.DataFrame(
        reduced_data,
        columns=['Dimension 1', 'Dimension 2', 'Dimension 3', 'Dimension 4'],
    )

    return reduced_data


In [None]:
# Run the preprocessing function on the loaded table; the result holds the
# 'Dimension 1' ... 'Dimension 4' columns it returns.
new_data = preprocessing(data)
# Preview the first rows of the reduced data
new_data.head()

# Clustering Model

In [None]:
# Three clusters, k-means++ seeding, fixed seed for reproducible results.
# `algorithm` is left at the library default: the value 'auto' was deprecated
# in scikit-learn 1.1 and is rejected by later releases.
# `n_init=10` pins the historical default number of restarts, which newer
# releases changed.
KMeanModel = KMeans(n_clusters=3, init='k-means++', n_init=10, random_state=33)
def train(data_pro):
    """Fit the module-level ``KMeanModel`` on ``data_pro``.

    ``data_pro`` is passed straight to ``KMeanModel.fit``. Returns a fixed
    confirmation message once fitting has finished.
    """
    KMeanModel.fit(data_pro)
    done_message = 'train is Done successful:)'
    return done_message

# Fit the model on the preprocessed data (train requires its data argument)
train(new_data)

In [None]:
def predict(data_pro):
    """Return the cluster label ``KMeanModel`` assigns to each row of ``data_pro``.

    The module-level ``KMeanModel`` must already be fitted.
    """
    return KMeanModel.predict(data_pro)
# Assign each preprocessed row to a cluster (predict requires its data argument)
predict(new_data)

In [None]:
def evaluation(data_pro):
    """Return the silhouette score of the fitted ``KMeanModel`` on ``data_pro``.

    The labels are read from ``KMeanModel.labels_``, i.e. the assignments
    made during fitting, so ``data_pro`` must be the same data the model was
    fitted on (same rows, same order).
    """
    # Imported here because the notebook's import cell does not provide it
    from sklearn.metrics import silhouette_score

    labels = KMeanModel.labels_
    silhouette_Score = silhouette_score(data_pro, labels)
    return silhouette_Score
# Score the clustering on the data the model was fitted on
# (evaluation requires its data argument)
evaluation(new_data)