## Objective: To find out which factors happiness depends on, and then to perform cluster analysis using various algorithms


#### 1. ) We will load the libraries required to perform the analysis

In [182]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import time                          # To time processes 
import warnings                      # To suppress warnings
import matplotlib.pyplot as plt      # For Graphics
import seaborn as sns
from sklearn import cluster, mixture # For clustering 
from sklearn.preprocessing import StandardScaler

import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)
%matplotlib inline
warnings.filterwarnings('ignore')

#### 2. ) We will load the data into a dataframe data structure and then we will take a peek at the data to see what all dependent variables are available in data

In [183]:
# Read the 2017 CSV; the path is relative to the notebook's working directory.
data = pd.read_csv("../input/2017.csv")
# Show the first rows to see which columns are available.
data.head()

#### 3. ) We will check the columns we have in our dataset , so that we can start studying relationship between columns before starting clustering

In [184]:
# Column labels exactly as loaded from the CSV (before any renaming).
data.columns

#### 4. )   Basic Data Pre-processing
#### This is an important step. We will perform the following data pre-processing tasks on this data
1.  We will check if there is any Null or NAN in our data set
2. We will rename our column names to more appropriate name

In [185]:
# One boolean per column: True when that column holds at least one missing value.
data.isna().any()


In [186]:
# Swap the dotted CSV headers for the underscore-separated names used by the later cells.
data = data.rename(
    columns={
        'Economy..GDP.per.Capita.': 'Economy_GDP_Per_Capita',
        'Health..Life.Expectancy.': 'Health_Life_Expectancy',
        'Trust..Government.Corruption.': 'Trust_Government_Corruption',
        'Happiness.Rank': 'Happiness_Rank',
        'Happiness.Score': 'Happiness_Score',
    }
)

In [187]:
# Column labels after the rename, to confirm the new names took effect.
data.columns

#### 5. ) Now we draw correlation plots between the different columns of the data to see the relationships between the factors that contribute to the overall happiness score. We can gather from this that there is a strong relationship between Family, Health and Economy. Freedom and Generosity are only weakly related to any of these factors or to each other.

In [188]:
    
# Take the contiguous run of columns from GDP through government trust and
# draw their pairwise correlations as an annotated heatmap.
df = data.loc[:, 'Economy_GDP_Per_Capita':'Trust_Government_Corruption']
f, ax = plt.subplots(figsize=(18, 18))
sns.heatmap(
    data=df.corr(),
    annot=True,
    linewidths=.5,
    fmt='.1f',
    ax=ax,
)
plt.show()

#### 6. ) We will draw vertical bar plots corresponding to each country to compare different factors contributing to happiness visually 

In [189]:
# Sort by rank in descending order (largest rank number first).
data = data.sort_values('Happiness_Rank', ascending=False)
# Keep the country plus the six factor columns and index the result by country;
# df_wh is the frame every clustering cell below works on.
df_wh = data.filter(
    [
        'Country',
        'Economy_GDP_Per_Capita',
        'Family',
        'Health_Life_Expectancy',
        'Freedom',
        'Generosity',
        'Trust_Government_Corruption',
    ]
).set_index('Country')
# One stacked horizontal bar per country.
df_wh.plot.barh(stacked=True, figsize=(10, 28))

#### 7. ) Visualizing Global Happiness on Map

In [190]:
# Plain alias of `data` (no copy is made); preview it before building the map.
df_wh_map = data
df_wh_map.head()

In [191]:
# Give the frame a fresh positional index (the previous index becomes a column).
df_wh_map = data.reset_index()
colorRange = [[0.0, 'rgb(242,240,247)'],[0.2, 'rgb(218,218,235)'],[0.4, 'rgb(188,189,220)'],[0.6, 'rgb(158,154,200)'],[0.8, 'rgb(117,107,177)'],[1.0, 'rgb(84,39,143)']]

# The trace gets its own name: assigning it to `data` would replace the
# DataFrame with a dict and make this cell fail when it is run a second time.
happiness_trace = dict(
    type='choropleth',
    locations=df_wh_map['Country'],
    locationmode='country names',
    z=df_wh_map['Happiness_Score'],
    colorscale=colorRange,
    text=df_wh_map['Country'],
    marker=dict(line=dict(color='rgb(180,180,180)', width=0.5)),
    # z is the happiness score here, so the colour bar is titled accordingly.
    colorbar={'title': 'Happiness Score'},
)

layout = dict(
    title='World Happiness Visualization on Map',
    geo=dict(
        showframe=False,
        # plotly's projection type names are all lowercase.
        projection={'type': 'mercator'},
    ),
)

choromap3 = go.Figure(data=[happiness_trace], layout=layout)

iplot(choromap3)

#### 8. ) We will load the world happiness data for the individual years and see how world happiness has changed over the last 2 years

In [192]:
# Load the 2016 file into data_2016: the plots below label this frame '2016',
# so it must not be read from the 2015 file.
# NOTE(review): assumes ../input/2016.csv exists and uses the same space-separated
# headers that this rename map expects — confirm against the input folder.
data_2016 = pd.read_csv("../input/2016.csv")
data_2016 = data_2016.rename(
    columns={
        'Economy (GDP per Capita)': 'Economy_GDP_Per_Capita',
        'Health (Life Expectancy)': 'Health_Life_Expectancy',
        'Trust (Government Corruption)': 'Trust_Government_Corruption',
        'Happiness Rank': 'Happiness_Rank',
        'Happiness Score': 'Happiness_Score',
    }
)
data_2016 = data_2016.sort_values('Happiness_Rank', ascending=False)

# The 2017 file uses dotted headers, hence the separate rename map.
data_2017 = pd.read_csv("../input/2017.csv")
data_2017 = data_2017.rename(
    columns={
        'Economy..GDP.per.Capita.': 'Economy_GDP_Per_Capita',
        'Health..Life.Expectancy.': 'Health_Life_Expectancy',
        'Trust..Government.Corruption.': 'Trust_Government_Corruption',
        'Happiness.Rank': 'Happiness_Rank',
        'Happiness.Score': 'Happiness_Score',
    }
)
data_2017 = data_2017.sort_values('Happiness_Rank', ascending=False)

#### 9.) We will plot data from two years to see how the factors contributing to happiness have changed over the course of time. We will see that the family factor has increased in the last year

In [193]:
##We will use Plotly library to draw graphs
#import plotly.plotly as py
#import plotly.graph_objs as go
#fig = plt.figure()
#trace1 = Scatter(x = frame['Country'],y = frame['Happiness_Rank'],mode = 'lines+markers', name = 'lines+markers')
#data = [trace1]
#plt.show()

In [194]:
#import plotly.graph_objs as go
#from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
#init_notebook_mode(connected=True)

In [195]:
#from plotly import __version__
#from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
#from plotly.graph_objs import *
# For offline use
#import cufflinks as cf
#cf.go_offline()
#for frame in [data_2015,data_2016,data_2017]:
#    frame[['Family','Freedom']].iplot(kind='spread')

In [196]:
# Overlay the 'Family' density of both years on one axes. Each curve is labelled
# as it is drawn, rather than rewriting the legend texts afterwards.
new_labels = ['2016','2017']
for frame, label in zip([data_2016, data_2017], new_labels):
    plot_family = sns.kdeplot(frame['Family'], shade=True, label=label)

# Build the legend explicitly so the cell does not rely on seaborn having created one.
plot_family.legend(title="Year")
plot_family.set_title("Family")

#### 10. ) Above we saw that the family factor increased in 2017. Now we will draw plots for the other factors contributing towards happiness

In [197]:
# Overlay the GDP-per-capita density of both years, labelling each curve as it is drawn.
new_labels = ['2016','2017']
for frame, label in zip([data_2016, data_2017], new_labels):
    plot_1 = sns.kdeplot(frame['Economy_GDP_Per_Capita'], shade=True, label=label)

# Build the legend explicitly so the cell does not rely on seaborn having created one.
plot_1.legend(title="Year")
plot_1.set_title("Economy")

In [198]:
# Overlay the life-expectancy density of both years. The year labels are spelled out
# here so the cell does not depend on a variable defined in an earlier cell.
for frame, label in zip([data_2016, data_2017], ['2016', '2017']):
    plot_2 = sns.kdeplot(frame['Health_Life_Expectancy'], shade=True, label=label)
# Build the legend explicitly so the cell does not rely on seaborn having created one.
plot_2.legend(title="Year")
plot_2.set_title("Health_Life_Expectancy")

In [199]:
# Overlay the government-trust density of both years. The year labels are spelled out
# here so the cell does not depend on a variable defined in an earlier cell.
for frame, label in zip([data_2016, data_2017], ['2016', '2017']):
    plot_3 = sns.kdeplot(frame['Trust_Government_Corruption'], shade=True, label=label)
# Build the legend explicitly so the cell does not rely on seaborn having created one.
plot_3.legend(title="Year")
plot_3.set_title("Trust_Government_Corruption")

In [200]:
# Overlay the 'Freedom' density of both years. The year labels are spelled out
# here so the cell does not depend on a variable defined in an earlier cell.
for frame, label in zip([data_2016, data_2017], ['2016', '2017']):
    plot_4 = sns.kdeplot(frame['Freedom'], shade=True, label=label)
# Build the legend explicitly so the cell does not rely on seaborn having created one.
plot_4.legend(title="Year")
plot_4.set_title("Freedom")

#### As we have seen above, Freedom, Family, Economy and Trust in government have increased over time, but Health (life expectancy) has decreased.

## What is Clustering ?
#### Clustering is dividing data into groups of homogeneous or similar data, so that data in the same group are more similar to each other and very different from data in other groups. Each group of similar data is called a cluster
#### 11 . ) We will now perform clustering of the World Happiness Report data using different clustering methods. The following clustering algorithms and techniques are available.

1. K-Means Clustering(Partitioning Based)
2. Spectral
3. Affinity Propagation
4. Mean Shift
5. DBSCAN(Density Based)
6. Mini Batch K-Means(Partitioning Based)
7. Birch(Hierarchical)
8. Gaussian Mixture Modeling

#### A .) K Means Clustering - K means clustering works by selecting centroids randomly and number of centroids are inputs to clustering algorithm.Once random centroids are selected then distance from each centroid for each observations are calculated and each observation data is allocated to a centroid to which distance of observation is minimum.

In [201]:
# Method for K means clustering
def kmeans_Clustering(data,numberOfClusters,random_state=None):
    """Run K-Means on ``data`` and return the labels and the cluster centres.

    Parameters
    ----------
    data : array-like or DataFrame
        Observations to cluster, one row per observation.
    numberOfClusters : int
        Number of clusters, passed to ``cluster.KMeans`` as ``n_clusters``.
    random_state : int or None, optional
        Forwarded to ``cluster.KMeans``. The default ``None`` leaves the
        estimator unseeded; pass an integer for repeatable results.

    Returns
    -------
    tuple
        ``(cluster_labels, cluster_centers)`` as produced by ``fit_predict``
        and the fitted estimator's ``cluster_centers_``.
    """
    kmeans = cluster.KMeans(n_clusters=numberOfClusters, random_state=random_state)
    # Fit and label in one pass.
    cluster_labels = kmeans.fit_predict(data)
    return cluster_labels, kmeans.cluster_centers_

We will plot the clusters over all 6 dimensions, showing 2 dimensions in each 2-dimensional plot

In [202]:
#Plot the cluster
def plot_cluster(labels,centers,df_wh):
    """Scatter the clustered rows and the cluster centres, two columns per subplot.

    Columns are paired in order (0 with 1, 2 with 3, ...); a trailing unpaired
    column is not plotted.

    Parameters
    ----------
    labels : array-like
        One cluster label per row of ``df_wh``; used to colour the points.
    centers : 2-D array
        One row per cluster, with columns in the same order as ``df_wh``.
    df_wh : DataFrame
        The data that was clustered.
    """
    numOfDimensions = df_wh.columns.size
    numberOfPlots = numOfDimensions // 2
    # squeeze=False always yields a 2-D array of Axes, so indexing also works
    # when only one subplot is needed (frames with two or three columns).
    fig, axes = plt.subplots(numberOfPlots, 1, figsize=(10, 10), squeeze=False)
    for panel, ax in enumerate(axes[:, 0]):
        x_col = 2 * panel
        y_col = x_col + 1
        ax.scatter(df_wh.iloc[:, x_col], df_wh.iloc[:, y_col], c=labels, s=50, cmap='viridis')
        ax.scatter(centers[:, x_col], centers[:, y_col], c='black', s=200, alpha=0.5)
        # Name the axes so each panel can be read on its own.
        ax.set_xlabel(df_wh.columns[x_col])
        ax.set_ylabel(df_wh.columns[y_col])
    plt.subplots_adjust(bottom=-0.5, top=1.5)
    plt.show()

We will do clustering of data using K means method and we will create plots for clustered data

In [203]:
# K-Means with 3 clusters on the factor frame, followed by the paired scatter plots.
labels, centers = kmeans_Clustering(df_wh, 3)
plot_cluster(labels, centers, df_wh)

> #### B .) Spectral Clustering - The objective of spectral clustering is to cluster data that is connected but not necessarily grouped within convex boundaries. Spectral clustering uses the affinity matrix to determine the connectivity of the data for clustering

In [204]:
def spectral_Clustering(data,numberOfClusters):
    """Label ``data`` with spectral clustering.

    The estimator is configured with nearest-neighbours affinity and the
    ARPACK eigen solver. Only the labels are returned; no centres are computed.
    """
    model = cluster.SpectralClustering(
        n_clusters=numberOfClusters,
        affinity="nearest_neighbors",
        eigen_solver='arpack',
    )
    return model.fit_predict(data)

In [205]:
#Plot the cluster without center
def plot_clusterWithoutCenter(labels,df_wh):
    """Scatter the clustered rows, two columns per subplot, without centres.

    Columns are paired in order (0 with 1, 2 with 3, ...); a trailing unpaired
    column is not plotted.

    Parameters
    ----------
    labels : array-like
        One cluster label per row of ``df_wh``; used to colour the points.
    df_wh : DataFrame
        The data that was clustered.
    """
    numOfDimensions = df_wh.columns.size
    numberOfPlots = numOfDimensions // 2
    # squeeze=False always yields a 2-D array of Axes, so indexing also works
    # when only one subplot is needed (frames with two or three columns).
    fig, axes = plt.subplots(numberOfPlots, 1, figsize=(10, 10), squeeze=False)
    for panel, ax in enumerate(axes[:, 0]):
        x_col = 2 * panel
        y_col = x_col + 1
        ax.scatter(df_wh.iloc[:, x_col], df_wh.iloc[:, y_col], c=labels, s=50, cmap='viridis')
        # Name the axes so each panel can be read on its own.
        ax.set_xlabel(df_wh.columns[x_col])
        ax.set_ylabel(df_wh.columns[y_col])
    plt.subplots_adjust(bottom=-0.5, top=1.5)
    plt.show()

In [206]:
# Spectral clustering with 3 clusters, plotted without centres.
labels = spectral_Clustering(df_wh, 3)
plot_clusterWithoutCenter(labels, df_wh)

#### C.)  Affinity Propagation- Affinity Propagation works by setting up a factor graph that describes the objective function used to identify exemplars and cluster data. Each item in a dataset can be mapped into Euclidean space using feature values. Affinity propagation depends on a matrix containing Euclidean distances between data points. Since the matrix can quickly become quite large, we should be careful not to take up too much memory

In [207]:
 def affinityPropagation_Clustering(data, preference=-10, damping=0.5):
     """Cluster ``data`` with affinity propagation (euclidean affinity).

     Parameters
     ----------
     data : array-like or DataFrame
         Observations to cluster, one row per observation.
     preference : float, optional
         Forwarded to ``cluster.AffinityPropagation``; defaults to -10.
     damping : float, optional
         Forwarded to ``cluster.AffinityPropagation``; defaults to 0.5.

     Returns
     -------
     tuple
         ``(labels, n_clusters_, cluster_centers_indices)`` where ``labels``
         comes from ``predict(data)`` on the fitted model, ``n_clusters_`` is
         the number of exemplars and ``cluster_centers_indices`` holds the
         fitted estimator's ``cluster_centers_indices_``.
     """
     affinity_propagation = cluster.AffinityPropagation(
         preference=preference, damping=damping, affinity='euclidean'
     )
     affinity_propagation.fit(data)
     cluster_centers_indices = affinity_propagation.cluster_centers_indices_
     n_clusters_ = len(cluster_centers_indices)
     return affinity_propagation.predict(data), n_clusters_, cluster_centers_indices

In [208]:
# cycle() turns the colour-code string below into an endless iterator.
from itertools import cycle
# Close every open figure, then select figure 1 and clear it.
plt.close('all')
plt.figure(1)
plt.clf()
# Module-level colour iterator. It is one shared object, so every value drawn
# from it advances the position seen by all later users.
colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk')

def drawAffinityCluster(labels,n_clusters_,cluster_centers_indices):
    """Plot affinity-propagation clusters of the global ``df_wh``, two columns per subplot.

    For each cluster the exemplar row is drawn as a large marker, each member
    as a dot, and a line joins every member to its exemplar.

    Parameters
    ----------
    labels : array-like
        One cluster label per row of ``df_wh``.
    n_clusters_ : int
        Number of clusters to draw (labels ``0 .. n_clusters_ - 1``).
    cluster_centers_indices : sequence of int
        Row positions in ``df_wh`` of each cluster's exemplar.
    """
    # Colours are looked up by cluster id. Drawing them from a shared iterator
    # would keep advancing it, so the same cluster would change colour between
    # subplots.
    palette = 'bgrcmyk'
    labels = np.asarray(labels)
    numOfDimensions = df_wh.columns.size
    numberOfPlots = numOfDimensions // 2
    # squeeze=False keeps a 2-D Axes array even when there is a single subplot.
    fig, axes = plt.subplots(numberOfPlots, 1, figsize=(10, 10), squeeze=False)
    for panel, ax in enumerate(axes[:, 0]):
        x_col = 2 * panel
        y_col = x_col + 1
        for cluster_id in range(n_clusters_):
            col = palette[cluster_id % len(palette)]
            # Exemplar row for this cluster; .iloc makes the positional lookup
            # explicit on the label-indexed Series.
            exemplar = df_wh.iloc[cluster_centers_indices[cluster_id], :]
            center_x = exemplar.iloc[x_col]
            center_y = exemplar.iloc[y_col]
            ax.plot(center_x, center_y, 'o', markerfacecolor=col,
                    markeredgecolor='k', markersize=14)
            # Visit only the rows assigned to this cluster.
            for i in np.flatnonzero(labels == cluster_id):
                point_x = df_wh.iloc[i, x_col]
                point_y = df_wh.iloc[i, y_col]
                ax.plot(point_x, point_y, col + '.')
                # Line from the exemplar to the member point.
                ax.plot([center_x, point_x], [center_y, point_y], col)

    plt.title('Estimated number of clusters: %d' % n_clusters_)
    plt.show()


In [209]:
# Affinity propagation on the factor frame, then the exemplar/member plot.
labels, n_clusters_, cluster_centers_indices = affinityPropagation_Clustering(df_wh)
drawAffinityCluster(labels, n_clusters_, cluster_centers_indices)

#### D.) Mean Shift:-Meanshift is a clustering algorithm that assigns the datapoints to the clusters iteratively by shifting points towards  highest density of datapoints .  Meanshift algorithm has applications in the field of image processing and computer vision.Mean shift exploits this KDE idea by imagining what the points would do if they all climbed up hill to the nearest peak on the KDE surface. It does so by iteratively shifting each point uphill until it reaches a peak.

In [210]:
  def meanshift_Cluster(data):
      """Cluster ``data`` with mean shift (bandwidth 0.4, bin seeding enabled).

      Returns ``(labels, cluster_centers, n_clusters)``, where ``n_clusters``
      is the number of distinct labels that were assigned.
      """
      model = cluster.MeanShift(bandwidth=0.4, bin_seeding=True)
      assigned = model.fit_predict(data)
      distinct_count = len(np.unique(assigned))
      return assigned, model.cluster_centers_, distinct_count
    

In [211]:
def plotMeanShift(labels,cluster_centers,n_clusters):
    """Plot mean-shift clusters of the global ``df_wh``, two columns per subplot.

    Each cluster centre is drawn as a large marker and each member as a dot in
    the same colour.

    Parameters
    ----------
    labels : array-like
        One cluster label per row of ``df_wh``.
    cluster_centers : 2-D array
        One row per cluster, with columns in the same order as ``df_wh``.
    n_clusters : int
        Number of clusters to draw (labels ``0 .. n_clusters - 1``).
    """
    # Colours are looked up by cluster id. Drawing them from a shared iterator
    # would keep advancing it, so the same cluster would change colour between
    # subplots.
    palette = 'bgrcmyk'
    labels = np.asarray(labels)
    numOfDimensions = df_wh.columns.size
    numberOfPlots = numOfDimensions // 2
    # squeeze=False keeps a 2-D Axes array even when there is a single subplot.
    fig, axes = plt.subplots(numberOfPlots, 1, figsize=(10, 10), squeeze=False)
    for panel, ax in enumerate(axes[:, 0]):
        x_col = 2 * panel
        y_col = x_col + 1
        x_values = df_wh.iloc[:, x_col].to_numpy()
        y_values = df_wh.iloc[:, y_col].to_numpy()
        for cluster_id in range(n_clusters):
            col = palette[cluster_id % len(palette)]
            ax.plot(cluster_centers[cluster_id, x_col], cluster_centers[cluster_id, y_col], 'o',
                    markerfacecolor=col, markeredgecolor='k', markersize=14)
            # Draw all members of this cluster in one call.
            members = labels == cluster_id
            ax.plot(x_values[members], y_values[members], col + '.')

    plt.title('Estimated number of clusters: %d' % n_clusters)
    plt.show()

In [212]:
# Mean shift on the factor frame, then the centre/member plot.
labels, cluster_centers, n_clusters = meanshift_Cluster(df_wh)
plotMeanShift(labels, cluster_centers, n_clusters)

#### E.) DBSCAN - DBSCAN (Density-Based Spatial Clustering of Applications with Noise) is the most well-known density-based clustering algorithm.Mostly it is used to demonstrate how to reduce the size of a spatial data set of GPS latitude-longitude coordinates

In [213]:
def dbscan_Cluster(data):
    """Run DBSCAN (eps=0.3, min_samples=10) on ``data``.

    Returns ``(labels, core_sample_indices)`` taken from ``fit_predict`` and
    the fitted estimator's ``core_sample_indices_``.
    """
    model = cluster.DBSCAN(eps=0.3, min_samples=10)
    assigned = model.fit_predict(data)
    return assigned, model.core_sample_indices_
    
# Cluster the factor frame with DBSCAN and print the raw label array.
dbsclabels, coreIndices = dbscan_Cluster(df_wh)
# `length` refers to the label array itself, not to its size.
length = dbsclabels
print(dbsclabels)

If you look very closely above, you’ll see that DBSCAN produced three groups (–1, 0, and 1). It shows only 17 instances of label –1. That’s because it’s a two-cluster solution; the third group (–1) is noise (outliers).

In [214]:
def drawDbscanPlot(dbsclabels):
    """Plot DBSCAN results for the global ``df_wh``, two columns per subplot.

    Rows labelled ``-1`` are drawn as small black markers; every other label
    gets a colour from the palette.

    Parameters
    ----------
    dbsclabels : array-like
        One label per row of ``df_wh``.
    """
    # The palette is indexed modulo its length, so more than three clusters
    # no longer raise IndexError. Labels 0, 1, 2 still map to b, g, r.
    palette = ['b', 'g', 'r', 'c', 'm', 'y']
    dbsclabels = np.asarray(dbsclabels)
    numOfDimensions = df_wh.columns.size
    numberOfPlots = numOfDimensions // 2
    # squeeze=False keeps a 2-D Axes array even when there is a single subplot.
    fig, axes = plt.subplots(numberOfPlots, 1, figsize=(10, 10), squeeze=False)
    for panel, ax in enumerate(axes[:, 0]):
        x_col = 2 * panel
        y_col = x_col + 1
        x_values = df_wh.iloc[:, x_col].to_numpy()
        y_values = df_wh.iloc[:, y_col].to_numpy()
        # One plot call per label instead of one per row.
        for label in np.unique(dbsclabels):
            if label == -1:
                # black, smaller marker
                colour = [0, 0, 0, 1]
                marker_size = 6
            else:
                colour = palette[label % len(palette)]
                marker_size = 12
            members = dbsclabels == label
            ax.plot(x_values[members], y_values[members], 'o', markerfacecolor=colour,
                    markeredgecolor='k', markersize=marker_size)

    plt.show()

In [215]:
# Draw the DBSCAN labels computed in the cell above.
drawDbscanPlot(dbsclabels)

#### F.  ) Mini Batch K-Means - Mini-batch KMeans is very useful in case of extremely large datasets and/or very high dimensional data which is often the case in text mining. One can switch to Mini-batch KMeans training while creating KMeans object as follows.Mini Batch K-means has been proposed as an alternative to the K-means algorithm for clustering massive datasets. The advantage of this algorithm is to reduce the computational cost by not using all the dataset each iteration but a subsample of a fixed size

In [216]:
# Method for Mini batch K means clustering
def minikmeans_Clustering(data,numberOfClusters,random_state=None):
    """Run Mini-Batch K-Means on ``data`` and return the labels and centres.

    The estimator uses ``max_iter=100`` and ``batch_size=100``.

    Parameters
    ----------
    data : array-like or DataFrame
        Observations to cluster, one row per observation.
    numberOfClusters : int
        Number of clusters, passed as ``n_clusters``.
    random_state : int or None, optional
        Forwarded to ``cluster.MiniBatchKMeans``. The default ``None`` leaves
        the estimator unseeded; pass an integer for repeatable results.

    Returns
    -------
    tuple
        ``(cluster_labels, cluster_centers)`` as produced by ``fit_predict``
        and the fitted estimator's ``cluster_centers_``.
    """
    minikmeans = cluster.MiniBatchKMeans(
        n_clusters=numberOfClusters, max_iter=100, batch_size=100, random_state=random_state
    )
    cluster_labels = minikmeans.fit_predict(data)
    return cluster_labels, minikmeans.cluster_centers_

In [217]:
def plot_miniKMeans(mini_labels,mini_centers):
    """Plot Mini-Batch K-Means results for the global ``df_wh``, two columns per subplot.

    Centres are drawn as large filled circles and member rows as small dots in
    the colour of their cluster.

    Parameters
    ----------
    mini_labels : array-like of int
        One cluster label per row of ``df_wh``.
    mini_centers : 2-D array
        Cluster centres; row ``k`` is used as the centre of label ``k``.
    """
    # The palette is indexed modulo its length, so more than three clusters
    # no longer raise IndexError. Labels 0, 1, 2 still map to b, g, r.
    palette = ['b', 'g', 'r', 'c', 'm', 'y', 'k']
    mini_labels = np.asarray(mini_labels)
    present_labels = np.unique(mini_labels)
    numOfDimensions = df_wh.columns.size
    numberOfPlots = numOfDimensions // 2
    # squeeze=False keeps a 2-D Axes array even when there is a single subplot.
    fig, axes = plt.subplots(numberOfPlots, 1, figsize=(10, 10), squeeze=False)
    for panel, ax in enumerate(axes[:, 0]):
        x_col = 2 * panel
        y_col = x_col + 1
        x_values = df_wh.iloc[:, x_col].to_numpy()
        y_values = df_wh.iloc[:, y_col].to_numpy()
        # Centres first. Each centre is looked up by its label, so centre and
        # members stay matched even if a cluster ended up with no rows.
        for label in present_labels:
            colour = palette[label % len(palette)]
            ax.plot(mini_centers[label, x_col], mini_centers[label, y_col], 'o',
                    markerfacecolor=colour, markeredgecolor=colour, markersize=12)
        # Then the member rows, one plot call per cluster.
        for label in present_labels:
            colour = palette[label % len(palette)]
            members = mini_labels == label
            ax.plot(x_values[members], y_values[members], '.',
                    markerfacecolor=colour, markeredgecolor=colour, markersize=4)

    plt.show()

In [218]:
# Mini-Batch K-Means with 3 clusters, then the centre/member plot.
mini_labels, mini_centers = minikmeans_Clustering(df_wh, 3)
plot_miniKMeans(mini_labels, mini_centers)

####  G.) Birch - Balanced Iterative Reducing and Clustering using Hierarchies is a hierarchical clustering algorithm. It constructs a tree data structure with the cluster centroids being read off the leaves. These can be either the final cluster centroids or can be provided as input to another clustering algorithm such as AgglomerativeClustering.
There are two key phases for Birch clustering-
1. Scans the database to build an in-memory tree
2. Applies clustering algorithm to cluster the leaf nodes
It builds a dendrogram called clustering feature tree (CF tree) while scanning the data set. The data is essentially lossy compressed to a set of Characteristic Feature nodes (CF Nodes). The CF Nodes have a number of subclusters called Characteristic Feature subclusters (CF Subclusters) and these CF Subclusters located in the non-terminal CF Nodes can have CF Nodes as children.

In [219]:
# Method for Birch clustering
def birch_Clustering(data,numberOfClusters):
    """Run Birch on ``data`` and return the labels with one centre per cluster.

    Parameters
    ----------
    data : array-like or DataFrame
        Numeric observations to cluster, one row per observation.
    numberOfClusters : int
        Passed to ``cluster.Birch`` as ``n_clusters``.

    Returns
    -------
    tuple
        ``(cluster_labels, cluster_centers)``. Row ``i`` of ``cluster_centers``
        is the mean of the rows assigned to the ``i``-th distinct label, with
        labels taken in ascending order.
    """
    birch_clust = cluster.Birch(n_clusters=numberOfClusters)
    cluster_labels = birch_clust.fit_predict(data)
    # subcluster_centers_ holds the centroids of the tree's leaf subclusters,
    # not of the final clusters. Compute the final centres from the rows that
    # share a label instead.
    points = np.asarray(data, dtype=float)
    cluster_centers = np.vstack(
        [points[cluster_labels == label].mean(axis=0) for label in np.unique(cluster_labels)]
    )
    return cluster_labels, cluster_centers

In [220]:
def plot_birch(birch_labels,birch_centers):
    """Plot Birch results for the global ``df_wh``, two columns per subplot.

    Centres are drawn as large filled circles and member rows as small 'x'
    markers in the colour of their cluster.

    Parameters
    ----------
    birch_labels : array-like of int
        One cluster label per row of ``df_wh``.
    birch_centers : 2-D array
        Centres; row ``i`` is drawn in the colour of the ``i``-th distinct
        label (ascending order).
    """
    # The palette is indexed modulo its length, so more than three clusters
    # no longer raise IndexError. Labels 0, 1, 2 still map to b, g, r.
    palette = ['b', 'g', 'r', 'c', 'm', 'y', 'k']
    birch_labels = np.asarray(birch_labels)
    present_labels = np.unique(birch_labels)
    numOfDimensions = df_wh.columns.size
    numberOfPlots = numOfDimensions // 2
    # squeeze=False keeps a 2-D Axes array even when there is a single subplot.
    fig, axes = plt.subplots(numberOfPlots, 1, figsize=(10, 10), squeeze=False)
    for panel, ax in enumerate(axes[:, 0]):
        x_col = 2 * panel
        y_col = x_col + 1
        x_values = df_wh.iloc[:, x_col].to_numpy()
        y_values = df_wh.iloc[:, y_col].to_numpy()
        # Centres first: the row-th centre takes the colour of the row-th distinct label.
        for row, label in enumerate(present_labels):
            colour = palette[label % len(palette)]
            ax.plot(birch_centers[row, x_col], birch_centers[row, y_col], 'o',
                    markerfacecolor=colour, markeredgecolor=colour, markersize=12)
        # Then the member rows, one plot call per cluster.
        for label in present_labels:
            colour = palette[label % len(palette)]
            members = birch_labels == label
            ax.plot(x_values[members], y_values[members], 'x',
                    markerfacecolor=colour, markeredgecolor=colour, markersize=4)

    plt.show()

In [221]:
# Birch with 3 clusters, then the centre/member plot.
birch_labels, birch_centers = birch_Clustering(df_wh, 3)
plot_birch(birch_labels, birch_centers)

#### H. ) GMM -  A Gaussian mixture model (GMM) attempts to find a mixture of multi-dimensional Gaussian probability distributions that best model any input dataset.But because GMM contains a probabilistic model under the hood, it is also possible to find probabilistic cluster assignments.It can also draw confidence ellipsoids for multivariate models
https://jakevdp.github.io/PythonDataScienceHandbook/05.12-gaussian-mixtures.html

In [222]:
# Method for GMM clustering
def gmm_Clustering(data,numberOfComponents):
    """Fit a full-covariance Gaussian mixture to ``data`` and label every row.

    Returns ``(labels, means, covariances, model)`` where ``means`` and
    ``covariances`` are the fitted estimator's ``means_`` and ``covariances_``
    and ``model`` is the fitted estimator itself.
    """
    model = mixture.GaussianMixture(
        n_components=numberOfComponents,
        covariance_type='full',
    )
    # Fit first, then predict on the same data.
    fitted = model.fit(data)
    assigned = fitted.predict(data)
    return assigned, model.means_, model.covariances_, model

In [223]:
def plot_gmm(gmm_Labels,cluster_means,covariances,gmm):
    """Scatter the global ``df_wh`` coloured by Gaussian-mixture label, two columns per subplot.

    Parameters
    ----------
    gmm_Labels : array-like of int
        One component label per row of ``df_wh``.
    cluster_means, covariances, gmm
        Accepted so existing calls keep working; not used for drawing.
    """
    # The palette is indexed modulo its length so any number of components can be drawn.
    palette = ['b', 'g', 'r', 'c', 'm', 'y']
    point_colours = [palette[label % len(palette)] for label in np.asarray(gmm_Labels)]
    numOfDimensions = df_wh.columns.size
    numberOfPlots = numOfDimensions // 2
    # squeeze=False keeps a 2-D Axes array even when there is a single subplot.
    fig, axes = plt.subplots(numberOfPlots, 1, figsize=(15, 15), squeeze=False)
    for panel, ax in enumerate(axes[:, 0]):
        x_col = 2 * panel
        y_col = x_col + 1
        # One scatter call per subplot with a colour per point. The explicit
        # colours make a colormap unnecessary.
        ax.scatter(df_wh.iloc[:, x_col], df_wh.iloc[:, y_col], c=point_colours,
                   s=40, zorder=2, alpha=0.8)
    plt.show()

#### GMM contains a probabilistic model under the hood, it is also possible to find probabilistic cluster assignments—in Scikit-Learn this is done using the predict_proba method. This returns a matrix of size [n_samples, n_clusters] which measures the probability that any point belongs to the given cluster.
    A Gaussian mixture model is very similar to k-means: it uses an expectation–maximization approach which qualitatively does the following:
    1. Choose starting guesses for the location and shape
    2. Repeat until converged:
                    E-step: for each point, find weights encoding the probability of membership in each cluster
                    M-step: for each cluster, update its location, normalization, and shape based on all data points, making use of the weights
  
  The result of this is that each cluster is associated not with a hard-edged sphere, but with a smooth Gaussian model

In [224]:
#Draw ellipses for GMM
from matplotlib.patches import Ellipse ## foor creating GMM ellipse
from scipy import linalg
import matplotlib as mpl

def draw_ellipse(mean, covariance, ax):
    """Add a semi-transparent blue ellipse for a 2-D Gaussian to ``ax``.

    Parameters
    ----------
    mean : sequence of two floats
        Centre of the ellipse.
    covariance : 2x2 array
        Symmetric covariance matrix. Its eigenvalues set the axis lengths and
        its first eigenvector sets the orientation.
    ax : matplotlib Axes
        Axes the ellipse is added to.
    """
    eigenvalues, eigenvectors = linalg.eigh(covariance)
    axis_lengths = 2. * np.sqrt(2.) * np.sqrt(eigenvalues)
    # eigh returns eigenvectors as columns: column 0 belongs to eigenvalues[0],
    # which is the axis drawn with width axis_lengths[0].
    direction = eigenvectors[:, 0]
    # arctan2 handles a vertical direction (x component of 0) without dividing.
    angle = np.degrees(np.arctan2(direction[1], direction[0]))
    # angle is passed by keyword; newer matplotlib no longer accepts it positionally.
    ell = mpl.patches.Ellipse(mean, axis_lengths[0], axis_lengths[1], angle=angle, color='b')
    ell.set_clip_box(ax.bbox)
    ell.set_alpha(0.5)
    ax.add_artist(ell)

In [225]:
# Gaussian mixture with 6 components, then the labelled scatter plots.
gmm_Labels, cluster_means, covariances, gmm = gmm_Clustering(df_wh, 6)
plot_gmm(gmm_Labels, cluster_means, covariances, gmm)

#### Gaussian Mixture Clustering Visualization in global Map

In [226]:
# Move 'Country' from the index back into a column; row order is unchanged,
# so gmm_Labels still lines up with the rows.
df_wh_map = df_wh.reset_index()
colorRange = [[0.0, 'rgb(242,240,247)'],[0.2, 'rgb(218,218,235)'],[0.4, 'rgb(188,189,220)'],[0.6, 'rgb(158,154,200)'],[0.8, 'rgb(117,107,177)'],[1.0, 'rgb(84,39,143)']]

# The trace gets its own name instead of reusing `data`, which is the name of
# the DataFrame loaded at the top of the notebook.
cluster_trace = dict(
    type='choropleth',
    locations=df_wh_map['Country'],
    locationmode='country names',
    z=gmm_Labels,
    colorscale=colorRange,
    text=df_wh_map['Country'],
    marker=dict(line=dict(color='rgb(180,180,180)', width=0.5)),
    colorbar={'title': 'Cluster Group'},
)

layout = dict(
    title='Gaussian Mixture Clustering group Visualization',
    geo=dict(
        showframe=False,
        # plotly's projection type names are all lowercase.
        projection={'type': 'mercator'},
    ),
)

choromap3 = go.Figure(data=[cluster_trace], layout=layout)

iplot(choromap3)