# Importing the necessary Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot 
init_notebook_mode(connected=True)  
import cufflinks as cf  
cf.go_offline() 
import os
# Load the raw review-ratings CSV and preview the first rows.
ratings_csv = '../input/google_review_ratings.csv'
df = pd.read_csv(filepath_or_buffer=ratings_csv)
df.head()

# Performing Data Cleaning/Preprocessing Sequences
This involves steps like:-
1. Removal of Unnamed Column(s)
2. Column(s) Renaming
3. Checking the correct data type(s) of the respective column(s)
4. Finding empty instance(s) and filling them with suitable statistical aspect(s)

In [None]:
# Keep only the columns whose name does not begin with 'Unnamed'.
unnamed_mask = df.columns.str.startswith('Unnamed')
df = df.loc[:, ~unnamed_mask]

In [None]:
# Inspect column dtypes and non-null counts before any type conversion.
df.info()

In [None]:
# Names of the 24 rating columns as they appear in the raw file: 'Category 1' .. 'Category 24'.
Cols = [f'Category {number}' for number in range(1, 25)]

In [None]:
# Convert every rating column to a numeric dtype; values that cannot be parsed become NaN.
df[Cols] = df[Cols].apply(pd.to_numeric, errors='coerce')

In [None]:
# Re-check dtypes and non-null counts after the numeric coercion.
df.info()

In [None]:
# Count missing values per column.
df.isna().sum()

In [None]:
# Replace missing ratings with the mean of their column.
# numeric_only=True keeps the mean restricted to numeric columns: the frame still
# holds the string identifier column at this point, and recent pandas versions
# raise a TypeError when asked to average it.
column_means = df.mean(numeric_only=True)
df = df.fillna(column_means)

In [None]:
# Count missing values per column again, after the fill.
df.isna().sum()

In [None]:
# Summary statistics of the numeric columns.
df.describe()

In [None]:
# Descriptive column names, in the same order as the columns currently in df:
# the identifier first, followed by the 24 rating categories.
New_cols = [
    'user_id',
    'churches', 'resorts', 'beaches', 'parks',
    'theatres', 'museums', 'malls', 'zoo',
    'restaurants', 'pubs_bars', 'local_services', 'burger_pizza_shops',
    'hotels_other_lodgings', 'juice_bars', 'art_galleries', 'dance_clubs',
    'swimming_pools', 'gyms', 'bakeries', 'beauty_spas',
    'cafes', 'view_points', 'monuments', 'gardens',
]
df.columns = New_cols

We are saving a copy of the dataset for Hierarchical clustering

In [None]:
# Split 'user_id' on spaces; piece 0 is discarded, piece 1 is kept as 'id'.
new = df['user_id'].str.split(' ', n=2, expand=True)

# x = every rating column plus the extracted 'id' (user_id itself is dropped).
# NOTE(review): 'id' stays in x as a string column and is later passed to the
# linkage together with the ratings -- confirm that is intended.
x = df.drop(columns=['user_id']).assign(id=new[1])
x.head()

# EDA

In [None]:
# Select the rating columns by name rather than by position: New_cols has
# 'user_id' removed in a later cell, and a positional New_cols[1:] would then
# silently drop the first category if this cell were run again.
rating_cols = [col for col in New_cols if col != 'user_id']

# Mean rating per category, sorted ascending.
AvgR = df[rating_cols].mean().sort_values()
positions = np.arange(len(AvgR))

plt.figure(figsize=(10,7))
plt.barh(positions, AvgR.values, align='center')
plt.yticks(positions, AvgR.index)
plt.ylabel('Categories')
plt.xlabel('Average Rating')
plt.title('Average Rating for every Category')

Here is the horizontal bar graph showing the mean (average) review rating of each Category

In [None]:
# Keep only the rating-category names. Filtering (instead of list.remove) makes
# this cell safe to run more than once: a second .remove('user_id') would raise
# ValueError because the entry is already gone.
New_cols = [col for col in New_cols if col != 'user_id']

In [None]:
# Interactive box plot with one box per column listed in New_cols.
df[New_cols].iplot(kind='box')

Here is the boxplot visualization of every Category

# Clustering
For the purpose of clustering we have decided to go with:-
1. KMeans Clustering
2. Hierarchical Clustering

Observe carefully how we carry out the task, keeping every attribute/feature that gives us the best result

# KMeans Clustering
We start with KMeans Clustering, determining the best number of clusters using the **Elbow Method** with the Within-Cluster-Sum-of-Squares (WCSS), which is applied as follows:-

In [None]:
# Feature matrix for the elbow search: every rating column. 'user_id' is dropped
# by name, and so is 'Cluster' if a later cell has already added it, so that
# re-running this cell never feeds cluster labels back in as a feature.
vals = df.drop(columns=['user_id', 'Cluster'], errors='ignore').values

from sklearn.cluster import KMeans

# Candidate numbers of clusters: 1 .. 29.
cluster_counts = range(1, 30)

wcss = []
for ii in cluster_counts:
    # random_state is fixed so the curve is reproducible from run to run.
    kmeans = KMeans(n_clusters=ii, init="k-means++", n_init=10, max_iter=300, random_state=42)
    kmeans.fit(vals)
    # inertia_ is the within-cluster sum of squares of the fitted model.
    wcss.append(kmeans.inertia_)

# Plot against the actual cluster counts; plotting the list alone would put the
# value for k clusters at x = k - 1, contradicting the axis label.
plt.plot(cluster_counts, wcss, 'ro-', label="WCSS")
plt.title("Computing WCSS for KMeans++")
plt.xlabel("Number of clusters")
plt.ylabel("WCSS")
plt.show()

**Graph Description:** We observe from the above graph that the best value of k is 4, since that is the point where the slope of the curve changes (the "elbow")

In [None]:
# X: rating features only. 'Cluster' is dropped too (when present) so that
# re-running this cell after the labels have been attached to df does not leak
# them into the feature matrix; errors='ignore' covers the first run, when the
# column does not exist yet.
X = df.drop(columns=['user_id', 'Cluster'], errors='ignore').values
# Y: the user identifiers, in the same row order as X.
Y = df['user_id'].values

In [None]:
# Final model with the 4 clusters chosen from the elbow plot.
# random_state is fixed so the cluster labels are reproducible.
km = KMeans(n_clusters=4, init="k-means++", n_init=10, max_iter=500, random_state=42)

# Fit and label with `km` itself. The previous code called `kmeans.fit_predict`,
# which reused the last model left over from the elbow loop (29 clusters) and
# never used the 4-cluster model defined above.
y_pred = km.fit_predict(X)

In [None]:
# Attach each row's cluster label so it can drive the plot colouring.
df["Cluster"] = y_pred

# Plot every column except the identifier; 'Cluster' stays in for the hue.
cols = [name for name in df.columns if name != "user_id"]

sns.pairplot(data=df[cols], hue="Cluster")

**Graph Description:** This is how clustering is done keeping each and every attribute/column for the task

# Hierarchical Clustering
For the purpose of Hierarchical Clustering we decided to go with **Ward** linkage. Ward's minimum variance criterion minimizes the total within-cluster variance. This is how we proceed:-

In [None]:
import scipy.cluster.hierarchy as sch
from sklearn.preprocessing import scale as s
from scipy.cluster.hierarchy import dendrogram, linkage

In [None]:
# Ward linkage over the hierarchical-clustering frame, drawn as a full dendrogram.
Z = sch.linkage(x, method='ward')
den = sch.dendrogram(Z)

# Hide the x-axis ticks and their labels.
hide_x_ticks = dict(
    axis='x',
    which='both',
    bottom=False,
    top=False,
    labelbottom=False,
)
plt.tick_params(**hide_x_ticks)
plt.title('Hierarchical Clustering')

**Graph Description:** By looking at the above dendrogram, we observe 3 distinct colors in the dendrogram, but this will not determine how many clusters are formed.

Given below is special function made to serve the purpose of drawing the line which cuts the generated dendrogram to determine the number of clusters and the dendrogram node(s) which are below the cutting line 

In [None]:
def fd(*args, **kwargs):
    """Draw a dendrogram with merge heights annotated and an optional cut line.

    All positional and keyword arguments are forwarded to ``dendrogram``,
    except for two keywords that are consumed here:

    max_d : number, optional
        Height at which a horizontal black line is drawn. When truthy it is
        also used as ``color_threshold`` unless the caller supplied one.
    annotate_above : number, default 0
        Only merges whose height is greater than this value get a marker and
        a text label.

    Returns the dictionary produced by ``dendrogram``. When ``no_plot`` is
    passed as a truthy keyword, nothing is drawn by this function.
    """
    cut_height = kwargs.pop('max_d', None)
    min_annotated = kwargs.pop('annotate_above', 0)
    if cut_height:
        # Never overrides a color_threshold the caller passed explicitly.
        kwargs.setdefault('color_threshold', cut_height)

    ddata = dendrogram(*args, **kwargs)

    if kwargs.get('no_plot', False):
        return ddata

    plt.title('Hierarchical Clustering Dendrogram (truncated)')
    plt.xlabel('sample index or (cluster size)')
    plt.ylabel('distance')

    links = zip(ddata['icoord'], ddata['dcoord'], ddata['color_list'])
    for xs, ys, colour in links:
        height = ys[1]
        if not height > min_annotated:
            continue
        # Horizontal midpoint of the link's top bar.
        centre = 0.5 * (xs[1] + xs[2])
        plt.plot(centre, height, 'o', c=colour)
        plt.annotate("%.3g" % height, (centre, height), xytext=(0, -5),
                     textcoords='offset points',
                     va='top', ha='center')

    if cut_height:
        plt.axhline(y=cut_height, c='k')
    return ddata

In [None]:
# Ward linkage, drawn with a cut line at 80000 and height labels on merges above 30000.
Z = linkage(x, method='ward')
fd(
    Z,
    max_d=80000,
    annotate_above=30000,
    show_contracted=True,
    leaf_rotation=90.,
)

# Hide the x-axis ticks and their labels.
plt.tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False)

**Graph Description:** Following the main criterion of cutting the dendrogram appropriately, we discover that there are basically 2 clusters, as can also be observed from the above graph. Observing the height of each dendrogram division, we decided to go with 80000 as the height where the line would be drawn and 30000 as the threshold above which dendrogram nodes are annotated