## Clustering Analysis

### Libraries & Dependencies

In [None]:
from sklearn.cluster import AgglomerativeClustering 
import scipy.cluster.hierarchy as sch 

import matplotlib.pyplot as plt
import plotly.graph_objects as go

import pandas as pd 
from geopy.geocoders import Nominatim 
from progressbar import ProgressBar 
import time 

### Hierarchical Agglomerative Clustering (HAC)

In [None]:

# Let pandas render up to 50 columns so wide frames are not truncated
pd.set_option('display.max_columns', 50)

# Load the Australian weather observations from the local CSV file
df = pd.read_csv(
    'weatherAUS.csv',
    encoding='utf-8',
)



#### Data Preprocessing

In [None]:
# Drop records where the target RainTomorrow is missing
df = df[df['RainTomorrow'].notna()]

# For the remaining gaps, fill numeric columns with their column mean.
# numeric_only=True is required: the frame also holds text columns (e.g. Location),
# and recent pandas versions raise a TypeError when asked to average those.
df = df.fillna(df.mean(numeric_only=True))

# Add spaces between multiple words in location names by putting a space before
# every capital letter and stripping the leading one.
# regex=True must be explicit: newer pandas treats the pattern as a literal string
# by default, which would silently leave the names unchanged.
df['Location2'] = df['Location'].str.replace(r"([A-Z])", r" \1", regex=True).str.strip()

# Update Location for Pearce RAAF so it can be found by the geolocator
# (the capital-letter split above turns "RAAF" into "R A A F").
df['Location2'] = df['Location2'].replace({'Pearce R A A F': 'Pearce, Bullsbrook'})

In [None]:
# Show a snapshot of the data (first seven rows)
df.head(n=7)

In [None]:
# Display the data type pandas assigned to each column
df.dtypes

In [None]:
# Create a list of unique locations (cities); missing names are dropped because
# they cannot be concatenated into a query string
loc_list = list(df['Location2'].dropna().unique())

# NOTE(review): "add-your-agent-name" is a placeholder user agent - replace it with
# a value identifying your application before running against the public service
geolocator = Nominatim(user_agent="add-your-agent-name")
country ="Australia"
loc_res = []      # one [name, latitude, longitude] record per resolved city
loc_missing = []  # cities the geocoder could not resolve

pbar = ProgressBar() # This will help us to show the progress of our iteration
for city in pbar(loc_list):
    loc = geolocator.geocode(city+','+ country)
    if loc is None:
        # geocode() returns None when nothing matches; remember the name instead
        # of failing on loc.latitude with an opaque AttributeError
        loc_missing.append(city)
    else:
        res = [city, loc.latitude, loc.longitude]
        loc_res.append(res)  # append in place rather than rebuilding the list
    time.sleep(1) # sleep for 1 second before submitting the next query

# Make skipped locations visible so the reader knows the map is incomplete
if loc_missing:
    print('WARNING: no geocoding result for: ' + ', '.join(loc_missing))

# Add locations to a dataframe
df_loc = pd.DataFrame(loc_res, columns=['Loc', 'Latitude', 'Longitude'])

# Show data
df_loc

In [None]:
# Plot every geocoded location as a black marker; hovering shows the location name
fig = go.Figure(
    data=go.Scattergeo(
        lat=df_loc['Latitude'],
        lon=df_loc['Longitude'],
        hovertext=df_loc['Loc'],
        mode='markers',
        marker={'color': 'black'},
    )
)

# Size the figure and centre/zoom the world map on Australia
fig.update_layout(
    width=980,
    height=720,
    margin=dict(r=0, t=10, l=0, b=10),
    geo={
        'scope': 'world',
        'projection_type': 'miller',
        'landcolor': "rgb(250, 250, 250)",
        'center': {'lat': -25.69839, 'lon': 139.8813},  # focus point
        'projection_scale': 6,  # zoom factor
    },
)
fig.show()

### HAC Clustering - Dendrogram

### Average Linkage

In [None]:

# Clustering features: the coordinates of each location
X = df_loc.loc[:, ['Latitude', 'Longitude']]

# Open a large, high-resolution canvas for the dendrogram
plt.figure(figsize=(16, 9), dpi=300)

# Build the merge hierarchy - note method='average' (average linkage)
Z = sch.linkage(X, method='average', optimal_ordering=True)

# Colours used for the branches that fall below the cut height
sch.set_link_color_palette(['red', 'blue', 'green', 'yellow'])

# Draw the dendrogram, labelling leaves with location names; links above the
# cut height of 14.55 are drawn in black
sch.dendrogram(
    Z,
    labels=df_loc['Loc'].tolist(),
    leaf_rotation=90,
    leaf_font_size=10,
    color_threshold=14.55,
    above_threshold_color='black',
)

# Mark the cut height with a dashed horizontal line
plt.axhline(y=14.55, color='grey', linewidth=1, linestyle='dashed')

# Render the plot
plt.show()

### Ward Linkage

In [None]:
# Clustering features: the coordinates of each location
X = df_loc.loc[:, ['Latitude', 'Longitude']]

# Open a large, high-resolution canvas for the dendrogram
plt.figure(figsize=(16, 9), dpi=300)

# Build the merge hierarchy - note method='ward' (Ward linkage)
Z = sch.linkage(X, method='ward', optimal_ordering=True)

# Colours used for the branches that fall below the cut height
sch.set_link_color_palette(['red', 'blue', 'green', 'yellow'])

# Draw the dendrogram, labelling leaves with location names; links above the
# cut height of 25 are drawn in black
sch.dendrogram(
    Z,
    labels=df_loc['Loc'].tolist(),
    leaf_rotation=90,
    leaf_font_size=10,
    color_threshold=25,
    above_threshold_color='black',
)

# Mark the cut height with a dashed horizontal line
plt.axhline(y=25, color='grey', linewidth=1, linestyle='dashed')

# Render the plot
plt.show()

### HAC Clustering - Cluster the Dataset

In [None]:
# Set the models and their parameters
# note, options for linkage: {'ward', 'complete', 'average', 'single'}, default='ward'
# The distance is left at its default (Euclidean) on purpose: the keyword that selects
# it was renamed from `affinity` to `metric` in newer scikit-learn releases, so passing
# `affinity='euclidean'` fails there, while omitting it works on old and new versions.
modela4 = AgglomerativeClustering(n_clusters=4, linkage='average')
modelw4 = AgglomerativeClustering(n_clusters=4, linkage='ward')

# Fit HAC on our data (X holds the Latitude/Longitude columns selected earlier)
clusta4 = modela4.fit(X)
clustw4 = modelw4.fit(X)

# Attach cluster labels back to the location dataset
df_loc['AverageL Clustering'] = clusta4.labels_
df_loc['WardsL Clustering'] = clustw4.labels_

# Show data
df_loc

### Average Linkage

In [None]:
# Plot the locations coloured by their average-linkage cluster label;
# the hover text is built from the location name and its cluster label
fig = go.Figure(
    data=go.Scattergeo(
        lat=df_loc['Latitude'],
        lon=df_loc['Longitude'],
        hovertext=df_loc[['Loc', 'AverageL Clustering']],
        mode='markers',
        marker={
            'colorscale': ['blue', 'green', 'yellow', 'red'],
            'color': df_loc['AverageL Clustering'],
        },
    )
)

# Size the figure, hide the legend and centre/zoom the world map on Australia
fig.update_layout(
    showlegend=False,
    width=980,
    height=720,
    margin=dict(r=0, t=10, l=0, b=10),
    geo={
        'scope': 'world',
        'projection_type': 'miller',
        'landcolor': "rgb(250, 250, 250)",
        'center': {'lat': -25.69839, 'lon': 139.8813},  # focus point
        'projection_scale': 6,  # zoom factor
    },
)
fig.show()

### Ward Linkage

In [None]:
# Plot the locations coloured by their Ward-linkage cluster label;
# the hover text is built from the location name and its cluster label
fig = go.Figure(
    data=go.Scattergeo(
        lat=df_loc['Latitude'],
        lon=df_loc['Longitude'],
        hovertext=df_loc[['Loc', 'WardsL Clustering']],
        mode='markers',
        marker={
            'colorscale': ['yellow', 'green', 'blue', 'red'],
            'color': df_loc['WardsL Clustering'],
        },
    )
)

# Size the figure, hide the legend and centre/zoom the world map on Australia
fig.update_layout(
    showlegend=False,
    width=980,
    height=720,
    margin=dict(r=0, t=10, l=0, b=10),
    geo={
        'scope': 'world',
        'projection_type': 'miller',
        'landcolor': "rgb(250, 250, 250)",
        'center': {'lat': -25.69839, 'lon': 139.8813},  # focus point
        'projection_scale': 6,  # zoom factor
    },
)
fig.show()