![](https://i.ytimg.com/vi/4b5d3muPQmA/maxresdefault.jpg)m.youtube.com

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import scipy.cluster.hierarchy as hcluster
from sklearn.cluster import AgglomerativeClustering


#Ignore warnings
import warnings
warnings.filterwarnings('ignore')

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Location of the Pfizer allocation CSV inside the Kaggle input directory.
PFIZER_ALLOCATIONS_CSV = '../input/pfizer-covid19-vaccine-distribution-allocations/COVID-19_Vaccine_Distribution_Allocations_by_Jurisdiction_-_Pfizer.csv'

# Load the raw allocation table and preview the first rows.
df = pd.read_csv(PFIZER_ALLOCATIONS_CSV, encoding='utf8')
df.head()

#Codes by Diksha Bhati https://www.kaggle.com/dikshabhati2002/k-means-and-hierarchical-clustering/notebook

In [None]:
# Build the feature table: everything except the 'Jurisdiction' name column,
# which is kept in `df` for labelling the results later.
data = df.drop(columns=['Jurisdiction'])

In [None]:
from sklearn.preprocessing import LabelEncoder

# Fill missing values in float columns with the column mean.
# `Series.fillna` returns a new Series, so the result must be assigned back;
# the previous version discarded it and the imputation never took effect.
for c in data.columns:
    if pd.api.types.is_float_dtype(data[c]):
        data[c] = data[c].fillna(data[c].mean())

# Whatever is still missing (non-float columns) gets the sentinel -999.
data = data.fillna(-999)

# Label-encode every text column so the feature matrix is fully numeric.
for f in data.columns:
    if data[f].dtype == 'object':
        lbl = LabelEncoder()
        # Cast to str so a column mixing text with the -999 sentinel can still
        # be encoded; columns that are already all-text are unaffected.
        data[f] = lbl.fit_transform(data[f].astype(str).tolist())

print('Labelling done.')

In [None]:
# One-hot encode any remaining categorical columns.
# NOTE(review): object columns were already label-encoded in the previous cell,
# so this is presumably a no-op here — confirm before relying on it.
data = pd.get_dummies(data)

In [None]:
# Preview the encoded feature table before it is scaled.
data.head()

# Scaling

In [None]:
# Standardise every feature (zero mean, unit variance) so that no single
# column dominates the distance computations used by the clustering below.
scaling = StandardScaler()
scaling.fit(data)
scaled = scaling.transform(data)

In [None]:
# Wrap the scaled array in a DataFrame that carries the original column names.
scaled_df = pd.DataFrame(data=scaled, columns=data.columns)

# Preview the scaled dataset.
scaled_df.head()

In [None]:
# Elbow curve: inertia (within-cluster sum of squares) for k = 1..9.
# The models are fitted on the *scaled* features, the same input the final
# K-Means model uses, and with a fixed seed so the curve is reproducible.

a = []            # inertia for each candidate k
K = range(1, 10)  # candidate numbers of clusters
for i in K:
    kmean = KMeans(n_clusters=i, random_state=111)
    kmean.fit(scaled_df)
    a.append(kmean.inertia_)

plt.plot(K, a, marker='o')
plt.title('Elbow Method', fontsize=15)
plt.xlabel('Number of clusters', fontsize=15)
plt.ylabel('Sum of Squared distance', fontsize=15)
plt.show()

In [None]:
# Fit the final K-Means model on the scaled features.
# Four clusters is the value chosen after inspecting the elbow curve;
# the seed is fixed so the cluster labels are reproducible.
kmeans = KMeans(
    n_clusters=4,
    random_state=111,
)
kmeans.fit(scaled_df)

In [None]:
# Count how many records (rows of scaled_df) were assigned to each cluster label.
pd.Series(kmeans.labels_).value_counts()

# Silhouette Coefficient

In [None]:
# Evaluate cluster quality with the mean Silhouette Coefficient for the fitted K=4 model.
# The score is computed on the same scaled features the model was trained on;
# values closer to 1 indicate better-separated clusters.

metrics.silhouette_score(scaled_df, kmeans.labels_)

# Prediction

In [None]:
# Cluster label for every row. The model is already fitted on scaled_df with a
# fixed seed, so reuse its labels instead of refitting with fit_predict.
cluster_labels = kmeans.labels_

In [None]:
# Attach the K-Means labels to a copy of the raw table.
# pd.DataFrame(df) does not copy by default, so adding a column to it can leak
# into `df` itself; an explicit copy keeps the raw frame untouched.
preds = kmeans.labels_
kmeans_df = df.copy()
kmeans_df['KMeans_Clusters'] = preds
kmeans_df.head(10)

In [None]:
# Write the table with its K-Means cluster labels to kmeans_result.csv (row index omitted).
kmeans_df.to_csv('kmeans_result.csv',index=False)

# Visualization of clusters

In [None]:
# K-Means clusters: second-dose allocation per jurisdiction, coloured by cluster.
# x and y are passed by keyword (column names resolved against `data`);
# recent seaborn releases reject positional x/y arguments.
sns.scatterplot(
    x='Jurisdiction',
    y='Total Allocation Pfizer "Second Dose" Shipments',
    hue='KMeans_Clusters',
    data=kmeans_df,
)
plt.title("Pfizer Vaccination Shipments by Jurisdiction", fontsize=15)
plt.xlabel("Jurisdiction", fontsize=12)
plt.ylabel('Total Allocation Pfizer "Second Dose" Shipments', fontsize=12)
plt.xticks(rotation=45)
plt.show()

In [None]:
# K-Means clusters: second-dose allocation per HHS region, coloured by cluster.
# x and y are passed by keyword (column names resolved against `data`);
# recent seaborn releases reject positional x/y arguments.
sns.scatterplot(
    x='HHS Region',
    y='Total Allocation Pfizer "Second Dose" Shipments',
    hue='KMeans_Clusters',
    data=kmeans_df,
)
plt.title("Pfizer Vaccination Shipments by Region", fontsize=15)
plt.xlabel("HHS Region", fontsize=12)
plt.ylabel('Total Allocation Pfizer "Second Dose" Shipments', fontsize=12)
plt.xticks(rotation=45)
plt.show()

In [None]:
# Group jurisdiction names by the K-Means cluster they were assigned to.
# The `vaccinated*` variables are cluster buckets (labels 0-3); the counts
# printed below are jurisdictions per cluster.
cluster_ids = kmeans_df['KMeans_Clusters']
vaccinated = kmeans_df.loc[cluster_ids == 0, 'Jurisdiction']
vaccinated1 = kmeans_df.loc[cluster_ids == 1, 'Jurisdiction']
vaccinated2 = kmeans_df.loc[cluster_ids == 2, 'Jurisdiction']
vaccinated3 = kmeans_df.loc[cluster_ids == 3, 'Jurisdiction']

for caption, members in (
    ("Number of vaccinated", vaccinated),
    ("Number of vaccinated1", vaccinated1),
    ("Number of vaccinated2", vaccinated2),
    ("Number of vaccinated3", vaccinated3),
):
    print(caption, len(members))

In [None]:
# Jurisdiction names currently held in the `vaccinated3` cluster bucket.
vaccinated3.tolist()

In [None]:
# Jurisdiction names currently held in the `vaccinated2` cluster bucket.
vaccinated2.tolist()

In [None]:
# Jurisdiction names currently held in the `vaccinated1` cluster bucket.
vaccinated1.tolist()

In [None]:
# Jurisdiction names currently held in the `vaccinated` cluster bucket.
vaccinated.tolist()

In [None]:
# Print a confirmation for every entry of `vaccinated2` that equals 'Hawaii'
# (nothing is printed when there is no match).
for i in vaccinated2[vaccinated2 == 'Hawaii']:
    print('Yes', i , 'is present in vaccinated2 list')

In [None]:
# Report whether Hawaii belongs to the `vaccinated3` cluster bucket.
# The earlier for/else always ran its `else` branch (the loop never breaks) and
# printed the last jurisdiction in the bucket instead of the one searched for.
target = 'Hawaii'
if target in set(vaccinated3):
    print('Yes', target, 'is present in vaccinated3 list')
else:
    print('No', target, 'is not present in vaccinated3 list')

<div class="alert alert-block alert-success">
     Dendrogram

A Dendrogram is a type of tree diagram showing hierarchical relationships between different sets of data.

</div>


In [None]:
# Plot the dendrogram of a Ward-linkage hierarchy built on the scaled features.
# The figure is made very wide so the leaf labels stay legible.
plt.figure(figsize=(50, 12))
ward_linkage = hcluster.linkage(scaled_df, method='ward')
dend = hcluster.dendrogram(ward_linkage)

<div class="alert alert-block alert-success">
     
Hierarchical Clustering

Hierarchical clustering, also called hierarchical cluster analysis (HCA), is an unsupervised clustering algorithm that builds a hierarchy of clusters ordered from top to bottom. The technique comes in two types: Agglomerative Hierarchical Clustering and Divisive Hierarchical Clustering.

</div>

In [None]:
# Get cluster labels from agglomerative (bottom-up) hierarchical clustering.
# - The estimator gets its own name so it no longer shadows the
#   `scipy.cluster.hierarchy as hcluster` import used by the dendrogram cell.
# - `affinity=` is omitted: Ward linkage only supports Euclidean distance (the
#   default), and the argument was renamed to `metric` in newer scikit-learn.
agglomerative = AgglomerativeClustering(n_clusters=4, linkage='ward')
hcluster_label = agglomerative.fit_predict(scaled_df)

In [None]:
# Attach the hierarchical-clustering labels to a copy of the raw table.
# pd.DataFrame(df) does not copy by default, so adding a column to it can leak
# into `df` itself; an explicit copy keeps the raw frame untouched.
hcluster_df = df.copy()
hcluster_df['hcluster'] = hcluster_label
# First few rows of hcluster_df
hcluster_df.head()

In [None]:
# Hierarchical clusters: first-dose allocation per jurisdiction, coloured by cluster.
# x and y are passed by keyword (column names resolved against `data`);
# recent seaborn releases reject positional x/y arguments.
sns.scatterplot(
    x='Jurisdiction',
    y='Total Pfizer Allocation "First Dose" Shipments',
    hue='hcluster',
    data=hcluster_df,
)
plt.title("Pfizer Allocations Shipments by Jurisdiction", fontsize=15)
plt.xlabel("Jurisdiction", fontsize=12)
plt.ylabel('Total Allocation Pfizer "First Dose" Shipments', fontsize=12)
plt.xticks(rotation=45)
plt.show()

In [None]:
# Hierarchical clusters: first-dose allocation per HHS region, coloured by cluster.
# x and y are passed by keyword (column names resolved against `data`);
# recent seaborn releases reject positional x/y arguments.
sns.scatterplot(
    x='HHS Region',
    y='Total Pfizer Allocation "First Dose" Shipments',
    hue='hcluster',
    data=hcluster_df,
)
# The x axis is the HHS region, so the title says "by Region".
plt.title("Pfizer Allocations Shipments by Region", fontsize=15)
plt.xlabel("HHS Region", fontsize=12)
plt.ylabel('Total Allocation Pfizer "First Dose" Shipments', fontsize=12)
plt.xticks(rotation=45)
plt.show()

In [None]:
# Group jurisdiction names by their hierarchical-clustering label.
# Note the reversed numbering relative to the labels: `vaccinated3` holds
# label 0 and `vaccinated` holds label 3. The printed counts are
# jurisdictions per cluster.
hier_ids = hcluster_df['hcluster']
vaccinated3 = hcluster_df.loc[hier_ids == 0, 'Jurisdiction']
vaccinated2 = hcluster_df.loc[hier_ids == 1, 'Jurisdiction']
vaccinated1 = hcluster_df.loc[hier_ids == 2, 'Jurisdiction']
vaccinated = hcluster_df.loc[hier_ids == 3, 'Jurisdiction']

for caption, members in (
    ("Number of vaccinated3", vaccinated3),
    ("Number of vaccinated2", vaccinated2),
    ("Number of vaccinated1", vaccinated1),
    ("Number of vaccinated", vaccinated),
):
    print(caption, len(members))

In [None]:
#Code by Olga Belitskaya https://www.kaggle.com/olgabelitskaya/sequential-data/comments
from IPython.display import display,HTML
# Style presets: two colours, two Google Fonts families, two font sizes (px).
c1, c2, f1, f2, fs1, fs2 = '#eb3434', '#eb3446', 'Akronim', 'Smokum', 30, 15


def dhtml(string, fontcolor=c1, font=f1, fontsize=fs1):
    """Display `string` as a styled <h1> heading in the notebook output.

    The generated HTML imports the requested Google Fonts family with the
    "3d-float" effect and applies the given colour and size inline.

    Args:
        string: Text placed inside the heading (inserted with %s formatting).
        fontcolor: CSS colour of the heading text.
        font: Google Fonts family name.
        fontsize: Font size in pixels.
    """
    fragments = [
        "<style>\n    @import 'https://fonts.googleapis.com/css?family=",
        font,
        "&effect=3d-float';</style>\n    <h1 class='font-effect-3d-float' style='font-family:",
        font,
        "; color:",
        fontcolor,
        "; font-size:",
        str(fontsize),
        "px;'>%s</h1>" % string,
    ]
    display(HTML("".join(fragments)))
    
    
# Closing banner; the author's name is written with the proper "í"
# (the previous literal contained mis-decoded UTF-8 bytes).
dhtml('Be patient. Marília Prata, @mpwolke was Here')