## <font color='green'>1. K-means Clustering on Covid Data</font>
* Clustering algorithms in Python: https://scikit-learn.org/stable/modules/clustering.html

In [None]:
import os

import numpy as np
import pandas as pd
import math

# Directory that holds the data files read by this notebook.
# Set the ECON_DATA_DIR environment variable to point at your own copy;
# when it is unset, the original author's local path is used.
DATA_DIR = os.environ.get(
    'ECON_DATA_DIR',
    '/Users/hj020/Desktop/2022/EconomicAnalytics-master/Python_/Data',
)
# Files are read by bare file name, so the working directory is switched here.
os.chdir(DATA_DIR)

# Covid data
# positivelast7per1k: # of new positive cases per 1,000 people during Oct 24, 2020 - Nov 02, 2020
# testpositivitylast7: percent of new Covid tests that were positive during Oct 24, 2020 - Nov 02, 2020
raw0 = pd.read_csv('covid.csv')
raw0.head()

In [None]:
# Feature matrix: every column of raw0 except the first one.
X = raw0.iloc[:, 1:]

### <font color='green'>i) K-means for Three Clusters</font>

In [None]:
from sklearn.cluster import KMeans

# Build the three-cluster model with a fixed seed, then fit it on the feature matrix.
KMres = KMeans(n_clusters=3, random_state=0)
KMres.fit(X)

# Cluster index assigned to each row of X.
y_pred = KMres.predict(X)

In [None]:
# Display the cluster index predicted for each row.
y_pred

In [None]:
# Visualization: scatter of the two features, coloured by cluster, with a text label on every point.
import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize=(12, 10))
x_vals, y_vals = X.iloc[:, 0], X.iloc[:, 1]
ax.scatter(x_vals, y_vals, c=y_pred)

# The first column of raw0 supplies the label drawn next to each point.
for txt, px, py in zip(raw0.iloc[:, 0], x_vals, y_vals):
    ax.annotate(txt, (px, py), size=14)

ax.set_xlabel('Positive Cases per 1000 People in the last 7 days ', fontsize=15)
ax.set_ylabel('Positivity Rate in the last 7 days', fontsize=15)
ax.set_title('Covid Status by State', fontsize=15)

plt.show()

### <font color='green'>ii) K-means for different numbers of clusters</font>

In [None]:
# Fit K-means with 2, 3, 4 and 5 clusters and draw each result in its own panel of a 2x2 grid.
plt.figure(figsize=(18, 18))

for panel, n_clusters in enumerate(range(2, 6), start=1):
    plt.subplot(2, 2, panel)
    # NOTE: KMres and y_pred are reassigned on every pass, so after this cell
    # they hold the five-cluster result rather than the three-cluster one.
    KMres = KMeans(n_clusters=n_clusters, random_state=0).fit(X)
    y_pred = KMres.predict(X)

    plt.scatter(X.iloc[:, 0], X.iloc[:, 1], c=y_pred)
    plt.title("# of Clusters = %d" % n_clusters)

    # The label loop uses its own index name so it cannot overwrite the panel counter.
    for row, txt in enumerate(raw0.iloc[:, 0]):
        plt.annotate(txt, (X.iloc[row, 0], X.iloc[row, 1]), size=10)

plt.show()

## <font color='green'>2. Hierarchical Clustering on Cancer Cell Line Data (NCI60)</font>
* https://scikit-learn.org/stable/modules/clustering.html#hierarchical-clustering
* An example: https://joernhees.de/blog/2015/08/26/scipy-hierarchical-clustering-and-dendrogram-tutorial/

In [None]:
# NCI60: 6,830 gene expression measurements on 64 cancer cell lines
raw = pd.read_csv('NCI60.csv')
# Print the table dimensions first, then a preview of the leading rows.
print(raw.shape, raw.head(), sep='\n')

In [None]:
# The cancer types in the last column are left out of the clustering input.
# Once clustering is done, they are used to check how well the clusters agree with them.

# Feature matrix: drop the first and the last column of the raw table.
X = raw.iloc[:, 1:-1]
# Names for the dendrogram leaves, taken from the 'labs' column as a plain Python list.
Label = raw['labs'].tolist()

### <font color='green'>i) Draw a Dendrogram</font>
* Linkage functions: https://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.linkage.html
* Available linkage functions: single, complete, average, weighted, centroid, median, and ward

In [None]:
# Full dendrogram built with complete linkage
from scipy.cluster import hierarchy

Z = hierarchy.linkage(X, method='complete')

plt.figure(figsize=(25, 10))
plt.xlabel('Cancer types', size=12)

leaf_style = {
    'leaf_rotation': 90,    # rotate the labels on the x-axis
    'leaf_font_size': 12,
    'labels': Label,
}
hierarchy.dendrogram(Z, **leaf_style)

plt.title('Hierarchical Clustering on NCI60', size=20)
plt.show()

### <font color='green'>ii) Truncate the dendrogram to determine clusters</font>

In [None]:
# Truncated dendrogram: keep only the last five merged clusters as leaves.
plt.figure(figsize=(25, 10))
plt.xlabel('Cancer types', size=12)
hierarchy.dendrogram(Z,
                     # 'lastp' contracts the tree to p leaves, so p is the number of clusters shown.
                     # The other option, 'level', limits the display to p levels below the final
                     # merge, which does not correspond to p clusters.
                     truncate_mode='lastp',
                     p=5,  # number of clusters
                     leaf_rotation=90,
                     leaf_font_size=12,
                     labels=Label,
                     )

plt.title('Hierarchical Clustering on NCI60 with five clusters', size=20)
plt.show()

### <font color='darkred'>HW9</font>
* Display six dendrograms that are created on the Covid data using the single, complete, average, weighted, centroid, and ward linkage functions, in one figure with 3-by-2 subplots.