# Cluster analysis of SDMoA paintings

## Importing stuff

In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from itertools import cycle #for plotting clusters
import seaborn as sns
sns.set()
from sklearn.cluster import AffinityPropagation, KMeans
from sklearn import metrics
from sklearn.manifold import TSNE 
from sklearn import preprocessing

In [None]:
# Read the matrix of average emotion scores (one row per painting) from CSV
# into a pandas DataFrame.
aveEmotionsMATLAB = pd.read_csv("aveEmotion.csv")
# aveEmotionsMATLAB.head(5)

In [None]:
# Import all rating data into one array indexed as [painting, column, subject].
# Column 0 is the painting ID (it is compared across subjects in the order check
# and later labelled 'ID'); columns 1-9 hold the nine emotion ratings.
ratingData = np.empty((109, 10, 21))

# Parse the workbook once: a list of sheet positions makes read_excel return a
# dict {position: DataFrame}, instead of re-opening the file for every subject.
# Only sheet positions 1..21 are read (position 0 is skipped, as before).
subjectSheets = pd.read_excel("ratingData-sorted.xlsx",
                              sheet_name=list(range(1, 22)),
                              usecols=range(3, 13))

for i in range(1, 22):
    ratingData[:, :, i-1] = subjectSheets[i].to_numpy()

In [None]:
# Sanity check: every subject's sheet must list the paintings in the same order.
# Compare each subject's ID column (column 0) with subject 0's and print one
# True/False per subject.
referenceIDs = ratingData[:, 0, 0]
for subject in range(0, 21):
    sameOrder = (ratingData[:, 0, subject] == referenceIDs).all()
    print(sameOrder)
#ratingData[:,0,:]

## An exploration in normalization...

In [None]:
#histograms of scores for each emotion for sub01 (pre-normalizing)

# Draw directly on the Axes returned by plt.subplots. Calling
# plt.subplot(331+k, xlabel=..., ...) with extra keyword arguments does not reuse
# those Axes: it adds a second Axes at the same grid position, so the
# sharex/sharey grid requested here was replaced or drawn over.
fig, axes = plt.subplots(nrows=3, ncols=3, sharey = True, sharex = True, figsize=(12,10))

emotionList=["Moved","Fascin","Funny","Surprised","Indiff","Calm","Unset","Personal","Curious"]
fig.suptitle("Num Ratings per Emotion: Sub01", size=16)

# ratingData column 0 is the painting ID, so emotion k lives in column k+1.
for k, ax in enumerate(axes.flat):
    # 7 unit-wide bins over [1, 8]; align='left' centres each bar on its integer score.
    ax.hist(ratingData[:, k+1, 0], range=[1, 8], bins=7, align='left')
    ax.set(title=emotionList[k], xlabel="Score", ylabel="Num Times Used",
           xticks=np.arange(1, 8, 1))
    # Shared axes hide tick labels on interior panels; show them on every panel.
    ax.tick_params(labelbottom=True, labelleft=True)

plt.tight_layout(pad=3.0)
fig.subplots_adjust(top=0.88)

In [None]:
#histograms of scores for each emotion for sub02 (pre-normalizing)

# Draw directly on the Axes returned by plt.subplots. Calling
# plt.subplot(331+k, xlabel=..., ...) with extra keyword arguments does not reuse
# those Axes: it adds a second Axes at the same grid position, so the
# sharex/sharey grid requested here was replaced or drawn over.
fig, axes = plt.subplots(nrows=3, ncols=3, sharey = True, sharex = True, figsize=(12,10))

emotionList=["Moved","Fascin","Funny","Surprised","Indiff","Calm","Unset","Personal","Curious"]
fig.suptitle("Num Ratings per Emotion: Sub02", size=16)

# ratingData column 0 is the painting ID, so emotion k lives in column k+1.
# Subject index 1 on the last axis is sub02.
for k, ax in enumerate(axes.flat):
    # 7 unit-wide bins over [1, 8]; align='left' centres each bar on its integer score.
    ax.hist(ratingData[:, k+1, 1], range=[1, 8], bins=7, align='left')
    ax.set(title=emotionList[k], xlabel="Score", ylabel="Num Times Used",
           xticks=np.arange(1, 8, 1))
    # Shared axes hide tick labels on interior panels; show them on every panel.
    ax.tick_params(labelbottom=True, labelleft=True)

plt.tight_layout(pad=3.0)
fig.subplots_adjust(top=0.88)

Different emotions have really different distributions, even within a subject, so I definitely agree that we should standardize per subject & emotion rather than just subject.

In [None]:
#standardize data per person per emotion (find mean, find sd, subtract mean and divide by sd)
standardRating = np.copy(ratingData)

# Walk over every subject on the last axis. The previous range(0,20) stopped one
# short of the 21 subjects, so the last subject's raw scores were left in place and
# later averaged together with everyone else's standardized scores.
for i in range(ratingData.shape[2]): #per subject
    for j in range(1,10): #per emotion (column 0 is the painting ID and is left untouched)
        # standardize across all paintings for this subject/emotion
        standardRating[:,j,i] = preprocessing.scale(ratingData[:,j,i])

In [None]:
# Spot check: print painting 0 / subject 0 before and after standardizing.
# Column 0 (the ID) should be unchanged; the nine rating columns should differ.
for ratingArray in (ratingData, standardRating):
    print(ratingArray[0, :, 0])

I also tried computing this without the `preprocessing` module, just to double-check that it does the same thing, and indeed it does! The by-hand version raised a warning, though (`RuntimeWarning: invalid value encountered in double_scalars`), so I'm sticking with `preprocessing`. This was the other version:

In [None]:
#standardRatingByHand = np.copy(ratingData)

#for i in range(0,20):
    #for j in range(0,9):
        #mean = np.nanmean(standardRatingByHand[:,j,i])
        #sd = np.nanstd(standardRatingByHand[:,j,i])
        #for k in range(0,109):
            #standardRatingByHand[k,j,i] = (standardRatingByHand[k,j,i]-mean)/sd

# np.allclose(standardRating,standardRatingByHand, equal_nan=True) --> this outputs True

In [None]:
#histograms of scores for each emotion for sub01 - post-standardizing

# Draw directly on the Axes returned by plt.subplots. Calling
# plt.subplot(331+k, xlabel=..., ...) with extra keyword arguments does not reuse
# those Axes: it adds a second Axes at the same grid position, so the
# sharex/sharey grid requested here was replaced or drawn over.
fig, axes = plt.subplots(nrows=3, ncols=3, sharey = True, sharex = True, figsize=(12,10))

emotionList=["Moved","Fascin","Funny","Surprised","Indiff","Calm","Unset","Personal","Curious"]
fig.suptitle("Num Ratings per Emotion Standardized: Sub01", size=16)

# standardRating column 0 is the painting ID, so emotion k lives in column k+1.
for k, ax in enumerate(axes.flat):
    # 7 unit-wide bins over [-3, 4] (standard units). align='left' centres each bar
    # on its bin's lower edge, e.g. the bar drawn at 0 counts scores in [0, 1).
    ax.hist(standardRating[:, k+1, 0], bins=7, range=[-3.0, 4.0], align='left')
    ax.set(title=emotionList[k], xlabel="Score", ylabel="Num Times Used",
           xticks=np.arange(-3, 4, 1))
    # Shared axes hide tick labels on interior panels; show them on every panel.
    ax.tick_params(labelbottom=True, labelleft=True)

plt.tight_layout(pad=3.0)
fig.subplots_adjust(top=0.88)

# for i in range(0,9):
#     plt.subplot(dim+i)
#     plt.hist(standardRating[:,i+1,0], bins=7)
#     plt.title(emotionList[i])
    
# plt.tight_layout()
# fig.subplots_adjust(top=0.88)



### Looks like this didn't change the distribution at all? Might be a visualization issue on my end, though. I was having trouble with these graphs >:(

Robert: I don't think we would expect this to change the distribution, if we are still using the same number of bins (7) with equal spacing. The scores have changed though, are now seen in standard units. (0 is mean, +/- vals are multiples of std. dev.)

### : )

In [None]:
#This is what I did before - can ignore
#standardRating = ratingData

#for i in range(0,20):
    #standardRating[:,:,i] = preprocessing.scale(ratingData[:,:,i], axis=1)
    
#print(standardRating[:,:,20])

Recreating the matrix of averages

In [None]:
# means of each painting's emotion
aveEmotionsCopy = standardRating.copy()
# Take an independent copy of subject index 1's slice purely to carry over the ID
# column (column 0). Previously this was a view into aveEmotionsCopy, so filling it
# in also overwrote that subject's ratings inside the copy.
aveEmotions = aveEmotionsCopy[:, :, 1].copy()

# Average the nine emotion columns over the subject axis in one reduction,
# ignoring missing ratings (NaN).
aveEmotions[:, 1:10] = np.nanmean(standardRating[:, 1:10, :], axis=2)

In [None]:
# Turn aveEmotions into a DataFrame: the ID column followed by one named column
# per emotion, in the same order as the rating columns.
aECols = ['ID'] + ['Moved','Fascinated','Funny','Surprised','Indifferent',
                   'Calm','Unsettling','Personal','Curious']
aveEmotions = pd.DataFrame(aveEmotions, columns=aECols)
aveEmotions.head(10)

### Variability

In [None]:
# Overall variance per painting: pool all nine emotion ratings (columns 1-9) from
# every subject for that painting and take the NaN-ignoring variance.
# Exploratory -- overall pretty uniform, with a few exceptions.
pooledVariances = [np.nanvar(standardRating[painting, 1:10, :]) for painting in range(0, 109)]

varsPerPainting = pd.DataFrame({'Universal Var': pooledVariances})
varsPerPainting.head(10)

In [None]:
#create matrix of variance scores for each emotional dimension per painting
varsPerEmotionCopy = standardRating.copy()
# Independent copy of subject index 1's slice, used only to carry over the ID column
# (previously a view, so filling it in also overwrote data inside the copy).
varsPerEmotion = varsPerEmotionCopy[:, :, 1].copy()

# NaN-ignoring variance across subjects for every painting/emotion pair.
varsPerEmotion[:, 1:10] = np.nanvar(standardRating[:, 1:10, :], axis=2)

# Column names must follow the rating column order used everywhere else in this
# notebook (..., Indifferent, Calm, Unsettling, ...). 'Unset Var' and 'Calm Var' were
# previously listed the other way round, which attached each variance to the wrong
# emotion.
varCols = ['ID','Moved Var','Fascin Var','Funny Var','Surp Var','Indiff Var','Calm Var','Unset Var','Personal Var','Curious Var']
varsPerEmotion = pd.DataFrame(data=varsPerEmotion, columns=varCols)
#adding the total variance because I can
varsPerEmotion = varsPerEmotion.join(varsPerPainting)
varsPerEmotion.head(10)

In [None]:
#create aveEmotions copy without ID column to be used for clustering

# Select the nine emotion columns by name instead of dropping 'ID'. Dropping only
# 'ID' pulls the 'ClusterBefore' label into the feature matrix if this cell is
# re-run after the clustering cell has added that column to aveEmotions.
emotionCols = ['Moved','Fascinated','Funny','Surprised','Indifferent',
               'Calm','Unsettling','Personal','Curious']
X = aveEmotions[emotionCols].to_numpy()

#X = np.delete(aveEmotions, 0, axis=1) # <-- can't remember what this does but it was there last time so.....
#print(X[:10]) # first ten rows

In [None]:
# Display the feature matrix that is passed to t-SNE and to the clustering step.
X

### Dimensional Reduction (t-SNE)

So we can plot our 9-dim data in 2-d.

Sydney: I set perplexity to 20, which is kind of an arbitrary number, since I think that's around the average number of paintings in each gallery....

Robert: I played around and settled on 30. but you're right the choice is arbitrary.

See Notes on t-SNE: [https://distill.pub/2016/misread-tsne/](https://distill.pub/2016/misread-tsne/)

In [None]:
# Perplexity is a subjective choice: tune it until the layout/separation looks good.
# Earlier setting tried: perplexity = 20.
# random_state is pinned so repeated runs use the same seed.
tsne = TSNE(
    n_components=2,
    perplexity=30,
    random_state=0,
)

In [None]:
# Embed the 9-dim per-painting averages (X) in 2-D with t-SNE and scatter-plot
# the two embedding coordinates.
X_tsned = tsne.fit_transform(X)
embeddedX, embeddedY = X_tsned[:, 0], X_tsned[:, 1]

plt.figure(figsize=(12, 10))
tsned = plt.scatter(embeddedX, embeddedY, c='r')
plt.title('t-SNEd dimensional reduction of painting scores')
plt.show()

__NOTE__: UMAP is another dimensional reduction technique we could explore. 

- https://umap-learn.readthedocs.io/en/latest/
- https://arxiv.org/abs/1802.03426

Would have to look into whether it is appropriate for our use case.

## Clustering time!

### shared clustering code and helper functions

In [None]:
# Shared estimator (default settings) reused by the clustering cells.
af = AffinityPropagation()

# x positions 1..9, one per emotion, used when plotting cluster profiles.
ratings = np.arange(1, 10)

# function to get a table of the amount of paintings in each cluster
#whichCluster is a string
def amountInCluster(whichCluster,numClusters):
    """Count how many paintings carry each cluster label.

    Reads the notebook-level ``aveEmotions`` DataFrame.

    Parameters
    ----------
    whichCluster : str
        Name of the ``aveEmotions`` column that holds the cluster labels.
    numClusters : int
        Number of clusters; labels 0 .. numClusters-1 are tallied.

    Returns
    -------
    pandas.DataFrame
        One row per label, with columns 'Cluster' (the label) and 'numItems'
        (the number of ``aveEmotions`` rows carrying that label).
    """
    labelColumn = aveEmotions[whichCluster]
    clusterIds = list(range(numClusters))
    # .count() tallies the entries that survive the equality filter
    memberCounts = [labelColumn[labelColumn == clusterId].count() for clusterId in clusterIds]

    return pd.DataFrame.from_dict({'Cluster': clusterIds, 'numItems': memberCounts})

# function to get col means for each cluster

#whichCluster is a str
def addColAverage(whichCluster,clusterNum):
    """Return the mean emotion profile of one cluster.

    Reads the notebook-level ``aveEmotions`` DataFrame.

    Parameters
    ----------
    whichCluster : str
        Name of the ``aveEmotions`` column that holds the cluster labels.
    clusterNum : int
        Label of the cluster to summarize.

    Returns
    -------
    numpy.ndarray
        Column means over the paintings in that cluster, in column order, with the
        'ID' column and every cluster-label column removed.
    """
    cluster = aveEmotions[(aveEmotions[whichCluster] == clusterNum)] #grabs paintings in a cluster
    means = cluster.mean() #col means

    # Drop everything that is not an emotion score. The drop used to hard-code
    # 'ClusterBefore', so any other label column either raised a KeyError or left a
    # label mean inside the returned profile. Label columns in this notebook are all
    # named 'Cluster...' (ClusterBefore / ClusterAfter / ClusterKMeans).
    nonEmotion = [col for col in means.index
                  if col == "ID" or col == whichCluster or str(col).startswith("Cluster")]
    means = means.drop(labels=nonEmotion)

    return means.to_numpy() #converts series to numpy array

### Clustering After t-SNE (this is what I originally had, and where the graphs I sent came from)

Robert: I'm going to comment this out, because we shouldn't use it!

In [None]:
# #Cluster! That! Data!
# clustering = af.fit(X_tsned) 

# cluster_centers_indices = af.cluster_centers_indices_
# labels = af.labels_

# n_clusters_ = len(cluster_centers_indices)

In [None]:
# # 8 clusters this time!

# print(cluster_centers_indices)
# print(labels)
# print(n_clusters_)

In [None]:
# #adding labels to a new aveEmotions df
# aveEmotions['ClusterAfter'] = list(labels)
# aveEmotions['ClusterBefore'] = np.nan 
# aveEmotions['ClusterKMeans'] = np.nan #this is empty for now so certain functions Work
# aveEmotions.head(5)

In [None]:
# # from sklearn's demo of affinity propogation

# plt.close('all')
# plt.figure(figsize=(8, 6))
# plt.clf()

# colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk')
# for k, col in zip(range(n_clusters_), colors):
#     class_members = labels == k
#     cluster_center = X_tsned[cluster_centers_indices[k]]
#     plt.plot(X_tsned[class_members, 0], X_tsned[class_members, 1], col + '.')
#     plt.plot(cluster_center[0], cluster_center[1], 'o', markerfacecolor=col, 
#              markeredgecolor='k', markersize=14)
#     for x in X_tsned[class_members]:
#         plt.plot([cluster_center[0], x[0]], [cluster_center[1], x[1]], col)

# plt.title('Estimated number of clusters: %d' % n_clusters_)
# plt.show()

In [None]:
# # function to get a table of the amount of paintings in each cluster

# #whichCluster is a string
# def amountInCluster(whichCluster,numClusters):
#     clusterList = list(range(0,numClusters,1))
#     clusterAmount = list(range(0,numClusters,1))

#     for i in range(0,numClusters):
#         clusterAmount[i] = aveEmotions[whichCluster][(aveEmotions[whichCluster] == i)].count()
    
#     clusterDict = {'Cluster': clusterList, 'numItems': clusterAmount}
#     clusterAmounts = pd.DataFrame.from_dict(clusterDict)
    
#     return clusterAmounts

In [None]:
# clusterCountAfter = amountInCluster('ClusterAfter',n_clusters_)
# clusterCountAfter

In [None]:
# # function to get col means for each cluster

# #whichCluster is a str
# def addColAverage(whichCluster,clusterNum):  
#     cluster = aveEmotions[(aveEmotions[whichCluster] == clusterNum)] #grabs paintings in a cluster
#     means = cluster.mean() #col means
#     means = means.drop(labels = ["ID","ClusterAfter","ClusterBefore","ClusterKMeans"]) #drop Cluster labels
#     means = means.to_numpy() #converts series to numpy array
    
#     return means

In [None]:
# #this will be the x vals for plotting
# ratings = np.array([1,2,3,4,5,6,7,8,9])

# #this checks to make sure addColAverage works
# #print(type(addColAverage("ClusterAfter",0)))
# print(addColAverage("ClusterAfter",0))

In [None]:
# # plot the emotion averages of each cluster

# fig, axes= plt.subplots(nrows=2, ncols=4, sharey = True, figsize=(16,6))

# plt.setp(axes, xticks=[1,2,3,4,5,6,7,8,9]) #, xticklabels=["Moved","Fascin","Funny","Surprised",
#                                                         #"Indiff","Calm","Unset","Personal","Curious"]) #adds tick marks for each emotion category
# k = 0
# for i in range(0,2):
#     for j in range(0,4):
#         axes[i,j].plot(ratings, addColAverage("ClusterAfter",k))
#         axes[i,j].set_title('Cluster {number}'.format(number = k))
#         k = k+1

# plt.tight_layout()

# plt.show()

### Clustering Before t-SNE

Robert: This is what we want!

In [None]:
# Cluster on the full 9-dim averages (X), not on the 2-D t-SNE embedding.
clustering2 = af.fit(X)

# Read the results off the estimator: one label per painting, plus the row indices
# of the cluster centers.
labels2 = af.labels_
cluster_centers_indices2 = af.cluster_centers_indices_
n_clusters_2 = len(cluster_centers_indices2)

# Store the label next to each painting's averages for the summaries below.
aveEmotions['ClusterBefore'] = labels2

In [None]:
# Peek at the first rows to confirm the 'ClusterBefore' column was attached.
aveEmotions.head(5)

In [None]:
#11 clusters now

# Print, in order: cluster-center row indices, per-painting labels, cluster count.
for fitted in (cluster_centers_indices2, labels2, n_clusters_2):
    print(fitted)

I'm not sure I'm doing what you want me to do here... I'm trying to use the transformed data (which should be the same as it was above) to plot the clusters you get when considering all 9 dimensions. The clusters look a little weird, though.

In [None]:
# using X_tsned (transformed data) instead of X
# Clusters were fit on the 9-dim data; the t-SNE coordinates are used only to draw them.

plt.close('all')
# Keep a handle on this figure. Without it, `fig` still refers to a figure created
# in an earlier cell, which is why fig.savefig(...) here did not save this graph.
fig = plt.figure(figsize=(12, 10))

colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk')
for k, col in zip(range(n_clusters_2), colors):
    class_members = labels2 == k
    # cluster center = the painting whose row index was reported for cluster k
    cluster_center = X_tsned[cluster_centers_indices2[k]]
    plt.plot(X_tsned[class_members, 0], X_tsned[class_members, 1], col + '.')
    plt.plot(cluster_center[0], cluster_center[1], 'o', markerfacecolor=col,
             markeredgecolor='k', markersize=14)
    # spoke from the center to each member of the cluster
    for x in X_tsned[class_members]:
        plt.plot([cluster_center[0], x[0]], [cluster_center[1], x[1]], col )

plt.title('Estimated number of clusters: %d' % n_clusters_2)
#fig.savefig('cluster-graph.png')  # uncomment to save; `fig` now is this figure
plt.show()

In [None]:
# Tabulate how many paintings landed in each cluster, then show cluster 0's count.
clusterCountBefore = amountInCluster('ClusterBefore', n_clusters_2)
clusterCountBefore.loc[0, 'numItems']

In [None]:
# Quick look at cluster 0's mean emotion profile (x = emotion position 1-9).
cluster0Profile = np.array(addColAverage("ClusterBefore", 0))
plt.plot(ratings, cluster0Profile)
# print(type(addColAverage("ClusterBefore",0)))

In [None]:
# plot the emotion averages of each cluster

# Size the grid from the number of clusters found (two panels per row). The fixed
# 6x2 grid used before silently skipped any cluster beyond the 12th.
# For 11 or 12 clusters this gives the same 6x2, 12x18-inch layout as before.
nCols = 2
nRows = max(1, int(np.ceil(n_clusters_2 / nCols)))
fig, axes = plt.subplots(nrows=nRows, ncols=nCols, sharey=True, squeeze=False,
                         figsize=(12, 3 * nRows))

plt.setp(axes, xticks=[1,2,3,4,5,6,7,8,9], \
         xticklabels=["Moved","Fascin","Funny","Surprised", "Indiff","Calm","Unset","Personal","Curious"], \
        ) #adds tick marks for each emotion category
fig.suptitle("Average Emotion Rating per Cluster (Std Units)", size=16)

# axes.flat walks the grid row by row, so panel k shows cluster k.
for k, ax in enumerate(axes.flat):
    if k < n_clusters_2:
        ax.plot(ratings, addColAverage("ClusterBefore", k))
        ax.set_title('Cluster {number} (n={amount})'.format(number = k, amount=clusterCountBefore.numItems[k]))
    else:
        # leftover panel when the cluster count does not fill the last row
        ax.set_visible(False)

plt.tight_layout(pad=3.0)
fig.subplots_adjust(top=0.95)

plt.show()

fig.savefig('cluster-averages-linegraph.png')

I'm putting all the stuff I had here about combining clusters in another document - I don't think we'll need it anymore haha.

### Clustering Before t-SNE (KMeans)

Robert: not used here

In [None]:
# #fitting X (9-dim) instead of transformed X_tsned
# n_clusters_km = 13
# km = KMeans(n_clusters=n_clusters_km)

# clusteringkm = km.fit(X) 
                             
# labelskm = km.labels_

# aveEmotions['ClusterKMeans'] = labelskm

In [None]:
# # using X_tsned (transformed data) instead of X

# plt.close('all')
# plt.figure(figsize=(10, 8))
# plt.clf()

# colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk')
# for k, col in zip(range(n_clusters_km), colors):
#     class_members = labelskm == k
#     plt.plot(X_tsned[class_members, 0], X_tsned[class_members, 1], col + '.')

# plt.title('Estimated number of clusters: %d' % n_clusters_km)
# plt.show()

# Combining everything together!

In [None]:
## importing all rating data to recover painting names/galleries

# Read the first four columns of sheet position 1 and rename 'Unique ID' to 'ID'
# so it can serve as the merge key against the averages/variance tables.
names = (
    pd.read_excel("ratingData-sorted.xlsx", sheet_name=1, usecols=range(0,4))
    .rename(columns={"Unique ID": "ID"})
)
names

In [None]:
#everything at once
# Join the painting metadata with the per-painting averages (incl. cluster label)
# and the per-emotion variances on the shared 'ID' column, then export to Excel.
labeledRatings = (
    names
    .merge(aveEmotions, on=['ID'])
    .merge(varsPerEmotion, on=['ID'])
)
# labeledRatings = labeledRatings.drop(['ClusterAfter', 'ClusterKMeans'], axis=1)

labeledRatings.to_excel('labeledRatings.xlsx')

In [None]:
# Show every painting assigned to cluster 8.
inCluster8 = labeledRatings['ClusterBefore'] == 8
labeledRatings.loc[inCluster8]

In [None]:
#given a cluster, creates a list of the number of paintings in each gallery
galleries = [1,2,3,100,16,19,200]
# same gallery numbers as strings, used as tick labels
galleryList = [str(gallery) for gallery in galleries]

def numInGallery(clusterNum):
    """Count the paintings of one cluster in each gallery.

    Reads the notebook-level ``labeledRatings`` DataFrame (columns
    'ClusterBefore' and 'Gallery') and the ``galleries`` list.

    Parameters
    ----------
    clusterNum : int
        Cluster label to look up in the 'ClusterBefore' column.

    Returns
    -------
    list of float
        One count per entry of ``galleries``, in the same order.
    """
    inCluster = labeledRatings['ClusterBefore'] == clusterNum

    counts = np.zeros(len(galleries))
    for slot, gallery in enumerate(galleries):
        inGallery = labeledRatings['Gallery'] == gallery
        # number of rows that are both in this cluster and in this gallery
        counts[slot] = (inCluster & inGallery).sum()

    return list(counts)

In [None]:
# for each cluster, plot num paintings in each gallery

# 3x4 grid: room for at most 12 clusters; bars follow the order of galleryList.
fig, axes = plt.subplots(nrows=3, ncols=4, sharey=True, figsize=(16,12))
fig.suptitle("Num of Paintings in each Gal Per Cluster", size=16)
numGals = [1,2,3,4,5,6,7]  # bar positions, one per gallery

# axes.flat walks the grid row by row, so panel k shows cluster k.
for k, ax in enumerate(axes.flat):
    if k >= n_clusters_2:
        break
    row, col = divmod(k, 4)
    ax.bar(numGals, numInGallery(k))
    ax.set_title('Cluster {number}'.format(number = k))
    ax.set_xticks([1,2,3,4,5,6,7])
    ax.set_xticklabels(galleryList)
    if col == 0:
        # first column carries the y label; its bottom panel also gets the x label
        ax.set_ylabel("Num of Paintings in Gallery")
        if row == 2:
            ax.set_xlabel("Gallery Number")

plt.tight_layout(pad=3.0)
fig.subplots_adjust(top=0.88)
fig.savefig('num-paintings-in-gal.png')

plt.show()

In [None]:
# Highest Scores

#most moving
#print(labeledRatings.iloc[(labeledRatings['Moved'].idxmax())])

#most fascinating
#print(labeledRatings.iloc[(labeledRatings['Fascinated'].idxmax())])

#most funny
#print(labeledRatings.iloc[(labeledRatings['Funny'].idxmax())])

#most surprised
#print(labeledRatings.iloc[(labeledRatings['Surprised'].idxmax())])

#most indifferent
#print(labeledRatings.iloc[(labeledRatings['Indifferent'].idxmax())])

#most calm
#print(labeledRatings.iloc[(labeledRatings['Calm'].idxmax())])

#most unsettling
#print(labeledRatings.iloc[(labeledRatings['Unsettling'].idxmax())])

#most personal
#print(labeledRatings.iloc[(labeledRatings['Personal'].idxmax())])

#most curious
#print(labeledRatings.iloc[(labeledRatings['Curious'].idxmax())])

In [None]:
# Build three ranking tables (fascination, indifference, unsettling). Each holds the
# painting metadata, the average score, its variance, and Diff = score - variance.
metaCols = ['Title','Artist','Gallery','ID']

#Fascination -- ranked by Diff
fascination = (
    labeledRatings.loc[:, metaCols + ['Fascinated', 'Fascin Var']]
    .assign(Diff=lambda df: df['Fascinated'] - df['Fascin Var'])
    .sort_values(by=['Diff'], ascending=False)
)

#Indifference -- ranked by rating
indiff = (
    labeledRatings.loc[:, metaCols + ['Indifferent', 'Indiff Var']]
    .assign(Diff=lambda df: df['Indifferent'] - df['Indiff Var'])
    .sort_values(by=['Indifferent'], ascending=False)
)

#Unsettling -- ranked by rating
unsettled = (
    labeledRatings.loc[:, metaCols + ['Unsettling', 'Unset Var']]
    .assign(Diff=lambda df: df['Unsettling'] - df['Unset Var'])
    .sort_values(by=['Unsettling'], ascending=False)
)

In [None]:
# Shortlists of 20 paintings each.
# Unsettling & Indifferent: keep rows with variance below 2, then take the first 20
# (the tables are already sorted by rating). Fascination: first 20 by Diff, no
# variance filter.

#labeledRatings[(labeledRatings['ClusterBefore'] == 4)]
unsettledSmall = unsettled.loc[unsettled['Unset Var'] < 2].head(n=20)

indiffSmall = indiff.loc[indiff['Indiff Var'] < 2].head(n=20)

fascinationSmall = fascination.head(n=20)

In [None]:
# Display the 'Unsettling' shortlist (at most 20 rows, all with 'Unset Var' < 2).
unsettledSmall

In [None]:
# Stack the three shortlists into one table and export it. All three share the
# ['Title','Artist','Gallery','ID'] columns; sort=False keeps the column order
# as encountered instead of sorting the column names.
shortlists = [fascinationSmall, unsettledSmall, indiffSmall]
paintingList = pd.concat(shortlists, sort=False)
paintingList.to_excel('paintingList.xlsx')