In [1]:
import numpy as np
import pandas as pd
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import strip_tags
import umap
import hdbscan
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
from joblib import dump, load
from sklearn.cluster import dbscan
from sklearn.mixture import GaussianMixture
import os
import pandas as pd
import random
from tqdm import tqdm
import re
import csv
import string
import requests
import seaborn as sns
from sklearn.manifold import TSNE
from sklearn.cluster import AffinityPropagation
from sklearn.metrics.pairwise import cosine_distances

In [49]:
def umap_clustering(vectors, n_neighbors=75, n_components=5, metric='cosine',
                    random_state=None):
    """Fit a UMAP model on ``vectors`` and return the fitted model.

    Parameters
    ----------
    vectors : array-like
        The vectors to embed, one row per item. In this notebook the caller
        passes ``model.docvecs.vectors_docs``.
    n_neighbors : int, default 75
        Forwarded to ``umap.UMAP``.
    n_components : int, default 5
        Forwarded to ``umap.UMAP`` (dimensionality of the embedding).
    metric : str, default 'cosine'
        Forwarded to ``umap.UMAP``.
    random_state : optional, default None
        Forwarded to ``umap.UMAP``. Pass a fixed integer to make the embedding
        repeatable between runs; ``None`` keeps the previous unseeded behaviour.

    Returns
    -------
    The object returned by ``umap.UMAP(...).fit(vectors)``; callers read its
    ``embedding_`` attribute.
    """
    reducer = umap.UMAP(n_neighbors=n_neighbors,
                        n_components=n_components,
                        metric=metric,
                        random_state=random_state)
    return reducer.fit(vectors)

def num_clusters(vectors,mcs):
    """Estimate a cluster count by running HDBSCAN on cosine distances.

    Parameters
    ----------
    vectors : array-like
        Points to cluster, one row per item (the caller passes the UMAP
        embedding).
    mcs : int
        Passed to HDBSCAN as ``min_cluster_size``.

    Returns
    -------
    int
        The number of distinct non-noise labels found, but never less than 3.
    """
    dm = cosine_distances(vectors)
    # The precomputed matrix is cast to float64 before fitting, as the original did.
    clusterer = hdbscan.HDBSCAN(min_cluster_size=mcs,
                                metric='precomputed',
                                cluster_selection_method='eom').fit(dm.astype(np.float64))

    # HDBSCAN marks noise with the label -1. Only discount it when it is
    # actually present: blindly subtracting one under-counts by a cluster
    # whenever every point was assigned to a cluster.
    found = len(set(clusterer.labels_) - {-1})

    # Downstream code always wants at least three mixture components.
    return max(found, 3)

def gmm_clustering(doc2vec_model, umap_model,n_clusters, random_state=None):
    """Fit a Gaussian mixture on the UMAP embedding and return soft assignments.

    Parameters
    ----------
    doc2vec_model
        Not used by this function; kept so existing calls keep working.
    umap_model
        Object exposing the low-dimensional points as ``embedding_``.
    n_clusters : int
        Number of mixture components.
    random_state : optional, default None
        Forwarded to ``GaussianMixture``. Pass a fixed integer to get the same
        mixture on every run; ``None`` keeps the previous unseeded behaviour.

    Returns
    -------
    The result of ``predict_proba`` on the same embedding the mixture was
    fitted on: one row per embedded point, one column per component.
    """
    embedding = umap_model.embedding_
    gmm = GaussianMixture(n_components=n_clusters, random_state=random_state)
    gmm.fit(embedding)
    return gmm.predict_proba(embedding)

def get_topic_vectors(doc2vec_model, gmm_model, num_clusters):
    """Compute one topic vector per cluster as the mean of its document vectors.

    Parameters
    ----------
    doc2vec_model
        Object whose ``docvecs.vectors_docs`` holds one vector per document.
    gmm_model : array-like
        Despite the name, this is the per-document membership matrix (rows are
        documents, columns are clusters), not a fitted mixture object. It needs
        at least one row and at least ``num_clusters`` columns.
    num_clusters : int
        Number of clusters to build vectors for.

    Returns
    -------
    numpy.ndarray
        ``num_clusters`` stacked rows; row ``k`` is the mean vector of the
        documents whose highest-probability cluster is ``k``.
    """
    doc_vectors = doc2vec_model.docvecs.vectors_docs
    responsibilities = np.asarray(gmm_model)

    # Hard-assign every document once instead of re-running argmax for each
    # (document, cluster) pair.
    hard_labels = responsibilities.argmax(axis=1)

    topic_rows = []
    for cluster_id in range(num_clusters):
        members = np.flatnonzero(hard_labels == cluster_id)
        if members.size == 0:
            # No document has this cluster as its best match. Averaging an
            # empty selection would produce a NaN vector, so fall back to the
            # single document with the highest membership in this cluster.
            members = [int(responsibilities[:, cluster_id].argmax())]
        topic_rows.append(doc_vectors[members].mean(axis=0))

    return np.vstack(topic_rows)

def get_topic_words(doc2vec_model,topic_vectors_, topn=50):
    """Look up the words most similar to each topic vector.

    Parameters
    ----------
    doc2vec_model
        Object whose ``wv.most_similar(positive=[vector], topn=...)`` returns
        ``(word, score)`` pairs.
    topic_vectors_ : iterable
        One vector per topic.
    topn : int, default 50
        How many words to request per topic (previously fixed at 50).

    Returns
    -------
    (topic_words_, topic_word_scores) : tuple of numpy.ndarray
        For each topic, the words and their similarity scores rounded to four
        decimals, in the order ``most_similar`` returned them.
    """
    topic_words_ = []
    topic_word_scores = []

    for tv in topic_vectors_:
        sim_words = doc2vec_model.wv.most_similar(positive=[tv], topn=topn)
        topic_words_.append([word for word, _ in sim_words])
        topic_word_scores.append([round(score, 4) for _, score in sim_words])

    return np.array(topic_words_), np.array(topic_word_scores)

In [53]:
def Cluster(photo, models_dir='/media/ruben/FEF44259F44213F5/Users/Ruben/Documents/GitHub/iconic-images/analysis/topic-modelling/top2vec/models'):
    """Run the topic pipeline for one photo and return its topic table.

    Steps: load the photo's saved Doc2Vec model, embed its document vectors
    with UMAP, estimate the cluster count with HDBSCAN, fit a Gaussian mixture
    with that many components, then derive a topic vector and its nearest
    words for every component.

    Parameters
    ----------
    photo : str
        Photo identifier; the model file is ``doc2vec-{photo}-e75.model``.
    models_dir : str
        Directory holding the saved models. Defaults to the location that was
        previously hard-coded, so existing calls are unaffected.

    Returns
    -------
    pandas.DataFrame
        Columns ``cluster`` (component index), ``words`` (space-joined topic
        words) and ``prominence`` (sum of that component's membership
        probabilities over all documents), sorted by ``prominence``,
        largest first.
    """
    print(photo)
    model = Doc2Vec.load(os.path.join(models_dir, f'doc2vec-{photo}-e75.model'))
    model.init_sims(replace=False)

    doc_vectors = model.docvecs.vectors_docs
    umap_model = umap_clustering(doc_vectors)

    # Minimum cluster size is 1% of the documents, but never below 15.
    mcs = max(15, round(len(doc_vectors) / 100))
    n_cl = num_clusters(umap_model.embedding_,mcs)
    print(photo,f"minimal no. documents: {mcs}, number clusters: {n_cl}")

    gmm_model = gmm_clustering(model,umap_model,n_cl)
    topic_vectors_ = get_topic_vectors(model,gmm_model,n_cl)
    topic_words, _ = get_topic_words(model,topic_vectors_)

    # One column-wise sum replaces a Python-level pass over every document
    # for every cluster.
    prominence = np.asarray(gmm_model).sum(axis=0)

    # At most the first 100 words of each topic are kept in the table.
    rows = [[c, " ".join(t[:100]), prominence[c]] for c, t in enumerate(topic_words)]
    data = pd.DataFrame(rows, columns=['cluster','words','prominence'])
    return data.sort_values('prominence',ascending=False)

In [54]:
# Build the list of photo identifiers from the models directory: every entry
# whose name contains "data" contributes the second '-'-separated piece of
# its filename.
list_photos = [
    entry_name.split('-')[1]
    for entry_name in os.listdir('/media/ruben/FEF44259F44213F5/Users/Ruben/Documents/GitHub/iconic-images/analysis/topic-modelling/top2vec/models')
    if "data" in entry_name
]

In [55]:
# Run the full clustering pipeline for every photo and save each resulting
# topic table as results-<photo>.csv in the models directory. `Cluster` prints
# the photo name, the minimum cluster size and the cluster count as it goes.
for photo in list_photos:
    data = Cluster(photo)
    # index=False: write only the table's columns, not the DataFrame row index.
    data.to_csv(f'/media/ruben/FEF44259F44213F5/Users/Ruben/Documents/GitHub/iconic-images/analysis/topic-modelling/top2vec/models/results-{photo}.csv',index=False)

AbuGhraib
AbuGhraib minimal no. documents: 15, number clusters: 9
AlanKurdi
AlanKurdi minimal no. documents: 46, number clusters: 20
Anasuma
Anasuma minimal no. documents: 15, number clusters: 12
Berlin
Berlin minimal no. documents: 124, number clusters: 17
Camp
Camp minimal no. documents: 40, number clusters: 13
ChildVulture
ChildVulture minimal no. documents: 33, number clusters: 15
FallingMan
FallingMan minimal no. documents: 38, number clusters: 14
Ghandi
Ghandi minimal no. documents: 36, number clusters: 17
IwoJima
IwoJima minimal no. documents: 159, number clusters: 13
KentState
KentState minimal no. documents: 34, number clusters: 13
ManMoon
ManMoon minimal no. documents: 138, number clusters: 4
Mao
Mao minimal no. documents: 15, number clusters: 5
Monk
Monk minimal no. documents: 50, number clusters: 14
NapalmGirl
NapalmGirl minimal no. documents: 98, number clusters: 15
Plane911
Plane911 minimal no. documents: 39, number clusters: 3
Hindenburg
Hindenburg minimal no. documents: