In [1]:
import os, glob, random, re, string, langid
import pandas as pd
from func_cluster import *
import json
import numpy as np
import pandas as pd
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import strip_tags
import umap
import hdbscan
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
from joblib import dump, load
from sklearn.cluster import dbscan
from sklearn.mixture import GaussianMixture
from tqdm import tqdm
import seaborn as sns
from sklearn.manifold import TSNE
from sklearn.cluster import AffinityPropagation
from sklearn.metrics.pairwise import cosine_distances

In [13]:
## Functions
def umap_clustering(vectors, n_neighbors=75, n_components=5, metric='cosine', random_state=None):
    """Fit a UMAP model on ``vectors`` and return the fitted model.

    The defaults reproduce the settings this notebook has always used
    (75 neighbours, 5 output dimensions, cosine metric, unseeded), so existing
    ``umap_clustering(vectors)`` calls behave exactly as before.

    Parameters
    ----------
    vectors : array-like
        Input matrix handed to ``UMAP.fit`` (the notebook passes Doc2Vec
        document vectors).
    n_neighbors, n_components, metric :
        Forwarded unchanged to ``umap.UMAP``.
    random_state : int or None
        Forwarded to ``umap.UMAP``. Pass an integer to make the embedding
        reproducible between runs; ``None`` keeps the previous unseeded behaviour.

    Returns
    -------
    The object returned by ``umap.UMAP(...).fit(vectors)``.
    """
    reducer = umap.UMAP(n_neighbors=n_neighbors,
                        n_components=n_components,
                        metric=metric,
                        random_state=random_state)
    return reducer.fit(vectors)

def gmm_clustering(umap_emb,n_clusters):
    """Fit a Gaussian mixture on ``umap_emb`` and return soft cluster memberships.

    Note that the return value is the output of ``predict_proba`` (one row per
    input row, one column per mixture component), not the fitted mixture itself.
    The mixture is seeded with ``random_state=17``.
    """
    mixture = GaussianMixture(random_state=17, n_components=n_clusters)
    memberships = mixture.fit(umap_emb).predict_proba(umap_emb)
    return memberships

def get_topic_vectors(doc2vec_model, gmm_model, num_clusters):
    """Return one centroid ("topic vector") per cluster, stacked row-wise.

    Parameters
    ----------
    doc2vec_model :
        Object exposing the document vectors as ``docvecs.vectors_docs``.
    gmm_model : array-like
        Despite the name this is the per-document membership matrix (as
        returned by ``gmm_clustering``): row ``i`` holds document ``i``'s
        probabilities, and the document is hard-assigned to its argmax column.
    num_clusters : int
        Number of centroids to compute (clusters ``0 .. num_clusters - 1``).

    Returns
    -------
    numpy.ndarray with ``num_clusters`` rows; row ``k`` is the mean document
    vector of the documents hard-assigned to cluster ``k``.

    A cluster that wins no document would otherwise yield an all-NaN centroid
    (mean of an empty selection). In that case the single document with the
    highest membership probability for that cluster is used instead.
    """
    doc_vectors = doc2vec_model.docvecs.vectors_docs
    memberships = np.asarray(gmm_model)
    # Hard labels are computed once instead of re-running argmax over every
    # document for every cluster.
    hard_labels = memberships.argmax(axis=1)

    centroids = []
    for cluster in range(num_clusters):
        # Integer positions (not a boolean mask) so the membership matrix may
        # cover fewer rows than doc_vectors, as in the original implementation.
        member_idx = np.flatnonzero(hard_labels == cluster)
        has_column = memberships.shape[0] > 0 and cluster < memberships.shape[1]
        if member_idx.size == 0 and has_column:
            # Empty cluster: fall back to its most probable document.
            member_idx = np.array([memberships[:, cluster].argmax()])
        centroids.append(doc_vectors[member_idx].mean(axis=0))
    return np.vstack(centroids)

def get_topic_words(doc2vec_model, topic_vectors_, topn=50):
    """Look up the words closest to each topic vector.

    Parameters
    ----------
    doc2vec_model :
        Object whose ``wv.most_similar(positive=[vector], topn=...)`` returns
        ``(word, score)`` pairs.
    topic_vectors_ : iterable of vectors
        One query vector per topic.
    topn : int, default 50
        How many nearest words to request per topic. The default matches the
        value that was previously hard-coded.

    Returns
    -------
    (topic_words, topic_word_scores) : tuple of numpy.ndarray
        Both have one row per topic; scores are rounded to 4 decimals.
    """
    words_per_topic = []
    scores_per_topic = []

    for topic_vector in topic_vectors_:
        neighbours = doc2vec_model.wv.most_similar(positive=[topic_vector], topn=topn)
        words_per_topic.append([word for word, _ in neighbours])
        scores_per_topic.append([round(score, 4) for _, score in neighbours])

    return np.array(words_per_topic), np.array(scores_per_topic)

In [51]:
# Photo identifiers: the second '-'-separated token of every Doc2Vec model file
# name found in the model directory (files are loaded later as
# 'doc2vec-<photo>-e75.model').
photos = [
    os.path.basename(model_path).split('-')[1]
    for model_path in glob.glob('C:/Users/ruben.ros/Documents/REACT/data/embeddings/doc2vec/*.model')
]

In [12]:
# Disabled one-off step, kept for reference: for every photo it loads the
# Doc2Vec model, fits UMAP on the document vectors and saves the resulting
# embedding matrix as '<photo>-umap-embeddings.csv'.
# NOTE(review): the locations used here (models read from
# D:/react-data/iconic/models/, embeddings written to .../data/doc2vec/models/)
# differ from the ones the per-photo processing loop reads
# (.../data/embeddings/doc2vec/ and .../data/embeddings/umap/) - confirm the
# current layout before re-enabling this cell.
# for photo in photos:
#     model = Doc2Vec.load(f'D:/react-data/iconic/models/doc2vec-{photo}-e75.model')
#     model.init_sims(replace=False)
#     umap_model = umap_clustering(model.docvecs.vectors_docs)
#     np.savetxt(f'C:/Users/ruben.ros/Documents/REACT/data/doc2vec/models/{photo}-umap-embeddings.csv', umap_model.embedding_, delimiter=',')

In [52]:
# Root of the project data tree; every input and output path below hangs off it.
DATA_DIR = 'C:/Users/ruben.ros/Documents/REACT/data'
# Publication years included in the diachronic export (1995 .. 2019 inclusive);
# documents dated outside this range are left out of the per-year shares.
YEAR_RANGE = range(1995, 2020)

# For every photo: cluster the stored UMAP embedding with a Gaussian mixture,
# derive topic vectors / top words from the Doc2Vec model, and write
#   * results/diachronic-plots/<photo>-data.csv : per-year share of each cluster
#   * results/top-words/<photo>-data.csv        : top words per cluster
for photo in photos:
    ## Determine number of clusters
    # One cluster per row of the evaluation file's 'cluster' column.
    eval_df = pd.read_csv(f'{DATA_DIR}/evaluation/results-{photo}-bicaic-eval.csv')
    num_clus = len(eval_df['cluster'])
    print(num_clus, "clusters in ", photo)

    ## Load UMAP embeddings
    umap_emb = np.loadtxt(f'{DATA_DIR}/embeddings/umap/{photo}-umap-embeddings.csv', delimiter=',')

    ## GMM Clustering
    # NOTE: despite its name, gmm_model holds the membership probabilities
    # returned by gmm_clustering (one row per document), not a fitted model.
    gmm_model = gmm_clustering(umap_emb,num_clus)

    ## Load Doc2Vec
    model = Doc2Vec.load(f'{DATA_DIR}/embeddings/doc2vec/doc2vec-{photo}-e75.model')
    model.init_sims(replace=False)

    ## Get Topic Vectors and Top Words
    topic_vectors_ = get_topic_vectors(model,gmm_model,num_clus)
    topic_words,topic_word_scores = get_topic_words(model,topic_vectors_)

    ## Export Cluster Distributions
    metadata_df = pd.read_csv(f'{DATA_DIR}/embeddings/metadata/{photo}-metadata.csv')
    # Columns are addressed by position: first column -> [second, third, fourth].
    # Below, the second column is used as document id and the third as year.
    # itertuples avoids the deprecated positional `row[0]` lookup on a
    # label-indexed Series that iterrows() required.
    metadata = {row[0]: [row[1], row[2], row[3]]
                for row in metadata_df.itertuples(index=False, name=None)}

    rows = []
    for doc_index, probs in enumerate(gmm_model):
        entry = metadata.get(doc_index)  # O(1) lookup instead of rebuilding a key list per document
        if entry is None:
            continue  # document has no metadata row
        id_, year = entry[0], entry[1]
        # read_csv parses empty / "nan" cells as float NaN, which never equals
        # the string "nan"; test for missing values explicitly so they cannot
        # reach the integer cast below. The string checks are kept for parity
        # with the previous filter.
        if pd.isna(year) or year == "nan" or id_ == "nan":
            continue
        rows.append([id_, year] + probs.tolist())

    cluster_cols = ["c" + str(k) for k in range(num_clus)]
    data = pd.DataFrame(rows, columns=["id", "year"] + cluster_cols)
    data['year'] = data['year'].astype(int)

    # Human-readable label per cluster: the leading top words joined by spaces.
    # NOTE(review): the number of words kept equals num_clus, which looks
    # accidental (a fixed label length was probably intended). Left unchanged
    # because it determines the CSV column headers. Two clusters sharing the
    # same leading words would produce duplicate column names - verify.
    topwords = {"c" + str(k): " ".join(" ".join(words).split(' ')[:num_clus])
                for k, words in enumerate(topic_words)}
    label_cols = [topwords[c] for c in cluster_cols]
    data.columns = ["id", "year"] + label_cols

    # Sum the membership probabilities per cluster for each year that has documents.
    yearly_sums = {}
    for year in YEAR_RANGE:
        ss = data[data['year'] == year]
        if len(ss) == 0:
            continue
        yearly_sums[str(year)] = [ss[label].sum() for label in label_cols]
    d = pd.DataFrame(yearly_sums)

    # Normalise every year so its cluster shares sum to 1.
    for year in d.columns:
        d[year] = d[year] / d[year].sum()

    # One row per year, one column per cluster label.
    d = d.T.reset_index()
    d.columns = ['year'] + [topwords['c' + str(c)] for c in list(d.columns)[1:]]
    d.to_csv(f'{DATA_DIR}/results/diachronic-plots/{photo}-data.csv',index=False)

    ## Generate Top Words
    # One column per cluster (numbered from 1), one row per word rank.
    twdf = pd.DataFrame(topic_words).T
    twdf.columns = [f"cluster_{k}" for k in range(1, num_clus + 1)]
    twdf.to_csv(f'{DATA_DIR}/results/top-words/{photo}-data.csv',index=False)

13 clusters in  AbuGhraib
12 clusters in  AlanKurdi
8 clusters in  Anasuma
19 clusters in  Berlin
13 clusters in  Camp
17 clusters in  Che
16 clusters in  ChildVulture
4 clusters in  ChileCoup
13 clusters in  FallingMan
13 clusters in  Ghandi
19 clusters in  Hindenburg
10 clusters in  IwoJima
14 clusters in  KentState
16 clusters in  ManMoon
3 clusters in  Mao
23 clusters in  MigrantMother
19 clusters in  Monk
15 clusters in  NapalmGirl
13 clusters in  Plane911
4 clusters in  Rwanda
10 clusters in  SharbatGula
9 clusters in  SpanishSoldier
14 clusters in  TankMan
15 clusters in  TimesSquareKiss
8 clusters in  VietCong
7 clusters in  WarRoom
