In [23]:
from bertopic import BERTopic

import os
import numpy as np
import pandas as pd
import string

import umap
from nltk.corpus import stopwords

from sklearn.mixture import GaussianMixture
from sklearn.feature_extraction.text import CountVectorizer

from ctfidf import CTFIDFVectorizer

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [18]:
stopwords_de = stopwords.words('german')
stopwords_en = stopwords.words('english')

with open(os.getcwd()+'/../../conf/stopwords_de.txt', 'r') as f:
    stopwords_de_extra = f.read().splitlines()

stopwords_full = []
stopwords_full.extend(stopwords_de)
stopwords_full.extend(stopwords_en)
stopwords_full.extend(stopwords_de_extra)

stopwords_full = [word.lower() for word in stopwords_full]

stop_all = set(stopwords_full + list(string.punctuation))

In [35]:
final_df = pd.read_pickle(os.getcwd()+'/../sqlite_db/dataframes/robotik_old/final_df.pkl')
final_df.head(2)

Unnamed: 0,id,query,gt_rank,text,lang,title,pubDate,label,text_len,doc_vec,noun_chunks,nounchunk_mean_vec,keywords,keyword_mean_vec,mean_sim_summ,adjectives,verbs
0,210705_news_107059,robotik,3,"""Die Menschen möchten künftig nicht einfach nu...",de,Accenture: Unternehmen müssen sich auf nutzerz...,2020-02-18 11:16:00,1,503,"[[0.0050459006, 0.0023063626, 0.04180066, 0.00...","[der Mensch, nur mehr Technologie, sie, Techno...","[-0.017144503, -0.007727461, -0.012556386, 0.0...","[(mit innovation, 0.4053), (darq technologien,...","[-0.03808231, 0.011605878, 0.00039789037, -0.0...",0.06239,"[veröffentlicht, maßgebend, groß, hergebracht,...","[beschreiben, stellen, rücken, abstimmen, müss..."
1,210705_news_116431,robotik,2,Der Düsseldorfer Rüstungs- und Automotive-Konz...,de,Australien Rheinmetall forscht an automatisier...,2020-02-27 08:08:09,2,596,"[[-0.030772485, -0.012720297, 0.058562107, 0.0...",[der Düsseldorfer Rüstung und Automotive-Konze...,"[-0.014299056, -0.00045433754, -0.0038481976, ...","[(erstes australisches, 0.3794), (automatisier...","[-0.01195763, 0.009942288, 0.012103766, -0.001...",0.072432,"[Düsseldorfer, eigen, erster, australisch, aus...","[initiieren, zusammenarbeiten, schreiben, ents..."


In [36]:
final_df.gt_rank.value_counts()

2    13
1     9
3     7
Name: gt_rank, dtype: int64

In [21]:
def remove_stop_words(text):
    
    word_list = []
    text = text.lower()
    
    for word_token in text.split():
        if word_token not in stop_all:
            word_list.append(word_token)
            
    return ' '.join(word_list)

In [39]:
final_df['text_clean'] = final_df.apply(lambda x:remove_stop_words(x['text']), axis=1)

In [40]:
final_df['dummy'] = 1

In [24]:
def get_umap_output(vec_array, dim_size=5):
    
    umap_obj = umap.UMAP(n_neighbors=40, 
                        n_components=dim_size, 
                        min_dist=0.01,
                        metric='cosine',
                        random_state=123).fit(vec_array) 
    
    umap_output = umap_obj.transform(vec_array) 
    return umap_output, umap_obj

def get_gmm_output(data_points, cluster_size=7):
    
    gmm_output = GaussianMixture(n_components=cluster_size, covariance_type='full', random_state=12).fit(data_points)
    
    return gmm_output

In [42]:
def get_topwords_per_class(df, topn=10):
    
    docs = pd.DataFrame({'Document': df.text_clean, 'Class': df.cluster_id})
    docs_per_class = docs.groupby(['Class'], as_index=False).agg({'Document': ' '.join})

    # Create bag of words
    count_vectorizer = CountVectorizer().fit(docs_per_class.Document)
    count = count_vectorizer.transform(docs_per_class.Document)
    words = count_vectorizer.get_feature_names()

    # Extract top 10 words
    ctfidf = CTFIDFVectorizer().fit_transform(count, n_samples=len(docs)).toarray()
    words_per_class = {label: [words[index] for index in ctfidf[label].argsort()[-topn:]] for label in docs_per_class.Class}
    
    return words_per_class

def get_analysis_df(words_per_class, df, label_class):
    
    df_top10 = pd.DataFrame(words_per_class).T
    df_top10['cluster_id'] = df_top10.index

    df_map = pd.pivot_table(df, columns=label_class, index=['cluster_id'], aggfunc='count', values='dummy').reset_index()
    # df_map.drop('doc_label', axis=1, inplace=True)
    df_map.rename(columns={0: 'wrong', 1: 'irrelevant', 2:'partially relevant', 3:'relevant'}, inplace=True)

    final_df = df_map.merge(df_top10, on='cluster_id', how='inner')
    final_df.fillna(0, inplace=True)
#     final_df['total'] = final_df.apply(lambda x:x['neg']+x['pos'],axis=1)
#     final_df['pos_ratio'] = final_df.apply(lambda x:x['pos']/x['total'],axis=1)
#     final_df.sort_values('pos', ascending=False, inplace=True)
    
#     clustering_score = 0
#     pos_ratio_limit = 0.3
    
#     for pos, pos_ratio in zip(final_df.pos.values, final_df.pos_ratio.values):
#         if pos > 1 and pos_ratio >= pos_ratio_limit:
#             clustering_score += 1
            
#     final_df = final_df[((final_df['pos'] > 1) & (final_df['pos_ratio'] >= pos_ratio_limit)) | (final_df['cluster_id'] == -1)]
            
    return final_df

def get_clustering_analysis(df, label_class, col='doc_vec', dimen_size=5, cluster_size=7, topn=10):
    
    vecs = []
    for vec in df[col].values:
        vecs.append(vec.tolist())
    vec_array = np.array(vecs)
    
    vec_array = vec_array.reshape(vec_array.shape[0], vec_array.shape[-1])
    print(vec_array.shape)
    
    umap_output_5, umap_5 = get_umap_output(vec_array, dim_size=dimen_size)
    gmm = get_gmm_output(umap_output_5, cluster_size=cluster_size)

    df['cluster_id'] = gmm.predict(umap_output_5)
    words_per_class = get_topwords_per_class(df, topn)
    
    analysis_df = get_analysis_df(words_per_class, df, label_class)
    
    return analysis_df

In [43]:
analysis_df = get_clustering_analysis(final_df, 'gt_rank', col='doc_vec', dimen_size=5, cluster_size=4, topn=15)
analysis_df

(29, 512)
{0: ['munich_i', 'algorithmen', 'roboters', 'systeme', 'menschliche', 'tu', 'automatica', 'pitfall', 'forscher', 'roboter', 'robotik', 'lernen', 'explore', 'go', 'ki'], 1: ['hand', 'nicobo', 'omron', 'bewegungen', 'roboterhand', 'forscher', 'cassie', 'experiment', 'konferenz', 'können', 'robotik', 'robotern', 'university', 'menschen', 'roboter'], 2: ['baustellen', 'robmosys', 'steuerung', 'report', 'polizei', 'werkzeuge', 'rahmen', 'bauroboter', 'entwicklung', 'software', 'unternehmen', 'rheinmetall', 'ros', 'werden', 'roboter'], 3: ['wasser', 'bewegen', 'werden', 'einsatz', 'gelände', 'exomy', 'rheinmetall', 'robotik', 'roboter', 'sparrow', 'nasa', 'esa', 'slothbot', 'master', 'mission']}


Unnamed: 0,cluster_id,irrelevant,partially relevant,relevant,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,0,3,3,2,munich_i,algorithmen,roboters,systeme,menschliche,tu,automatica,pitfall,forscher,roboter,robotik,lernen,explore,go,ki
1,1,2,6,1,hand,nicobo,omron,bewegungen,roboterhand,forscher,cassie,experiment,konferenz,können,robotik,robotern,university,menschen,roboter
2,2,1,3,3,baustellen,robmosys,steuerung,report,polizei,werkzeuge,rahmen,bauroboter,entwicklung,software,unternehmen,rheinmetall,ros,werden,roboter
3,3,3,1,1,wasser,bewegen,werden,einsatz,gelände,exomy,rheinmetall,robotik,roboter,sparrow,nasa,esa,slothbot,master,mission


In [26]:
def get_clustering_result(label, cluster_cnt):
    
    train_df_sample = train_df[train_df['label'] == label]
    analysis_df = get_clustering_analysis(train_df_sample, 'label', col='doc_vec', dimen_size=5, cluster_size=cluster_cnt, topn=15)
    
    return analysis_df

In [None]:
get_clustering_result(1, cluster_cnt=20)

In [11]:
topic_model = BERTopic(embedding_model="sentence-transformers/paraphrase-xlm-r-multilingual-v1")
topics, probs = topic_model.fit_transform(final_df.text.values)

In [12]:
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,-1,21,-1_die_der_und_in


In [14]:
topic_model.visualize_topics()

IndexError: arrays used as indices must be of integer (or boolean) type