In [3]:
from typing import List, Tuple
import random
import os, sys
sys.path.append(os.path.abspath('..'))

from tqdm import tqdm
import gensim
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import numpy
import torch
import torch.nn
from sentence_transformers import SentenceTransformer

from fame.text_processing.text_processor import TextProcessor
from fame.text_processing.token_processor import TokenProcessor
from fame.topic_modeling.cortex.model.autoencoder import MLPAutoEncoder
from fame.topic_modeling.cortex.pipeline.bert_lda import TransformerLDATopicModelingPipeline



In [4]:
# Number of LDA topics; added to 768 below to form the autoencoder's input/output width.
number_of_lda_topics = 10

In [5]:
# Width passed as the autoencoder's input_output_dim. The stacked
# representations displayed later in this notebook have 778 columns, which
# matches 768 + the 10 LDA topics.
autoencoder_io_dim = 768 + number_of_lda_topics
autoencoder = MLPAutoEncoder(
    hidden_layers=[512, 64, 32],
    input_output_dim=autoencoder_io_dim,
)

In [6]:
# Build the topic-modeling pipeline around the autoencoder configured earlier.
pipeline = TransformerLDATopicModelingPipeline(autoencoder=autoencoder)

In [7]:
# Toy corpus: two fixed sentences alternated 200 times (400 documents).
# Built with list repetition rather than `for _ in range(...)` so the cell does
# not bind `_` in the notebook namespace; once user code assigns `_`, IPython
# stops updating it with the last cell output.
sample_corpus = [
    "that is idiotic funny up",
    "i wanted to tell you this is wrong my friend",
] * 200

In [8]:
# Run the pipeline's preprocessing over the toy corpus. The call returns three
# values, which are unpacked into the names below.
(
    preprocessed_text_list,
    preprocessed_tokens_list,
    indices,
) = pipeline.preprocess_and_get_text_and_tokens(text_list=sample_corpus)

100%|██████████| 400/400 [00:00<00:00, 1367.38it/s]


In [9]:
# Prepare the pipeline's LDA model from the preprocessed token lists.
pipeline.prepare_lda_model(tokens_list=preprocessed_tokens_list)

In [10]:
# Stacked representations for the corpus, computed from both the preprocessed
# texts and their token lists.
reps = pipeline.get_stacked_representations(
    tokens_list=preprocessed_tokens_list,
    text_list=preprocessed_text_list,
)

In [11]:
# Display the shape of the stacked representations for the corpus.
reps.shape

(400, 778)

In [12]:
# Train the autoencoder on the corpus. The call is kept as the cell's last
# expression so that whatever it returns is displayed.
autoencoder_training_options = dict(number_of_epochs=5, batch_size=56, shuffle=True)
pipeline.prepare_autoencoder(
    text_list=preprocessed_text_list,
    tokens_list=preprocessed_tokens_list,
    **autoencoder_training_options,
)

Epoch: 4 / Loss: 0.17547624558210373: 100%|██████████| 5/5 [00:00<00:00,  6.42it/s]


[1.1813539266586304,
 0.5059151686728001,
 0.31697307527065277,
 0.23842306435108185,
 0.17547624558210373]

In [13]:
# Preprocess a single unseen sentence.
# NOTE: this rebinds the three names that previously held the results for the
# 400-document corpus.
new_texts = ["this is a new one"]
preprocessed_text_list, preprocessed_tokens_list, indices = (
    pipeline.preprocess_and_get_text_and_tokens(new_texts)
)

100%|██████████| 1/1 [00:00<00:00, 676.17it/s]


In [14]:
# Stacked representations for the newly preprocessed sentence; this replaces
# the corpus-level `reps` computed earlier.
reps = pipeline.get_stacked_representations(
    text_list=preprocessed_text_list,
    tokens_list=preprocessed_tokens_list,
)

In [15]:
# Display the shape of the stacked representations for the single new sentence.
reps.shape

(1, 778)

In [18]:
# Fit the pipeline's clustering step on placeholder random data.
# The feature width is read from `reps` instead of a hard-coded 778, so it
# keeps matching the stacked representations if `number_of_lda_topics` changes.
# The generator is seeded so the same input matrix is produced on every run.
clustering_rng = numpy.random.default_rng(0)
pipeline.train_clustering(clustering_rng.random((5, reps.shape[1])))

In [19]:
# Ask the pipeline for cluster labels of a fresh batch of random vectors.
# The input is unseeded, so the displayed labels may differ between runs.
random_representations = numpy.random.rand(5, 778)
pipeline.label_representation_cluster(random_representations)

array([0, 1, 1, 0, 4], dtype=int32)