In [1]:
%%capture
!pip install contextualized-topic-models==2.2.0


In [2]:
%%capture
!pip install pyldavis

In [3]:
%load_ext autoreload
%autoreload 2

In [4]:

try:
  import google.colab
  IN_COLAB = True
  

  from google.colab import drive
  drive.mount('/content/gdrive')
except:
  IN_COLAB = False

Mounted at /content/gdrive


In [5]:
#@title Link your assignment folder & install requirements
#@markdown Enter the path to the assignment folder in your Google Drive
# If you run this notebook locally or on a cluster (i.e. not on Google Colab)
# you can delete this cell which is specific to Google Colab. You may also
# change the paths for data/logs in Arguments below.
import sys
import os
import shutil
import warnings
import torch
if IN_COLAB:
    folder = "/content/gdrive/MyDrive/causal-text" #@param {type:"string"}
    !ln -Ts $folder /content/causal-text 2> /dev/null

    # Add the assignment folder to Python path
    if '/content/causal-text/src' not in sys.path:
        sys.path.insert(0, '/content/causal-text/src')

    # Install requirements
    # !pip install -qr /content/MILA_BOYS_FINAL/requirements.txt
else:
    sys.path.insert(0, './')

# Check if CUDA is available
if not torch.cuda.is_available():
    warnings.warn('CUDA is not available.')


In [6]:
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# device = 'cpu'
CUDA=True

In [7]:
# Importing what we need
from contextualized_topic_models.models.ctm import ZeroShotTM
from contextualized_topic_models.utils.data_preparation import TopicModelDataPreparation
from contextualized_topic_models.utils.preprocessing import WhiteSpacePreprocessing
import nltk
import pandas as pd
import torch
import numpy as np
import random

In [8]:
#DATA_DIR = r'Q:\TIGER\patches\data'
if IN_COLAB:
    ROOT_DIR = 'causal-text/src/'
    DATA_DIR = "MILA_BOYS_FINAL/Data/process_data"
else:
    ROOT_DIR = './'
    DATA_DIR = "../Data/process_data"

In [9]:
def fix_seeds():
  torch.manual_seed(10)
  torch.cuda.manual_seed(10)
  np.random.seed(10)
  random.seed(10)
  torch.backends.cudnn.enabled = False
  torch.backends.cudnn.deterministic = True
fix_seeds()

### Checking Data

In [10]:
df = pd.read_csv(f"{ROOT_DIR}/music_preprocessed.csv")
df['text'] = df['text'].map(lambda x: x.lower() if isinstance(x,str) else x)

In [11]:
df[df['text'].str.contains(" da ")]
# .filter(lambda x: x.contains(' da')).head()

Unnamed: 0.1,Unnamed: 0,index,id,rating,product,text,summary,price,T_true,T_proxy,C_true,Y_sim,Y_1,Y_0
1516,1522,4293,B0028C3AKA,5.0,mp3 music,i just saw black violin perform a show here in...,Black Violin is w/o equal. THESE BROTHA'S ARE ...,59.67,1,1,0,0,0,0
1534,1540,4330,B002F4XW18,5.0,mp3 music,"this is an mp3 version of the 20-cd set ""the g...","Rare gems, fine performances, bargain basement...",1.39,1,1,0,1,1,0
1535,1541,4336,B000ELBNC4,2.0,audio cd,r u serious? jeezy gettin mo and mo commercial...,Dis cant be Jeezyyyyy,39.99,0,0,1,1,1,1
6209,6227,16855,B000025C4P,5.0,audio cd,i have owned and loved this recording since it...,Mature Handel sung by an unbeatable cast,25.62,1,1,1,1,1,1
6238,6256,16967,B00002607F,5.0,audio cd,i love lute music. and da milano compositions ...,Beautiful Music,12.19,1,1,1,1,0,0
6448,6467,17674,B00004SX1K,5.0,audio cd,i think its great and catchy -grandkids love i...,really good,4.28,1,1,1,1,1,0
6873,6893,19078,B00006YXDO,5.0,audio cd,"mos def [2002] we are hip-hop... me, you, ever...","Best of Mos Def, mos def.",59.99,1,1,1,0,1,0
7056,7076,19631,B0000QWYQW,5.0,audio cd,i am amazed that rca did not actually issue th...,At last the three early symphonies with Ormandy.,19.53,1,1,1,0,1,0
7157,7177,19987,B0001PICSG,5.0,mp3 music,for fans of da their early material contained ...,Love early DA,54.86,1,1,0,1,1,1
7163,7183,20006,B0001UCS86,5.0,audio cd,graziano mandozzi performs on the synthesizer ...,Track Listing and Other Info.,4.03,1,1,1,0,1,0


In [12]:
from nltk.corpus import stopwords as stop_words
nltk.download('stopwords')

sp = WhiteSpacePreprocessing(df['text'], stopwords_language='english')
sp.stopwords = sp.stopwords.union(set(stop_words.words("french"))) ## Adding French words to stopwords
preprocessed_documents, unpreprocessed_corpus, vocab = sp.preprocess()

### TODO: Shift this inherited class to a new python file

In [16]:
import numpy as np
from sentence_transformers import SentenceTransformer
import scipy.sparse
import warnings
from contextualized_topic_models.datasets.dataset import CTMDataset
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import OneHotEncoder
# from CausalBert import CausalBert
from transformers import DistilBertTokenizer
from transformers import DistilBertModel, DistilBertPreTrainedModel
from tqdm.autonotebook import trange
from typing import List, Dict, Tuple, Iterable, Type, Union, Callable, Optional
from torch import Tensor
from numpy import ndarray
from contextualized_topic_models.utils.data_preparation import TopicModelDataPreparation

def batch_to_device(batch, target_device):
    """
    send a pytorch batch to a device (CPU/GPU)
    """
    for key in batch:
        if isinstance(batch[key], torch.Tensor):
            batch[key] = batch[key].to(target_device)
    return batch
class MySentenceTransformer(SentenceTransformer):
    def encode(self, sentences: Union[str, List[str]],
               batch_size: int = 32,
               show_progress_bar: bool = None,
               output_value: str = 'sentence_embedding',
               convert_to_numpy: bool = True,
               convert_to_tensor: bool = False,
               device: str = None,
               normalize_embeddings: bool = False) -> Union[List[Tensor], ndarray, Tensor]:
        """
        Computes sentence embeddings

        :param sentences: the sentences to embed
        :param batch_size: the batch size used for the computation
        :param show_progress_bar: Output a progress bar when encode sentences
        :param output_value:  Default sentence_embedding, to get sentence embeddings. Can be set to token_embeddings to get wordpiece token embeddings. Set to None, to get all output values
        :param convert_to_numpy: If true, the output is a list of numpy vectors. Else, it is a list of pytorch tensors.
        :param convert_to_tensor: If true, you get one large tensor as return. Overwrites any setting from convert_to_numpy
        :param device: Which torch.device to use for the computation
        :param normalize_embeddings: If set to true, returned vectors will have length 1. In that case, the faster dot-product (util.dot_score) instead of cosine similarity can be used.

        :return:
           By default, a list of tensors is returned. If convert_to_tensor, a stacked tensor is returned. If convert_to_numpy, a numpy matrix is returned.
        """
        self.eval()
        if convert_to_tensor:
            convert_to_numpy = False

        if output_value != 'sentence_embedding' or output_value != 'cls_embedding':
            convert_to_tensor = False
            convert_to_numpy = False

        input_was_string = False
        if isinstance(sentences, str) or not hasattr(sentences, '__len__'): #Cast an individual sentence to a list with length 1
            sentences = [sentences]
            input_was_string = True

        if device is None:
            device = self._target_device

        self.to(device)

        all_embeddings = []
        length_sorted_idx = np.argsort([-self._text_length(sen) for sen in sentences])
        sentences_sorted = [sentences[idx] for idx in length_sorted_idx]

        for start_index in trange(0, len(sentences), batch_size, desc="Batches", disable=not show_progress_bar):
            sentences_batch = sentences_sorted[start_index:start_index+batch_size]
            features = self.tokenize(sentences_batch)
            features = batch_to_device(features, device)

            with torch.no_grad():
                out_features = self.forward(features)
                # print(out_features)
                if output_value == 'cls_embedding':
                    embeddings = out_features['token_embeddings'][:,0]
                    embeddings = embeddings.detach()
                    # fixes for #522 and #487 to avoid oom problems on gpu with large datasets
                    # if convert_to_numpy:
                    embeddings = embeddings.cpu()
                elif output_value == 'token_embeddings':
                    embeddings = []
                    for token_emb, attention in zip(out_features[output_value], out_features['attention_mask']):
                        last_mask_id = len(attention)-1
                        while last_mask_id > 0 and attention[last_mask_id].item() == 0:
                            last_mask_id -= 1

                        embeddings.append(token_emb[0:last_mask_id+1])
                elif output_value is None:  #Return all outputs
                    embeddings = []
                    for sent_idx in range(len(out_features['sentence_embedding'])):
                        row =  {name: out_features[name][sent_idx] for name in out_features}
                        embeddings.append(row)
                else:   #Sentence embeddings
                    embeddings = out_features[output_value]
                    embeddings = embeddings.detach()
                    if normalize_embeddings:
                        embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1)

                    # fixes for #522 and #487 to avoid oom problems on gpu with large datasets
                    if convert_to_numpy:
                        embeddings = embeddings.cpu()

                all_embeddings.extend(embeddings)

        all_embeddings = [all_embeddings[idx] for idx in np.argsort(length_sorted_idx)]

        if convert_to_tensor:
            all_embeddings = torch.stack(all_embeddings)
        elif convert_to_numpy:
            all_embeddings = np.asarray([emb.numpy() for emb in all_embeddings])

        if input_was_string:
            all_embeddings = all_embeddings[0]

        return all_embeddings


In [17]:

def my_bert_embeddings_from_list(texts, sbert_model_to_load, batch_size=200):
    """
    Creates BERT Embeddings from a list
    """
    output_value = 'token_embeddings'
    model = MySentenceTransformer(sbert_model_to_load)
    return np.array(model.encode(texts, show_progress_bar=True,batch_size=batch_size,output_value='cls_embedding'))
class MyTopicModelDataPreparation(TopicModelDataPreparation):
    def fit(self, text_for_contextual, text_for_bow, labels=None):
        """
        This method fits the vectorizer and gets the embeddings from the contextual model

        :param text_for_contextual: list of unpreprocessed documents to generate the contextualized embeddings
        :param text_for_bow: list of preprocessed documents for creating the bag-of-words
        :param labels: list of labels associated with each document (optional).

        """

        if self.contextualized_model is None:
            raise Exception("You should define a contextualized model if you want to create the embeddings")

        # TODO: this count vectorizer removes tokens that have len = 1, might be unexpected for the users
        self.vectorizer = CountVectorizer()

        train_bow_embeddings = self.vectorizer.fit_transform(text_for_bow)
        train_contextualized_embeddings = my_bert_embeddings_from_list(text_for_contextual, self.contextualized_model)
        self.vocab = self.vectorizer.get_feature_names()
        self.id2token = {k: v for k, v in zip(range(0, len(self.vocab)), self.vocab)}

        if labels:
            self.label_encoder = OneHotEncoder()
            encoded_labels = self.label_encoder.fit_transform(np.array([labels]).reshape(-1, 1))
        else:
            encoded_labels = None

        return CTMDataset(train_contextualized_embeddings, train_bow_embeddings, self.id2token, encoded_labels)


    def transform(self, text_for_contextual, text_for_bow=None, custom_embeddings=None, labels=None):
        """
        This method create the input for the prediction. Essentially, it creates the embeddings with the contextualized
        model of choice and with trained vectorizer.
        If text_for_bow is missing, it should be because we are using ZeroShotTM
        :param text_for_contextual: list of unpreprocessed documents to generate the contextualized embeddings
        :param text_for_bow: list of preprocessed documents for creating the bag-of-words
        :param custom_embeddings: np.ndarray type object to use custom embeddings (optional).
        :param labels: list of labels associated with each document (optional).
        """

        if custom_embeddings is not None:
            assert len(text_for_contextual) == len(custom_embeddings)

            if text_for_bow is not None:
                assert len(custom_embeddings) == len(text_for_bow)

        if text_for_bow is not None:
            assert len(text_for_contextual) == len(text_for_bow)

        if self.contextualized_model is None:
            raise Exception("You should define a contextualized model if you want to create the embeddings")

        if text_for_bow is not None:
            test_bow_embeddings = self.vectorizer.transform(text_for_bow)
        else:
            # dummy matrix
            if self.show_warning:
                warnings.simplefilter('always', DeprecationWarning)
                warnings.warn("The method did not have in input the text_for_bow parameter. This IS EXPECTED if you "
                          "are using ZeroShotTM in a cross-lingual setting")

            # we just need an object that is matrix-like so that pytorch does not complain
            test_bow_embeddings = scipy.sparse.csr_matrix(np.zeros((len(text_for_contextual), 1)))

        if custom_embeddings is None:
            test_contextualized_embeddings = my_bert_embeddings_from_list(text_for_contextual, self.contextualized_model)
        else:
            test_contextualized_embeddings = custom_embeddings

        if labels:
            encoded_labels = self.label_encoder.transform(np.array([labels]).reshape(-1, 1))
        else:
            encoded_labels = None

        return CTMDataset(test_contextualized_embeddings, test_bow_embeddings, self.id2token, encoded_labels)




## Preparing Contextualised Training Set for Topic Model

In [19]:
tp = MyTopicModelDataPreparation("distilbert-base-uncased")


In [20]:
unpreprocessed_corpus[0]

'clever,inspired and moving. this is a great album from one of the finest christian artists ever. god took him home early, but not before leaving us with a great message and music. keith green makes bible lessons come to life with his music.'

In [21]:
preprocessed_documents[0]

'clever inspired moving great album one finest christian artists ever god took home early leaving us great message music keith green makes come life music'

In [35]:
# tp = TopicModelDataPreparation("paraphrase-distilroberta-base-v1")


training_dataset = tp.fit(text_for_contextual=unpreprocessed_corpus, text_for_bow=preprocessed_documents)

Downloading:   0%|          | 0.00/391 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/8.54k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/268M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Some weights of the model checkpoint at /root/.cache/torch/sentence_transformers/distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Batches:   0%|          | 0/107 [00:00<?, ?it/s]

  
  


### Estimationg Number of Topics using Coherence score

In [None]:
# from contextualized_topic_models.evaluation.measures import CoherenceNPMI, InvertedRBO
# corpus = [d.split() for d in preprocessed_documents]

# num_topics = [5, 10, 15, 20]
# num_runs = 5

# best_topic_coherence = -999
# best_num_topics = 0
# for n_components in num_topics:
#   for i in range(num_runs):
#     print("num topics:", n_components, "/ num run:", i)
#     ctm = ZeroShotTM(bow_size=len(tp.vocab), contextual_size=768, 
#                      n_components=n_components, num_epochs=50)
#     ctm.fit(training_dataset) # run the model
#     coh = CoherenceNPMI(ctm.get_topic_lists(10), corpus)
#     coh_score = coh.score()
#     print("coherence score:", coh_score)
#     if best_topic_coherence < coh_score:
#       best_topic_coherence = coh_score
#       best_num_topics = n_components
#     print("current best coherence", best_topic_coherence, "/ best num topics", best_num_topics)

### Fit Topic Model

In [38]:
ctm = ZeroShotTM(bow_size=len(tp.vocab), contextual_size=768, n_components=20, num_epochs=100)


In [None]:
ctm.fit(training_dataset) # run the model

Epoch: [100/100]	 Seen Samples: [2120200/2120200]	Train Loss: 196.27857510448234	Time: 0:00:08.410657: : 100it [14:52,  8.93s/it]


In [None]:
ctm.save(f"{ROOT_DIR}/ctm_full_new")



In [None]:
ctm.get_topic_lists(5)

[['like', 'song', 'album', 'killers', 'band'],
 ['hair', 'bowie', 'spring', 'devil', 'blu'],
 ['songs', 'song', 'music', 'cd', 'video'],
 ['return', 'order', 'ordered', 'received', 'com'],
 ['fast', 'described', 'delivery', 'quickly', 'shipping'],
 ['hair', 'mark', 'following', 'serious', 'spring'],
 ['songs', 'music', 'love', 'voice', 'song'],
 ['mix', 'hip', 'remix', 'hop', 'club'],
 ['music', 'car', 'relaxing', 'kids', 'listen'],
 ['music', 'one', 'score', 'film', 'surround'],
 ['vinyl', 'quality', 'cd', 'original', 'sound'],
 ['lord', 'god', 'jesus', 'christ', 'presence'],
 ['live', 'great', 'band', 'rock', 'must'],
 ['album', 'dolly', 'songs', 'years', 'record'],
 ['product', 'album', 'arp', 'nbsp', 'cr'],
 ['beethoven', 'violin', 'concerto', 'movement', 'symphony'],
 ['beautiful', 'recommend', 'young', 'highly', 'voice'],
 ['floyd', 'pink', 'wright', 'album', 'endless'],
 ['memories', 'glad', 'happy', 'childhood', 'kid'],
 ['love', 'orch', 'victor', 'billboard', 'side']]

In [None]:
lda_vis_data = ctm.get_ldavis_data_format(tp.vocab, training_dataset,20)

Sampling: [20/20]: : 20it [02:18,  6.95s/it]


In [None]:
import pyLDAvis as vis
movies_pd = vis.prepare(**lda_vis_data)
vis.display(movies_pd)

  from collections import Iterable
  from collections import Mapping
  return f(*args, **kwds)
  by='saliency', ascending=False).head(R).drop('saliency', 1)


In [None]:
train_topics_predictions = ctm.get_thetas(training_dataset, n_samples=5) # get all the topic predictions

Sampling: [5/5]: : 5it [00:37,  7.47s/it]


In [None]:
preprocessed_documents[0] # see the text of our preprocessed document

'clever inspired moving great album one finest christian artists ever god took home early leaving us great message music keith green makes come life music'

In [23]:
import numpy as np
train_topic_number = np.argmax(train_topics_predictions[0]) # get the topic id of the first document

NameError: ignored

In [None]:
ctm.get_topic_lists(5)[train_topic_number] #and the topic should be about natural location related things

['lord', 'god', 'jesus', 'christ', 'presence']

In [22]:
def get_topic(unpreprocessed_corpus,preprocessed_corpus,ctm,topics_predictions):
    # df = pd.Series()
    topic_dict= {}
    topic_dict['text'] = unpreprocessed_corpus
    topic_dict['preprocessed_text'] = preprocessed_corpus
    topic_dict['topic_probs'] = list(topics_predictions)
    # df['best_topic'] = ctm.predict(df)

    best_topic = []
    topics = []
    for i in trange(len(unpreprocessed_corpus)):
        best_topic_number = np.argmax(topics_predictions[i]) # get the topic id of the first document
        dtopics = ctm.get_topic_lists(5)[best_topic_number] #and the topic should be about natural location related things
        topics.append(dtopics)
        best_topic.append(best_topic_number)
        # break
    topic_dict['best_topic'] = best_topic
    topic_dict['topics'] = topics
    df = pd.DataFrame(topic_dict)

    return df

In [None]:
train_topic_df

Unnamed: 0,text,preprocessed_text,topic_probs,best_topic,topics
0,"clever,inspired and moving. this is a great al...",clever inspired moving great album one finest ...,"[0.062458381056785583, 0.010888376832008361, 0...",11,"[lord, god, jesus, christ, presence]"
1,keith green is a bit of legend in some christi...,keith green bit christian music influenced man...,"[0.022196136973798276, 0.004253175389021635, 0...",6,"[songs, music, love, voice, song]"
2,buy the cd. do not buy the mp3 album. downlo...,buy cd buy album download longer available fin...,"[0.00809968588873744, 0.025013037398457526, 0....",3,"[return, order, ordered, received, com]"
3,takes me back to my childhood!,takes back childhood,"[0.04591881223022938, 0.0995884284377098, 0.00...",5,"[hair, mark, following, serious, spring]"
4,i have fallen in love with john michael talbot...,love john michael music sing anything make pag...,"[0.015955587988719343, 0.04268949218094349, 0....",4,"[fast, described, delivery, quickly, shipping]"
...,...,...,...,...,...
21197,brilliant set list and the guys sound awesome ...,brilliant set list guys sound awesome usual li...,"[0.08793381154537201, 0.004686620691791177, 0....",2,"[songs, song, music, cd, video]"
21198,"the best kiss album yet,for the best price.",best kiss album yet best price,"[0.022170749492943287, 0.2010623760521412, 0.0...",1,"[hair, bowie, spring, devil, blu]"
21199,the reading of his poems are monotonous.,reading,"[0.008396916906349361, 0.33980768322944643, 0....",1,"[hair, bowie, spring, devil, blu]"
21200,anton batagov is a renowned composer and piani...,composer pianist taken four glass compositions...,"[0.013730157958343625, 0.002508265455253422, 0...",9,"[music, one, score, film, surround]"


In [None]:
train_topic_df = get_topic(unpreprocessed_corpus,preprocessed_documents,ctm,train_topics_predictions)



  0%|          | 0/21202 [00:00<?, ?it/s]

In [None]:
train_topic_df.to_csv(f"{ROOT_DIR}/train_topics.csv") 

## Loading Causal Bert with Casual Embeddings

In [24]:
import CausalBert
import torch

In [25]:
cbw = CausalBert.CausalBertWrapper(g_weight=0, Q_weight=0.1, mlm_weight=1)


Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/256M [00:00<?, ?B/s]

Some weights of CausalBert were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['Q_cls.0.0.bias', 'g_cls.weight', 'Q_cls.1.0.bias', 'Q_cls.0.2.weight', 'g_cls.bias', 'Q_cls.1.0.weight', 'Q_cls.0.0.weight', 'Q_cls.1.2.weight', 'Q_cls.0.2.bias', 'Q_cls.1.2.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model on GPU 0


In [26]:
cbw.model.load_state_dict(torch.load(f'{ROOT_DIR}/cb_T_plus_reg_no_mask.pt'))

<All keys matched successfully>

In [27]:
from transformers import DistilBertTokenizer, DistilBertModel
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = cbw.model.distilbert
# DistilBertModel.from_pretrained("distilbert-base-uncased")
# text = "Replace me by any text you'd like."
# encoded_input = tokenizer(text, return_tensors='pt')
# output = model(**encoded_input)

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

In [28]:
from tqdm import tqdm
from collections import defaultdict
from torch.utils.data import Dataset, TensorDataset, DataLoader, RandomSampler, SequentialSampler

def build_dataloader(texts, treatments=None, outcomes=None,
      tokenizer=None, sampler=None):
      def collate_CandT(data):
          # sort by (C, T), so you can get boundaries later
          # (do this here on cpu for speed)
          data.sort(key=lambda x: (x[1], x[2]))
          # boundaries = []
          # prev = None
          # data_by_level = defaultdict(list)
          # for i, x in enumerate(data):
          #     cur = x[1], x[2]
          #     data_by_level['%d%d' % (x[1].item(), x[2].item())].append(x)

          # data_by_level = {k: v for k, v in data_by_level.items()}
          # return data_by_level
          return data

      if tokenizer is None:
          tokenizer = DistilBertTokenizer.from_pretrained(
              'distilbert-base-uncased', do_lower_case=True)

      out = defaultdict(list)
      for i, W in enumerate(texts):
          # out['W_raw'].append(W)
          encoded_sent = tokenizer.encode_plus(W, add_special_tokens=True,
              max_length=128,
              pad_to_max_length=True)

          out['W_ids'].append(encoded_sent['input_ids'])
          out['W_mask'].append(encoded_sent['attention_mask'])
          out['W_len'].append(sum(encoded_sent['attention_mask']))

      data = (torch.tensor(out[x]) for x in ['W_ids', 'W_len', 'W_mask'])
      data = TensorDataset(*data)
      sampler = RandomSampler(data) if sampler == 'random' else SequentialSampler(data)
      dataloader = DataLoader(data, sampler=sampler, batch_size=32)
          # collate_fn=collate_CandT)

      return dataloader


In [29]:
test_dataloader = build_dataloader(unpreprocessed_corpus)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [30]:
torch.cuda.empty_cache()
# del all_embeddings
# del pooled_output

## Loading Causal Embeddings 

In [31]:
all_embeddings = []
for step, batch in tqdm(enumerate(test_dataloader), total=len(test_dataloader)):
      if CUDA: 
          batch = (x.cuda() for x in batch)
      W_ids, W_len, W_mask = batch
      # while True:
      model.eval()
      with torch.no_grad():
         outputs = model(W_ids, attention_mask=W_mask)
         seq_output = outputs[0]
         pooled_output = seq_output[:, 0].detach().cpu().numpy()
         all_embeddings.extend(pooled_output)



100%|██████████| 663/663 [01:15<00:00,  8.73it/s]


In [36]:
testing_dataset = tp.transform(text_for_contextual=unpreprocessed_corpus,text_for_bow =preprocessed_documents ,custom_embeddings=all_embeddings)

## RELOADING CTM Trained Before
* We load the CTM trained before and use it on perturbed embeddings (which are causally sufficient) . To estimate how the topic distribution has changed for same sentences. Therefore getting understanding the confounding topics

In [39]:
path = "ctm_full_new/contextualized_topic_model_nc_20_tpm_0.0_tpv_0.95_hs_prodLDA_ac_(100, 100)_do_softplus_lr_0.2_mo_0.002_rp_0.99"
# path0 = "ctm_full/contextualized_topic_model_nc_50_tpm_0.0_tpv_0.98_hs_prodLDA_ac_(100, 100)_do_softplus_lr_0.2_mo_0.002_rp_0.99"
ctm.load(f"{ROOT_DIR}/{path}",99)



In [40]:
lda_vis_data = ctm.get_ldavis_data_format(tp.vocab, testing_dataset,20)

Sampling: [20/20]: : 20it [01:42,  5.15s/it]


In [41]:
# lda_vis_data

In [42]:
import pyLDAvis as vis
movies_pd = vis.prepare(**lda_vis_data)
vis.display(movies_pd)

  from collections import Iterable
  from collections import Mapping
  return f(*args, **kwds)
  by='saliency', ascending=False).head(R).drop('saliency', 1)


## Inference on Causal Embeddings

In [None]:
test_topics_predictions = ctm.get_thetas(testing_dataset, n_samples=5) # get all the topic predictions


Sampling: [5/5]: : 5it [00:38,  7.69s/it]


In [None]:
unpreprocessed_corpus[0]

'clever,inspired and moving. this is a great album from one of the finest christian artists ever. god took him home early, but not before leaving us with a great message and music. keith green makes bible lessons come to life with his music.'

In [None]:
ctm.get_topic_lists(5)

[['like', 'song', 'album', 'killers', 'band'],
 ['hair', 'bowie', 'spring', 'devil', 'blu'],
 ['songs', 'song', 'music', 'cd', 'video'],
 ['return', 'order', 'ordered', 'received', 'com'],
 ['fast', 'described', 'delivery', 'quickly', 'shipping'],
 ['hair', 'mark', 'following', 'serious', 'spring'],
 ['songs', 'music', 'love', 'voice', 'song'],
 ['mix', 'hip', 'remix', 'hop', 'club'],
 ['music', 'car', 'relaxing', 'kids', 'listen'],
 ['music', 'one', 'score', 'film', 'surround'],
 ['vinyl', 'quality', 'cd', 'original', 'sound'],
 ['lord', 'god', 'jesus', 'christ', 'presence'],
 ['live', 'great', 'band', 'rock', 'must'],
 ['album', 'dolly', 'songs', 'years', 'record'],
 ['product', 'album', 'arp', 'nbsp', 'cr'],
 ['beethoven', 'violin', 'concerto', 'movement', 'symphony'],
 ['beautiful', 'recommend', 'young', 'highly', 'voice'],
 ['floyd', 'pink', 'wright', 'album', 'endless'],
 ['memories', 'glad', 'happy', 'childhood', 'kid'],
 ['love', 'orch', 'victor', 'billboard', 'side']]

In [None]:
topic_number = np.argmax(test_topics_predictions[0]) # get the topic id of the first document
ctm.get_topic_lists(10)[topic_number]

['lord',
 'god',
 'jesus',
 'christ',
 'presence',
 'kim',
 'faith',
 'bless',
 'praise',
 'heart']

In [None]:
test_topics_predictions[0]

array([6.09411545e-04, 2.73887303e-03, 7.59259920e-04, 1.28065728e-03,
       1.30734409e-03, 3.18439590e-03, 3.10614213e-02, 1.18144651e-03,
       7.16131696e-04, 1.52384148e-03, 1.46940770e-04, 6.49448550e-01,
       1.15196188e-03, 2.15917903e-03, 1.26393342e-03, 1.01093971e-03,
       2.93447673e-01, 1.10968813e-03, 7.58860097e-04, 5.13946943e-03])

In [None]:
test_topic_df = get_topic(unpreprocessed_corpus,preprocessed_documents,ctm,test_topics_predictions)
test_topic_df.to_csv(f"{ROOT_DIR}/test_topics.csv")

  0%|          | 0/21202 [00:00<?, ?it/s]

In [None]:
train_topic_df["test_topics"] = test_topic_df["topics"]
train_topic_df.to_csv(f"{ROOT_DIR}/combined_topics.csv")

In [None]:
topics_predictions[0]

array([0.06980675, 0.14921305, 0.05031555, 0.11567167, 0.03954012,
       0.04518141, 0.19086739, 0.17571033, 0.04424958, 0.11944416])

In [None]:
ctm.get_topic_lists(10)[6]

['de',
 'la',
 'talented',
 'que',
 'wonderful',
 'en',
 'del',
 'el',
 'voices',
 'los']

In [None]:
topic_number

7