# Exp_9:Topic Modeling Using Tensorflow

## Obj: To convert the documents into topics with tensorflow framework.

In [1]:
import tensorflow as tf
import numpy as np

In [2]:
# Sample documents
documents = [
    "machine learning is a subset of artificial intelligence",
    "tensorflow is a popular machine learning framework",
    "topic modeling is an important technique in NLP",
    "deep learning is a subset of machine learning",
    "natural language processing is a part of NLP",
]

documents = [
    "Koneru Lakshmiah University is one of the oldest universities in the india.",
    "indian has a rich history dating back to the 10th century.",
    "Koneru Lakshmiah University is renowned for its contributions to science and engineering.",
    "engineering was founded in 1701 and has a distinguished history.",
    "The history of the vijayawada can be traced back to the 12th century.",
    "The history of the Koneru Lakshmiah University dates back to the 20th century.",
]

In [3]:
len(documents)

5

# Tokenize the documents

In [5]:
from tensorflow.keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer(num_words = 100)
tokenizer.fit_on_texts(documents)
#word_index = tokenizer.word_index
#print(word_index)

# Build a Vocabulary from the tokenized text

In [6]:
vocab = tokenizer.word_index
word_counts = tokenizer.word_counts
#vocab dictionary contains the mapping of words to integer values, 
#word_counts contains the count of each word in your text data.

In [8]:
vocab, word_counts

({'is': 1,
  'learning': 2,
  'a': 3,
  'machine': 4,
  'of': 5,
  'subset': 6,
  'nlp': 7,
  'artificial': 8,
  'intelligence': 9,
  'tensorflow': 10,
  'popular': 11,
  'framework': 12,
  'topic': 13,
  'modeling': 14,
  'an': 15,
  'important': 16,
  'technique': 17,
  'in': 18,
  'deep': 19,
  'natural': 20,
  'language': 21,
  'processing': 22,
  'part': 23},
 OrderedDict([('machine', 3),
              ('learning', 4),
              ('is', 5),
              ('a', 4),
              ('subset', 2),
              ('of', 3),
              ('artificial', 1),
              ('intelligence', 1),
              ('tensorflow', 1),
              ('popular', 1),
              ('framework', 1),
              ('topic', 1),
              ('modeling', 1),
              ('an', 1),
              ('important', 1),
              ('technique', 1),
              ('in', 1),
              ('nlp', 2),
              ('deep', 1),
              ('natural', 1),
              ('language', 1),
              ('pro

# Convert the above teokenized documents into bag of words (BOW) representation

### First convert to sequences of sentences

In [9]:
sequences = tokenizer.texts_to_sequences(documents)

In [10]:
sequences

[[4, 2, 1, 3, 6, 5, 8, 9],
 [10, 1, 3, 11, 4, 2, 12],
 [13, 14, 1, 15, 16, 17, 18, 7],
 [19, 2, 1, 3, 6, 5, 4, 2],
 [20, 21, 22, 1, 3, 23, 5, 7]]

### Then Generate BOW Representation

In [11]:
num_of_words = len(word_counts)
num_of_words

23

##### Initialize a empty matrix to store BOW features

In [12]:
BOW_Features = []
for sequence in sequences:
    BOW_Vector = [0]*num_of_words
    for word_index in sequence:
        BOW_Vector[word_index - 1] += 1
    BOW_Features.append(BOW_Vector)

In [14]:
BOW_Features

[[1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0],
 [1, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0],
 [1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1]]

# We can also use Term Frequency (TF) and Inverse Document Frequency to Compute BOW

### Calculate TF

In [15]:
tf_matrix = np.zeros((len(documents), len(vocab)), dtype=np.float32)
for i, sentence in enumerate(documents):
    words = sentence.lower().split()
    for j, word in enumerate(vocab):
        tf_matrix[i, j] = words.count(word) / len(words)
        print(len(words))
        print(words.count(word))

8
1
8
1
8
1
8
1
8
1
8
1
8
0
8
1
8
1
8
0
8
0
8
0
8
0
8
0
8
0
8
0
8
0
8
0
8
0
8
0
8
0
8
0
8
0
7
1
7
1
7
1
7
1
7
0
7
0
7
0
7
0
7
0
7
1
7
1
7
1
7
0
7
0
7
0
7
0
7
0
7
0
7
0
7
0
7
0
7
0
7
0
8
1
8
0
8
0
8
0
8
0
8
0
8
1
8
0
8
0
8
0
8
0
8
0
8
1
8
1
8
1
8
1
8
1
8
1
8
0
8
0
8
0
8
0
8
0
8
1
8
2
8
1
8
1
8
1
8
1
8
0
8
0
8
0
8
0
8
0
8
0
8
0
8
0
8
0
8
0
8
0
8
0
8
1
8
0
8
0
8
0
8
0
8
1
8
0
8
1
8
0
8
1
8
0
8
1
8
0
8
0
8
0
8
0
8
0
8
0
8
0
8
0
8
0
8
0
8
0
8
0
8
1
8
1
8
1
8
1


In [16]:
tf_matrix

array([[0.125     , 0.125     , 0.125     , 0.125     , 0.125     ,
        0.125     , 0.        , 0.125     , 0.125     , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        ],
       [0.14285715, 0.14285715, 0.14285715, 0.14285715, 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.14285715,
        0.14285715, 0.14285715, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        ],
       [0.125     , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.125     , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.125     , 0.125     , 0.125     ,
        0.125     , 0.125     , 0.125     , 0.        , 0.        ,
        0.        , 0.        , 0.        ],
       [0.125     , 0.25      , 0.125     , 0.125

In [17]:
tf_matrix.shape

(5, 23)

### Calculate IDF

In [19]:
idf_vector = np.zeros(len(vocab), dtype=np.float32)
total_documents = len(documents)
for j, word in enumerate(vocab):
    doc_count = sum(1 for sentence in documents if word in sentence.split())
    idf_vector[j] = np.log(total_documents / (1 + doc_count))

In [20]:
idf_vector # 

array([-0.18232156,  0.22314355,  0.        ,  0.22314355,  0.22314355,
        0.51082563,  1.609438  ,  0.91629076,  0.91629076,  0.91629076,
        0.91629076,  0.91629076,  0.91629076,  0.91629076,  0.91629076,
        0.91629076,  0.91629076,  0.91629076,  0.91629076,  0.91629076,
        0.91629076,  0.91629076,  0.91629076], dtype=float32)

In [21]:
idf_vector.shape

(23,)

In [22]:
# Calculate the TF-IDF matrix
tfidf_matrix = tf_matrix * idf_vector
tfidf_matrix

array([[-0.0227902 ,  0.02789294,  0.        ,  0.02789294,  0.02789294,
         0.0638532 ,  0.        ,  0.11453635,  0.11453635,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ],
       [-0.02604594,  0.03187765,  0.        ,  0.03187765,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.13089868,
         0.13089868,  0.13089868,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ],
       [-0.0227902 ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.20117974,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.11453635,  0.11453635,  0.11453635,
         0.11453635,  0.11453635,  0.11453635,  0.        ,  0.        ,
         0.        ,  0.    

In [23]:
tfidf_matrix.shape

(5, 23)

### TFIDF Using Functions: Bag of Words representation using TF-IDF scores

In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(documents)
tfidf_matrix

<5x22 sparse matrix of type '<class 'numpy.float64'>'
	with 34 stored elements in Compressed Sparse Row format>

In [25]:
tfidf_tensor = tf.constant(tfidf_matrix.toarray(), dtype=tf.float32)
tfidf_tensor

<tf.Tensor: shape=(5, 22), dtype=float32, numpy=
array([[0.        , 0.4865898 , 0.        , 0.        , 0.        ,
        0.        , 0.4865898 , 0.23186265, 0.        , 0.32587487,
        0.32587487, 0.        , 0.        , 0.        , 0.32587487,
        0.        , 0.        , 0.        , 0.3925776 , 0.        ,
        0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.49242058, 0.        ,
        0.        , 0.        , 0.23464105, 0.        , 0.3297798 ,
        0.3297798 , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.49242058, 0.        , 0.        , 0.        ,
        0.49242058, 0.        ],
       [0.3813026 , 0.        , 0.        , 0.        , 0.3813026 ,
        0.3813026 , 0.        , 0.18169272, 0.        , 0.        ,
        0.        , 0.3813026 , 0.        , 0.30763254, 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.3813026 ,
        0.        , 0.3813026 ],
       [0.        , 0.        , 0.46

# Lets get back to Topic Modelling using Linear Discriminant Analysis

In [26]:
# Convert to TensorFlow tensors
BOW_Features = tf.constant(BOW_Features, dtype=tf.float32)
BOW_Features

<tf.Tensor: shape=(5, 23), dtype=float32, numpy=
array([[1., 1., 1., 1., 1., 1., 0., 1., 1., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0.],
       [1., 1., 1., 1., 0., 0., 0., 0., 0., 1., 1., 1., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 1., 1., 1., 1.,
        1., 1., 0., 0., 0., 0., 0.],
       [1., 2., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 1., 0., 0., 0., 0.],
       [1., 0., 1., 0., 1., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 1., 1., 1., 1.]], dtype=float32)>

In [27]:
documents

['machine learning is a subset of artificial intelligence',
 'tensorflow is a popular machine learning framework',
 'topic modeling is an important technique in NLP',
 'deep learning is a subset of machine learning',
 'natural language processing is a part of NLP']

# Topics = 2 -- learning, machine 
# Input (BOW) - Output is probability that document 1 belongs to topic learning (0.8) or machine(0.2)

In [28]:
Topics = 2
epochs = 100
alpha = 1.0 # Document - Topic Distribution Hyperparameter
beta = 1.0 # Topic - Word Distribution Hyperparameter

## Initialize Topic - Word Distribution Randomly 

In [29]:
topic_word_distribution = tf.random.uniform((Topics, num_of_words),0,1)
topic_word_distribution

<tf.Tensor: shape=(2, 23), dtype=float32, numpy=
array([[0.4282782 , 0.14814746, 0.22535539, 0.84781325, 0.20984268,
        0.55467784, 0.05056345, 0.85595405, 0.2665887 , 0.54655814,
        0.05033481, 0.172176  , 0.9405519 , 0.28654146, 0.22927582,
        0.3575008 , 0.5543679 , 0.6904756 , 0.20061374, 0.38163912,
        0.8016926 , 0.30266845, 0.14317667],
       [0.23012912, 0.5880227 , 0.6013558 , 0.38890803, 0.47694016,
        0.71845365, 0.6674757 , 0.43920743, 0.2477498 , 0.82374537,
        0.9609926 , 0.28887117, 0.19562697, 0.7963016 , 0.52349126,
        0.06840408, 0.15503561, 0.454558  , 0.8707397 , 0.12246466,
        0.20856857, 0.04432523, 0.859408  ]], dtype=float32)>

In [30]:
topic_word_distribution /= tf.reduce_sum(topic_word_distribution, axis=1, keepdims=True)
topic_word_distribution

<tf.Tensor: shape=(2, 23), dtype=float32, numpy=
array([[0.04632642, 0.01602496, 0.02437647, 0.0917071 , 0.02269847,
        0.05999894, 0.0054694 , 0.09258769, 0.02883663, 0.05912064,
        0.00544467, 0.0186241 , 0.10173854, 0.0309949 , 0.02480053,
        0.0386705 , 0.05996541, 0.07468805, 0.02170018, 0.04128152,
        0.08671828, 0.03273934, 0.01548728],
       [0.02144571, 0.05479778, 0.05604029, 0.0362423 , 0.04444601,
        0.06695263, 0.062202  , 0.0409297 , 0.02308778, 0.07676475,
        0.08955481, 0.02691988, 0.01823046, 0.07420727, 0.0487841 ,
        0.00637457, 0.01444775, 0.04236022, 0.08114415, 0.01141247,
        0.01943649, 0.00413066, 0.08008815]], dtype=float32)>

tensor = tf.random.uniform((2,3),0,1)
result = tf.reduce_sum(tensor, axis=1, keepdims=True)
result

# Construct a Simple 2 Layer ANN model for training

In [31]:
for _ in range(epochs):
    document_topic_distribution = tf.nn.softmax(alpha+tf.matmul(BOW_Features, topic_word_distribution, transpose_b=True))
    topic_word_distribution = tf.nn.softmax(beta + tf.matmul(document_topic_distribution, BOW_Features,transpose_a=True))

In [32]:
BOW_Features.shape

TensorShape([5, 23])

In [33]:
topic_word_distribution.shape

TensorShape([2, 23])

In [34]:
document_topic_distribution.shape

TensorShape([5, 2])

In [36]:
word2idx = {word: idx for idx, word in enumerate(vocab)}
idx2word = {idx: word for word, idx in word2idx.items()}
idx2word

{0: 'is',
 1: 'learning',
 2: 'a',
 3: 'machine',
 4: 'of',
 5: 'subset',
 6: 'nlp',
 7: 'artificial',
 8: 'intelligence',
 9: 'tensorflow',
 10: 'popular',
 11: 'framework',
 12: 'topic',
 13: 'modeling',
 14: 'an',
 15: 'important',
 16: 'technique',
 17: 'in',
 18: 'deep',
 19: 'natural',
 20: 'language',
 21: 'processing',
 22: 'part'}

In [37]:
# Print the learned topics
topics = {}
for i, topic_dist in enumerate(topic_word_distribution):
    top_words_idx = tf.argsort(topic_dist, direction='DESCENDING')[:5]
    print(top_words_idx)
    top_words = [idx2word[idx.numpy()] for idx in top_words_idx]
    topics[f"Topic {i + 1}"] = top_words

tf.Tensor([0 1 2 3 4], shape=(5,), dtype=int32)
tf.Tensor([0 1 2 3 4], shape=(5,), dtype=int32)


In [38]:
for topic, words in topics.items():
    print(f"{topic}: {', '.join(words)}")

Topic 1: is, learning, a, machine, of
Topic 2: is, learning, a, machine, of


## Remove stop words and then calculate topic vectors

In [39]:
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

# Tokenize the documents and remove stop words
tokenized_docs = []
for doc in documents:
    words = doc.split()
    filtered_words = [word for word in words if word.lower() not in stop_words]
    tokenized_docs.append(filtered_words)

In [40]:
tokenized_docs

[['machine', 'learning', 'subset', 'artificial', 'intelligence'],
 ['tensorflow', 'popular', 'machine', 'learning', 'framework'],
 ['topic', 'modeling', 'important', 'technique', 'NLP'],
 ['deep', 'learning', 'subset', 'machine', 'learning'],
 ['natural', 'language', 'processing', 'part', 'NLP']]

In [41]:
# Create a vocabulary
vocab = set(word for doc in tokenized_docs for word in doc)
word2idx = {word: idx for idx, word in enumerate(vocab)}
idx2word = {idx: word for word, idx in word2idx.items()}
num_words = len(vocab)

In [42]:
# Convert tokenized documents to bag-of-words representation
bow_docs = []
for doc in tokenized_docs:
    bow_doc = [0] * num_words
    for word in doc:
        bow_doc[word2idx[word]] += 1
    bow_docs.append(bow_doc)

# Convert to TensorFlow tensors
bow_docs = tf.constant(bow_docs, dtype=tf.float32)

In [43]:
# Define the LDA model
num_topics = 2
num_iterations = 100
alpha = 1.0  # Hyperparameter for document-topic distribution
beta = 1.0   # Hyperparameter for topic-word distribution

In [44]:
# Initialize topic-word distribution randomly
topic_word_distribution = tf.random.uniform((num_topics, num_words), 0, 1)
topic_word_distribution /= tf.reduce_sum(topic_word_distribution, axis=1, keepdims=True)

In [45]:
for _ in range(num_iterations):
    # E-step: Update document-topic distribution
    doc_topic_distribution = tf.nn.softmax(alpha + tf.matmul(bow_docs, topic_word_distribution, transpose_b=True))

    # M-step: Update topic-word distribution
    topic_word_distribution = tf.nn.softmax(beta + tf.matmul(doc_topic_distribution, bow_docs,transpose_a=True))

In [47]:
# Print the learned topics
topics = {}
for i, topic_dist in enumerate(topic_word_distribution):
    top_words_idx = tf.argsort(topic_dist, direction='DESCENDING')[:5]
    top_words = [idx2word[idx.numpy()] for idx in top_words_idx]  # Convert to NumPy array
    topics[f"Topic {i + 1}"] = top_words

for topic, words in topics.items():
    print(f"{topic}: {', '.join(words)}")

Topic 1: learning, machine, subset, NLP, language
Topic 2: learning, machine, subset, NLP, language


In [48]:
# Calculate topic vectors for each document
document_topic_vectors = tf.matmul(doc_topic_distribution,bow_docs,transpose_a=True)
print("Document-Topic Vectors:")
print(document_topic_vectors.numpy())

Document-Topic Vectors:
[[2.  0.5 1.5 1.  0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 1.  0.5 0.5 0.5 0.5 0.5]
 [2.  0.5 1.5 1.  0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 1.  0.5 0.5 0.5 0.5 0.5]]


# Task_Today: Use TFIDF BOW and compute the topics given in the documents.