## Coding Exercise #0713

In [1]:
import numpy as np
import warnings
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition  import LatentDirichletAllocation
warnings.filterwarnings('ignore')

#### 1. Latent Dirichlet Allocation (LDA):

In [2]:
# The data: a toy corpus of seven short English sentences, one document per string.
my_docs = ["The economic slowdown is becoming more severe",
           "The movie was simply awesome",
           "I like cooking my own food",
           "Samsung is announcing a new technology",
           "Machine Learning is an example of awesome technology",
           "All of us were excited at the movie",
           "We have to do more to reverse the economic slowdown"]

In [3]:
# Normalize every document to lowercase.
my_docs = list(map(str.lower, my_docs))

#### 1.1. Create a DTM representation:
CountVectorizer() arguments: <br>
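- *max_features* : maximum number of features (distinct words). <br>
- *min_df* : The minimum document frequency (DF). An integer means an absolute document count; a real number (0~1) means a proportion of documents. <br> 
- *max_df* : The maximum document frequency (DF). An integer means an absolute document count; a real number (0~1) means a proportion of documents. Helps to filter out corpus-specific stop words, i.e. words that appear in too many documents. <br> 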

In [4]:
# Build the document-term matrix (DTM).
# stop_words="english" selects scikit-learn's built-in English stop-word list (the same
# words as ENGLISH_STOP_WORDS). Passing the string instead of the frozenset keeps this cell
# working on recent scikit-learn releases, whose parameter validation only accepts
# 'english', a list, or None for stop_words.
vectorizer = CountVectorizer(max_features=15, min_df=1, max_df=3, stop_words="english")
X = vectorizer.fit_transform(my_docs).toarray()  # dense array: one row per document, one column per feature

In [5]:
# Show the DTM (row = document, column = feature).
X

array([[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0],
       [0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0],
       [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1],
       [0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1],
       [0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0]], dtype=int64)

In [6]:
# Shape of X is (m, n): m = number of documents = 7, n = number of features (distinct words kept).
X.shape

(7, 15)

In [7]:
# View the features (the vocabulary, in DTM column order).
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement. It returns a NumPy array, so convert it
# to a plain list of str to keep the same printed form and list semantics as before.
if hasattr(vectorizer, "get_feature_names_out"):
    features = vectorizer.get_feature_names_out().tolist()
else:  # scikit-learn < 1.0
    features = vectorizer.get_feature_names()
print(features)

['announcing', 'awesome', 'economic', 'example', 'excited', 'food', 'learning', 'movie', 'new', 'reverse', 'samsung', 'severe', 'simply', 'slowdown', 'technology']


#### 1.2. Apply the LDA: 

In [8]:
# Fit an LDA model with a fixed seed and obtain each document's topic distribution.
n_topics = 4
lda = LatentDirichletAllocation(
    n_components=n_topics,
    random_state=123,
)
my_docs_topic = lda.fit_transform(X)

In [9]:
# Document-topic matrix: row = document, column = topic.
my_docs_topic

array([[0.80515032, 0.06273478, 0.06255696, 0.06955794],
       [0.06455228, 0.06274657, 0.81007434, 0.06262681],
       [0.12522103, 0.12571681, 0.62369815, 0.12536402],
       [0.05117543, 0.05023664, 0.84846627, 0.05012166],
       [0.84833626, 0.05018186, 0.05138822, 0.05009367],
       [0.08342836, 0.08363634, 0.74944622, 0.08348908],
       [0.06434067, 0.06264053, 0.06253428, 0.81048452]])

In [10]:
# Sanity check: the topic weights of each document (each row) have to sum to 1.
np.sum(my_docs_topic, axis=1)

array([1., 1., 1., 1., 1., 1., 1.])

#### 1.3. From each topic, extract the top features:

In [11]:
# Topic-feature weights of the fitted model: row = topic, column = feature (word).
topic_composition = lda.components_
np.shape(topic_composition)

(4, 15)

In [12]:
n_top = 3
# Build a list of lists: for every topic, the n_top features with the largest weights.
# np.argsort() returns indices in ascending order, so the weights are negated to rank
# them in descending order. A comprehension also leaves topic_matrix defined (as an
# empty list) when n_topics is 0, unlike an initialise-on-first-iteration loop.
topic_matrix = [
    [features[idx] for idx in np.argsort(-topic_composition[i, :])[:n_top]]
    for i in range(n_topics)
]

In [13]:
# Show the top features for each topic (one inner list per topic, in topic order).
topic_matrix

[['awesome', 'technology', 'example'],
 ['food', 'awesome', 'excited'],
 ['movie', 'announcing', 'new'],
 ['economic', 'slowdown', 'reverse']]

In [14]:
# In view of the top features, we can name the topics.
# The names are assigned by hand; position k in this list is the name given to topic k.
topic_names = ['Technology', 'Cuisine', 'Movie','Economy']

#### 1.4. Label each document with the most predominant topic:

In [15]:
# The LDA output gives a topic distribution per document; the column with the
# largest weight in a row is that document's most probable topic.
n_docs = len(my_docs)
dominant_topics = np.argmax(my_docs_topic, axis=1)
for doc_idx in range(n_docs):
    print(f"Document {doc_idx + 1} = {topic_names[dominant_topics[doc_idx]]}")

Document 1 = Technology
Document 2 = Movie
Document 3 = Movie
Document 4 = Movie
Document 5 = Technology
Document 6 = Movie
Document 7 = Economy


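**NOTE**: We can notice some inaccuracies. For example, Document 1 (about the economic slowdown) is labeled *Technology*, while Document 3 (about cooking) and Document 4 (about Samsung's new technology) are both labeled *Movie*.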