## Coding Exercise #0512

In [None]:
import numpy as np
import warnings
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition  import TruncatedSVD
warnings.filterwarnings('ignore')

### 1. Latent Semantic Analysis (LSA):

In [None]:
# Toy corpus: seven short sentences, each treated as one document.
my_docs = [
    "The economic slowdown is becoming more severe",
    "The movie was simply awesome",
    "I like cooking my own food",
    "Samsung is announcing a new technology",
    "Machine Learning is an example of awesome technology",
    "All of us were excited at the movie",
    "We have to do more to reverse the economic slowdown",
]

#### 1.1. Create a TF IDF representation:
TfidfVectorizer() arguments: <br>
- *max_features* : maximum number of features (distinct words). <br>
- *min_df* : The minimum document frequency (DF). An integer is a document count; a real number (0~1) is a proportion of the documents. <br> 
- *max_df* : The maximum document frequency (DF). An integer is a document count; a real number (0~1) is a proportion of the documents. Helps to filter out the stop words. <br> 

In [None]:
# Normalize every document to lower case.
my_docs = list(map(str.lower, my_docs))

In [None]:
# Custom stop words, used on top of NLTK's English stop-word list.
my_stop_words = ["us", "like"]

In [None]:
# TF-IDF model: keep at most 15 features, drop the NLTK English stop words
# plus the custom ones, and ignore terms found in more than 3 documents.
vectorizer = TfidfVectorizer(
    max_features=15,
    min_df=1,
    max_df=3,
    stop_words=stopwords.words('english') + my_stop_words,
)
# Fit on the corpus and convert the sparse result to a dense array.
X = vectorizer.fit_transform(my_docs).toarray()

In [None]:
# Shape of X is (m, n): m = number of documents (7 here), n = number of features.
X.shape

In [None]:
# View the features (the vocabulary terms, in column order of X).
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older versions.
if hasattr(vectorizer, "get_feature_names_out"):
    # tolist() keeps `features` a plain list of str, as the old API returned.
    features = vectorizer.get_feature_names_out().tolist()
else:
    features = vectorizer.get_feature_names()
print(features)

#### 1.2. Apply the truncated SVD:

In [None]:
n_topics = 4
# Seed the randomized SVD solver so the decomposition is reproducible from run
# to run; the topic names assigned by hand later rely on a stable topic order.
svd = TruncatedSVD(n_components=n_topics, n_iter=100, random_state=0)
svd.fit(X)

In [None]:
# V^t from the fitted SVD (one row per topic) and its element-wise magnitudes.
vt = svd.components_
vtabs = np.absolute(vt)

In [None]:
# Check the shape of V^t (one row per topic, one column per feature).
vt.shape

#### 1.3. From each topic, extract the top features:

In [None]:
n_top = 3
# For each topic (row of |V^t|), rank the feature indices by descending
# absolute weight with argsort() and keep the names of the n_top strongest.
# Building the list of lists in one expression means topic_matrix is always
# (re)defined, even if n_topics is 0, and only the top indices are mapped.
topic_matrix = [
    [features[idx] for idx in np.argsort(-vtabs[i, :])[:n_top]]
    for i in range(n_topics)
]

In [None]:
# Show the top features for each topic (one inner list per topic).
topic_matrix

In [None]:
# Human-readable topic labels, chosen by inspecting the top features above;
# entry k names topic k.
topic_names = [
    'Economy',
    'Movie',
    'Technology',
    'Cuisine',
]

#### 1.4. Label each document with the most predominant topic:

In [None]:
n_docs = len(my_docs)
for doc_idx, doc_text in enumerate(my_docs):
    tokens = nltk.word_tokenize(doc_text)
    # Start from topic 0 with a score of 0; a topic only takes over when it
    # scores strictly higher, so ties keep the earlier topic.
    best_score, best_topic = 0, 0
    for topic_idx in range(n_topics):
        # Score = number of tokens in the document that are top features of this topic.
        hits = sum(1 for token in tokens if token in topic_matrix[topic_idx])
        if hits > best_score:
            best_score, best_topic = hits, topic_idx
    print(f"Document {doc_idx + 1} = {topic_names[best_topic]}")

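**NOTE**: Some of the labels are inaccurate. For example, a document that contains none of the top features of any topic scores 0 everywhere and falls back to the first topic.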