# Latent Semantic Analysis

# 1.1

In [25]:
# Importing the Libraries

import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

In [2]:
# Sample Data
dataset = ["The amount of polution is increasing day by day",
           "The concert was just great",
           "I love to see Gordon Ramsay cook",
           "Google is introducing a new technology",
           "AI Robots are examples of great technology present today",
           "All of us were singing in the concert",
           "We have launch campaigns to stop pollution and global warming"]


In [3]:
dataset = [line.lower() for line in dataset]

In [4]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(dataset)

In [5]:
# TF - IDF model form
print(X[0])

  (0, 34)	0.22786438777524437
  (0, 2)	0.3211483974289088
  (0, 24)	0.22786438777524437
  (0, 26)	0.3211483974289088
  (0, 19)	0.2665807498646048
  (0, 17)	0.3211483974289088
  (0, 9)	0.6422967948578177
  (0, 5)	0.3211483974289088


In [7]:
X.shape

(7, 42)

In [10]:
# Decomposing the dataset into Singular values

lsa = TruncatedSVD(n_components = 4, 
                   n_iter = 100)
lsa.fit(X)

TruncatedSVD(algorithm='randomized', n_components=4, n_iter=100,
       random_state=None, tol=0.0)

In [12]:
#returning the first row of V.T matrix
row1 = lsa.components_[0]
row1

array([ 1.24191973e-01,  1.78240252e-01,  1.14460798e-01, -1.19745007e-16,
        1.24191973e-01,  1.14460798e-01, -1.19745007e-16,  3.44988739e-01,
       -1.62865561e-16,  2.28921595e-01,  1.24191973e-01, -1.19745007e-16,
        9.72770950e-02, -1.62865561e-16,  3.00124026e-01, -1.19745007e-16,
        1.78240252e-01,  1.14460798e-01,  9.72770950e-02,  1.75760635e-01,
        2.37365829e-01, -1.19745007e-16, -1.62865561e-16,  9.72770950e-02,
        2.95798061e-01, -1.19745007e-16,  1.14460798e-01,  1.24191973e-01,
       -1.62865561e-16,  1.24191973e-01, -1.62865561e-16,  1.78240252e-01,
       -1.19745007e-16,  1.83838346e-01,  3.76098295e-01, -2.34591042e-16,
        1.24191973e-01,  1.78240252e-01, -1.19745007e-16,  2.37365829e-01,
       -1.19745007e-16,  1.78240252e-01])

In [21]:
terms = vectorizer.get_feature_names()
for i,comp in enumerate(lsa.components_):
    componentTerms = zip(terms, comp)
    sortedTerms = sorted(componentTerms,
                         key=lambda x:x[1],
                         reverse=True)
    sortedTerms = sortedTerms[:10]
    print('\nConcept',i,':')
    for term in sortedTerms:
        print(term)


Concept 0 :
('the', 0.376098295292637)
('concert', 0.3449887392330656)
('great', 0.30012402589487386)
('of', 0.29579806095266675)
('just', 0.23736582929791195)
('was', 0.23736582929791195)
('day', 0.22892159541504475)
('technology', 0.18383834567413462)
('all', 0.17824025175628946)
('in', 0.17824025175628946)

Concept 1 :
('to', 0.4157884439670068)
('cook', 0.28359165793510677)
('gordon', 0.28359165793510677)
('love', 0.28359165793510677)
('ramsay', 0.28359165793510677)
('see', 0.28359165793510677)
('and', 0.2173064471129251)
('campaigns', 0.2173064471129251)
('global', 0.2173064471129251)
('have', 0.2173064471129251)

Concept 2 :
('technology', 0.37791806767143865)
('is', 0.34196143806319995)
('google', 0.3413969441909749)
('introducing', 0.3413969441909749)
('new', 0.3413969441909749)
('day', 0.14112432680995018)
('are', 0.11387892195372844)
('examples', 0.11387892195372844)
('present', 0.11387892195372844)
('robots', 0.11387892195372844)

Concept 3 :
('day', 0.4654267679041113)
('a

# 1.2

In [20]:
concept_words = {}

In [22]:
terms = vectorizer.get_feature_names()

for i,comp in enumerate(lsa.components_):
    
    componentTerms = zip(terms, comp)
    
    sortedTerms = sorted(componentTerms,
                         key=lambda x:x[1],
                         reverse=True)
    
    sortedTerms = sortedTerms[:10]
    concept_words["Concept"+str(i)] = sortedTerms

In [23]:
concept_words

{'Concept0': [('the', 0.376098295292637),
  ('concert', 0.3449887392330656),
  ('great', 0.30012402589487386),
  ('of', 0.29579806095266675),
  ('just', 0.23736582929791195),
  ('was', 0.23736582929791195),
  ('day', 0.22892159541504475),
  ('technology', 0.18383834567413462),
  ('all', 0.17824025175628946),
  ('in', 0.17824025175628946)],
 'Concept1': [('to', 0.4157884439670068),
  ('cook', 0.28359165793510677),
  ('gordon', 0.28359165793510677),
  ('love', 0.28359165793510677),
  ('ramsay', 0.28359165793510677),
  ('see', 0.28359165793510677),
  ('and', 0.2173064471129251),
  ('campaigns', 0.2173064471129251),
  ('global', 0.2173064471129251),
  ('have', 0.2173064471129251)],
 'Concept2': [('technology', 0.37791806767143865),
  ('is', 0.34196143806319995),
  ('google', 0.3413969441909749),
  ('introducing', 0.3413969441909749),
  ('new', 0.3413969441909749),
  ('day', 0.14112432680995018),
  ('are', 0.11387892195372844),
  ('examples', 0.11387892195372844),
  ('present', 0.1138789219

In [26]:
# Sentence Concepts
for key in concept_words.keys():
    sentence_scores = []
    for sentence in dataset:
        words = nltk.word_tokenize(sentence)
        score = 0
        for word in words:
            for word_with_score in concept_words[key]:
                if word == word_with_score[0]:
                    score += word_with_score[1]
        sentence_scores.append(score)
    print("\n"+key+":")
    for sentence_score in sentence_scores:
        print(sentence_score)


Concept0:
1.1297395470753933
1.4959427190164003
0
0.18383834567413462
0.7797604325216752
1.3733655989909481
0

Concept1:
0
0
1.8337467336425406
0
0
0
1.2850142324187073

Concept2:
0.6242100916831004
0
0
1.7440703383075633
0.8334337554863523
0
0

Concept3:
2.201593755447888
0.12724213180694488
0
0.21264455202449803
0
0.29658207438874296
0
