In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
import nltk

In [2]:
dataset = ["Thank you all so very much. Thank you to the Academy.", 
"Thank you to all of you in this room. I have to congratulate the other incredible nominees this year.", 
"The Revenant was the product of the tireless efforts of an unbelievable cast and crew.", 
"First off, to my brother in this endeavor, Mr. Tom Hardy. Tom, your talent on screen can only be surpassed by your friendship off screen … " ,
"thank you for creating a transcendent cinematic experience. Thank you to everybody at Fox and New Regency … my entire team. "
]

In [3]:
dataset = [line.lower() for line in dataset]

In [4]:
dataset

['thank you all so very much. thank you to the academy.',
 'thank you to all of you in this room. i have to congratulate the other incredible nominees this year.',
 'the revenant was the product of the tireless efforts of an unbelievable cast and crew.',
 'first off, to my brother in this endeavor, mr. tom hardy. tom, your talent on screen can only be surpassed by your friendship off screen … ',
 'thank you for creating a transcendent cinematic experience. thank you to everybody at fox and new regency … my entire team. ']

In [5]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(dataset)

In [6]:
print(X)

  (0, 0)	0.333241902679636
  (0, 47)	0.22317598600980787
  (0, 50)	0.18774267514425896
  (0, 28)	0.333241902679636
  (0, 54)	0.333241902679636
  (0, 42)	0.333241902679636
  (0, 1)	0.26885746882028694
  (0, 57)	0.44635197201961574
  (0, 46)	0.44635197201961574
  (1, 56)	0.2538603059586142
  (1, 31)	0.2538603059586142
  (1, 26)	0.2538603059586142
  (1, 36)	0.2538603059586142
  (1, 11)	0.2538603059586142
  (1, 24)	0.2538603059586142
  (1, 40)	0.2538603059586142
  (1, 48)	0.4096257928258878
  (1, 25)	0.2048128964129439
  (1, 32)	0.2048128964129439
  (1, 47)	0.1700132055287517
  (1, 50)	0.28604093645107354
  (1, 1)	0.2048128964129439
  (1, 57)	0.3400264110575034
  (1, 46)	0.1700132055287517
  (2, 13)	0.24775556199678603
  :	:
  (3, 15)	0.17882779871384327
  (3, 6)	0.17882779871384327
  (3, 29)	0.14427714201094594
  (3, 33)	0.35765559742768654
  (3, 19)	0.17882779871384327
  (3, 48)	0.14427714201094594
  (3, 25)	0.14427714201094594
  (3, 50)	0.10074846245543193
  (4, 45)	0.24106996728750385


In [7]:
lsa = TruncatedSVD(n_components= 4, n_iter=100)

In [8]:
lsa.fit(X)

TruncatedSVD(algorithm='randomized', n_components=4, n_iter=100,
             random_state=None, tol=0.0)

In [10]:
row1 = lsa.components_[0]
print(row1)

[0.15308938 0.21406525 0.05157419 0.11353076 0.08914427 0.02036598
 0.02036598 0.02036598 0.02036598 0.05157419 0.08914427 0.11223898
 0.08914427 0.05157419 0.05157419 0.02036598 0.08914427 0.08914427
 0.08914427 0.02036598 0.08914427 0.08914427 0.02036598 0.02036598
 0.11223898 0.10698484 0.11223898 0.02036598 0.15308938 0.08835218
 0.08914427 0.11223898 0.17377315 0.04073196 0.02036598 0.02036598
 0.11223898 0.05157419 0.08914427 0.05157419 0.11223898 0.04073196
 0.15308938 0.02036598 0.02036598 0.08914427 0.39962114 0.28131299
 0.19753854 0.05157419 0.27441101 0.04073196 0.08914427 0.05157419
 0.15308938 0.05157419 0.11223898 0.47478889 0.04073196]


In [11]:
terms = vectorizer.get_feature_names()

In [14]:
concept_words = {}

In [15]:
for i, comp in enumerate(lsa.components_):
  componentTerms = zip(terms, comp)
  sortedTerms = sorted(componentTerms, key=lambda x:x[1], reverse=True)
  sortedTerms = sortedTerms[:10]
  concept_words["concept"+str(i)] = sortedTerms

In [16]:
concept_words

{'concept0': [('you', 0.4747888904053926),
  ('thank', 0.39962114028483614),
  ('the', 0.2813129900159892),
  ('to', 0.2744110112374121),
  ('all', 0.2140652459428236),
  ('this', 0.1975385436203182),
  ('of', 0.17377315453996386),
  ('academy', 0.15308937901854794),
  ('much', 0.1530893790185479),
  ('so', 0.1530893790185479)],
 'concept1': [('off', 0.3266335808913956),
  ('screen', 0.3266335808913956),
  ('tom', 0.3266335808913956),
  ('your', 0.3266335808913956),
  ('be', 0.1633167904456978),
  ('brother', 0.1633167904456978),
  ('by', 0.1633167904456978),
  ('can', 0.1633167904456978),
  ('endeavor', 0.1633167904456978),
  ('first', 0.1633167904456978)],
 'concept2': [('the', 0.3809344472531115),
  ('of', 0.3528915105932839),
  ('cast', 0.19986786970925616),
  ('crew', 0.19986786970925616),
  ('efforts', 0.19986786970925616),
  ('product', 0.19986786970925616),
  ('revenant', 0.19986786970925616),
  ('tireless', 0.19986786970925616),
  ('unbelievable', 0.19986786970925616),
  ('was

In [19]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [20]:
for key in concept_words.keys():
  sentence_scores = []
  for sentence in dataset:
    words = nltk.word_tokenize(sentence)
    score = 0
    for word in words:
      for word_with_score in concept_words[key]:
        if word == word_with_score[0]:
          score += word_with_score[1]
    sentence_scores.append(score)
  print("\n"+key+":")
  for sentence_scores in sentence_scores:
    print(sentence_scores)


concept0:
2.9778774456323265
2.962249421309859
1.1914852791278954
0.47194955485773027
2.0232310726178695

concept1:
0
0
0
3.5929693898053516
0

concept2:
0.3809344472531115
0.7338259578463954
3.4475293206199504
0
0

concept3:
0
0
0.24071103310233782
0
1.9616988826787807
