In [1]:
from sklearn.datasets import fetch_20newsgroups
categories = ['rec.sport.hockey']
dataset = fetch_20newsgroups(subset='all',shuffle=True, random_state=42, categories=categories)
corpus = dataset.data

In [2]:
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

In [3]:
#run this only once
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\CG\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [5]:
corpus = [x.lower() for x in corpus]

In [6]:
stopset = set(stopwords.words('english'))
stopset.update(['from', '@', 'edu', 'com','<', '>', '|', '^', '/', '*', 'Subset', 'Re', 'organization', 'Lines', 'distribution', 
               'html', 'head', 'pre', 'html', 'body', ':'])

In [7]:
vectorizer = TfidfVectorizer(stop_words=stopset,
                                 use_idf=True, ngram_range=(1, 3))
X = vectorizer.fit_transform(corpus)

In [8]:
X[0]

<1x242538 sparse matrix of type '<class 'numpy.float64'>'
	with 283 stored elements in Compressed Sparse Row format>

In [45]:
#After
print(X[0])

  (0, 52243)	0.074178608241
  (0, 188775)	0.074178608241
  (0, 29835)	0.074178608241
  (0, 145472)	0.074178608241
  (0, 63515)	0.074178608241
  (0, 77346)	0.074178608241
  (0, 132621)	0.074178608241
  (0, 102781)	0.074178608241
  (0, 65903)	0.074178608241
  (0, 114041)	0.074178608241
  (0, 162532)	0.074178608241
  (0, 180691)	0.074178608241
  (0, 119514)	0.074178608241
  (0, 63653)	0.074178608241
  (0, 103277)	0.074178608241
  (0, 94983)	0.074178608241
  (0, 132536)	0.074178608241
  (0, 142607)	0.074178608241
  (0, 97610)	0.074178608241
  (0, 114702)	0.074178608241
  (0, 164951)	0.074178608241
  (0, 60136)	0.074178608241
  (0, 52283)	0.074178608241
  (0, 36694)	0.074178608241
  (0, 63639)	0.074178608241
  :	:
  (0, 83979)	0.034564931111
  (0, 185290)	0.015700981631
  (0, 142856)	0.055743423814
  (0, 121663)	0.111486847628
  (0, 38485)	0.0999712168349
  (0, 41739)	0.105566999397
  (0, 17339)	0.074178608241
  (0, 94369)	0.111486847628
  (0, 26629)	0.0162397891482
  (0, 7940)	0.0378963350

In [9]:
X.shape

(999, 242538)

In [10]:
lsa = TruncatedSVD(n_components=27, n_iter=100)
lsa.fit(X)

TruncatedSVD(algorithm='randomized', n_components=27, n_iter=100,
       random_state=None, tol=0.0)

In [11]:
#This is the first row for V
lsa.components_[0]

array([  2.09345024e-02,   1.19274379e-03,   3.60917570e-04, ...,
         5.55831391e-05,   5.55831391e-05,   5.55831391e-05])

In [12]:
import sys
print (sys.version)

3.5.2 |Anaconda 4.2.0 (64-bit)| (default, Jul  5 2016, 11:41:13) [MSC v.1900 64 bit (AMD64)]


In [13]:
terms = vectorizer.get_feature_names()
for i, comp in enumerate(lsa.components_): 
    termsInComp = zip (terms,comp)
    sortedTerms =  sorted(termsInComp, key=lambda x: x[1], reverse=True) [:10]
    print("Concept %d:" % i )
    for term in sortedTerms:
        print(term[0])
    print (" ")

Concept 0:
ca
game
hockey
team
go
would
writes
play
espn
games
 
Concept 1:
gld
10
cc columbia
cunixb
cunixb cc
cunixb cc columbia
gary dare
gary
dare
espn
 
Concept 2:
one
10
11
game
12
cup
hockey
series
season
00
 
Concept 3:
article
10
would
bos
time
cs laurentian
cs laurentian ca
ramsey cs laurentian
cs
rangers
 
Concept 4:
subject
division
teams
00 00 pm
ahl
st
since
hockey
would
gm
 
Concept 5:
go
buffalo
game
still
one
mike
00 00 01
second
bruins
cs
 
Concept 6:
year
teams
first
university
00 00 monday
way
playing
00 000
gld
coach
 
Concept 7:
game
one
games
buffalo
season
17
play
cup
playoff
caps
 
Concept 8:
team
nhl
league
even
teams
players
said
pick
really
think
 
Concept 9:
think
games
article
like
players
league
detroit
rangers
time
hockey
 
Concept 10:
00 00 00
posting
ca
wings
00
next
leafs
15
player
time
 
Concept 11:
year
hockey
see
play
one
posting
world
game
even
games
 
Concept 12:
hockey
go
game
win
made
way
say
20
00 000
19
 
Concept 13:
one
year
lines
leafs
vs
c

In [14]:
lsa.components_

array([[  2.09345024e-02,   1.19274379e-03,   3.60917570e-04, ...,
          5.55831391e-05,   5.55831391e-05,   5.55831391e-05],
       [  2.03672573e-02,   2.03124752e-02,   3.63635638e-02, ...,
         -4.22039299e-05,  -4.22039299e-05,  -4.22039299e-05],
       [  8.71522764e-02,   3.52282071e-02,   4.65344781e-02, ...,
         -1.02506252e-04,  -1.02506252e-04,  -1.02506252e-04],
       ..., 
       [  1.45382209e-01,   5.82784346e-03,   1.20656246e-01, ...,
          1.71780159e-05,   1.71780159e-05,   1.71780159e-05],
       [  2.06060936e-01,   3.16964314e-03,   8.48939008e-02, ...,
          4.96889746e-05,   4.96889746e-05,   4.96889746e-05],
       [ -6.55607319e-02,   2.76973309e-02,  -1.66538013e-01, ...,
         -8.66973436e-05,  -8.66973436e-05,  -8.66973436e-05]])