In [1]:
from sklearn.datasets import fetch_20newsgroups
categories = ['rec.sport.hockey']
dataset = fetch_20newsgroups(subset='all',shuffle=True, random_state=42, categories=categories)
corpus = dataset.data

In [2]:
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

In [3]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/shiva/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
stopset = set(stopwords.words('english'))
stopset.update(['lt','p','/p','br','amp','quot','field','font','normal','span','0px','rgb','style','51', 
                'spacing','text','helvetica','size','family', 'space', 'arial', 'height', 'indent', 'letter'
                'line','none','sans','serif','transform','line','variant','weight','times', 'new','strong', 'video', 'title'
                'white','word','letter', 'roman','0pt','16','color','12','14','21', 'neue', 'apple', 'class',  ])

In [5]:
vectorizer = TfidfVectorizer(stop_words=stopset,
                                 use_idf=True, ngram_range=(1, 3))
X = vectorizer.fit_transform(corpus)

In [6]:
print X[0]

  (0, 187228)	0.0470271881243
  (0, 148638)	0.0470271881243
  (0, 141244)	0.0470271881243
  (0, 58616)	0.0470271881243
  (0, 130682)	0.0470271881243
  (0, 40738)	0.0470271881243
  (0, 71215)	0.0470271881243
  (0, 208306)	0.0676214466887
  (0, 198314)	0.0676214466887
  (0, 176562)	0.063821082647
  (0, 137113)	0.063821082647
  (0, 86866)	0.063821082647
  (0, 118186)	0.063821082647
  (0, 114530)	0.063821082647
  (0, 66300)	0.063821082647
  (0, 98587)	0.063821082647
  (0, 165592)	0.063821082647
  (0, 196892)	0.063821082647
  (0, 234211)	0.063821082647
  (0, 230198)	0.063821082647
  (0, 131588)	0.063821082647
  (0, 176542)	0.063821082647
  (0, 80573)	0.063821082647
  (0, 106992)	0.063821082647
  (0, 202875)	0.063821082647
  :	:
  (0, 134955)	0.00942925748947
  (0, 209808)	0.0428859555728
  (0, 225379)	0.0148745925378
  (0, 192521)	0.033404370962
  (0, 69674)	0.0329296948005
  (0, 77890)	0.0374514032131
  (0, 160400)	0.00944813534277
  (0, 131241)	0.0850573112225
  (0, 141383)	0.074902806426

In [7]:
lsa = TruncatedSVD(n_components=27, n_iter=100)
lsa.fit(X)

TruncatedSVD(algorithm='randomized', n_components=27, n_iter=100,
       random_state=None, tol=0.0)

In [8]:
lsa.components_[0]

array([  1.80462932e-02,   1.08023932e-03,   3.22069303e-04, ...,
         4.90849632e-05,   4.90849632e-05,   4.90849632e-05])

In [9]:
terms = vectorizer.get_feature_names()
for i, comp in enumerate(lsa.components_): 
    termsInComp = zip (terms,comp)
    sortedTerms =  sorted(termsInComp, key=lambda x: x[1], reverse=True) [:10]
    print "Concept %d:" % i
    for term in sortedTerms:
        print term[0]
    print " "

Concept 0:
edu
ca
game
hockey
team
go
re
would
gld
espn
 
Concept 1:
win
subject re
blues
15
right
could
article
cmu edu
game
18
 
Concept 2:
subject
article
would
ca
like
back
good
could
laurentian
utoronto
 
Concept 3:
hockey
edu
com
espn
year
ahl
see
clarkson
night
boston
 
Concept 4:
edu
cmu
first
go
buffalo
see
andrew
really
cmu edu
right
 
Concept 5:
edu
time
playoffs
win
back
organization
didn
lines
one
sas upenn edu
 
Concept 6:
hockey
good
go
rangers
year
nhl
writes
roger
would
get
 
Concept 7:
com
win
game
good
know
baseball
also
much
pens
00
 
Concept 8:
play
re
espn
com
rangers
game
gm
think
goal
18
 
Concept 9:
year
andrew
player
period
back
teams
cup
re
com
john
 
Concept 10:
edu
writes
one
last
team
like
games
nhl
gm
cmu
 
Concept 11:
game
many
back
see
edu
re
subject re
think
league
lines
 
Concept 12:
game
edu
won
teams
also
pens
subject
st
would
division
 
Concept 13:
game
would
hockey
host
last
like
great
espn
first
detroit
 
Concept 14:
go
would
host
get
cup
see
11
