### LATENT SEMANTIC ANALYSIS

In [1]:
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.datasets import fetch_20newsgroups

In [2]:
categories = ['rec.sport.baseball']
dataset = fetch_20newsgroups(subset='all',shuffle=True, random_state=42, categories=categories)
corpus = dataset.data

In [3]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\somya\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
corpus = [x.lower() for x in corpus]

In [5]:
corpus[0]

u"from: writingctr@leo.bsuvc.bsu.edu\nsubject: re: cub fever.\norganization: ball state university, muncie, in - univ. computing svc's\nlines: 21\n\n\nin article <kingoz.735285670@camelot>, kingoz@camelot.bradley.edu (orin roth) writes:\n> \n>    cub fever is hitting me again. i'm beginning to think they have a \n>    chance this year. (what the heck am i thinking?)\n>    sorry. just a moment of incompetence.\n>    i'll be ok. really. \n>    orin.\n>    bradley u.\n> \n> --\n> i'm really a jester in disguise!                                   \ni hear ya!  then again, we must remember that we are indeed cub fans, and\nthat the cubs will eventually blow it.  after all, the cubs are the easiest\nteam in the national league to root for.  no pressure.  you know they will\nlose eventually.  oh well, i suppose we must have faith.  after all, they\ndo look pretty good, and they don't even have sandberg back yet.  \n\ncubs in '93!!!!!\n\ncha\n"

In [63]:
stopset = set(stopwords.words('english'))

stopset.update(['\n','<','>','\t','nntp','much','might','howard johnson','mike fester', 'robert holt',
                'joe chiple', 'erik roush', 'mary cole', '~q', 'chuck clein', 'vaughn', 'chuq von rospach',
                'roger maynard', 'since', 'imho', 'looking', 'neal traven', 'joe carter', 
               'roger lusting', 'doug ritter', 'keith keller', 'danny smith', 'michael lurie', 'bruce kleinman',
               'richard', 'vaughn', 'jim clouse', 'orin roth', 'alan sepinwall', 'mike jones', 'charles m kozierok',
               'ted frank', 'mark davis', 'greg spira', 'david rex wood', 'hody hagins', 'net', 'little',
               'scott barman', 'doug roberts', 'help', 'douglas fowler', 'gary wieman', 'imho', 'right', 'enough', 
               'anyone', 'info', 'without', 'also', 'still', 'suppose', 'etc', 'believe', 'fact','thanks', 'made', 
               'called', 'really', 'seen', 'even', 'seems', 'maybe','better', 'think', 'like', '00 00', 
               'cs', 'com', 'us', 'edu', 'know', 'would', 'go', '00 00 00', 'anything', 'take', 'al', 'ca',
               '00', '000', '000 000', '000 000 000', 'may', 'way', 'get',])

In [64]:
vectorizer = TfidfVectorizer(stop_words=stopset,
                                 use_idf=True, ngram_range=(1, 3))
X = vectorizer.fit_transform(corpus)

In [65]:
X[0]

<1x179540 sparse matrix of type '<type 'numpy.float64'>'
	with 210 stored elements in Compressed Sparse Row format>

In [66]:
print X[0]

  (0, 49995)	0.0765116734718
  (0, 178804)	0.0765116734718
  (0, 28722)	0.0765116734718
  (0, 138738)	0.0765116734718
  (0, 73877)	0.0765116734718
  (0, 126521)	0.0765116734718
  (0, 97558)	0.0765116734718
  (0, 63038)	0.0765116734718
  (0, 107423)	0.0765116734718
  (0, 171525)	0.0765116734718
  (0, 112812)	0.0765116734718
  (0, 60905)	0.0765116734718
  (0, 97989)	0.0765116734718
  (0, 126416)	0.0765116734718
  (0, 135886)	0.0765116734718
  (0, 93141)	0.0765116734718
  (0, 108065)	0.0765116734718
  (0, 157213)	0.0765116734718
  (0, 57960)	0.0765116734718
  (0, 50035)	0.0765116734718
  (0, 34823)	0.0765116734718
  (0, 60891)	0.0765116734718
  (0, 50041)	0.0765116734718
  (0, 63507)	0.0765116734718
  (0, 49936)	0.0765116734718
  :	:
  (0, 175288)	0.01619480883
  (0, 136133)	0.0574966657126
  (0, 115506)	0.114993331425
  (0, 36638)	0.103115511068
  (0, 39612)	0.108887292156
  (0, 17089)	0.0765116734718
  (0, 90385)	0.114993331425
  (0, 25517)	0.0167505629187
  (0, 7712)	0.0390882503567
  

In [67]:
X.shape

(994, 179540)

In [68]:
lsa = TruncatedSVD(n_components=27, n_iter=100)
lsa.fit(X)

TruncatedSVD(algorithm='randomized', n_components=27, n_iter=100,
       random_state=None, tol=0.0)

In [69]:
lsa.components_[0]

array([ 0.00061338,  0.00061338,  0.00061338, ...,  0.00108248,
        0.00108248,  0.00108248])

In [70]:
import sys
print (sys.version)

2.7.12 |Anaconda 4.1.1 (64-bit)| (default, Jun 29 2016, 11:07:13) [MSC v.1500 64 bit (AMD64)]


In [71]:
terms = vectorizer.get_feature_names()
for i, comp in enumerate(lsa.components_): 
    termsInComp = zip (terms,comp)
    sortedTerms =  sorted(termsInComp, key=lambda x: x[1], reverse=True) [:10]
    print "Concept %d:" % i
    for term in sortedTerms:
        print term[0]
    print " "

Concept 0:
year
team
game
writes
article
baseball
players
games
one
good
 
Concept 1:
jewish
000th
aix
season
red
article
jewish baseball
braves
jewish baseball players
university
 
Concept 2:
clutch
hit
ball
come
posting host
netcom
fans
runs
performance
hitting
 
Concept 3:
team
player
morris
article
posting
win
say
posting host
netcom
dept
 
Concept 4:
good
001
bonds
writes
article
david
hall
games
fans
clutch
 
Concept 5:
000th
probably
gant
reply
great
organization
hall
game
young
university
 
Concept 6:
win
001
league
year
games
player
ever
make
good
15
 
Concept 7:
game
host
one
hitting
back
lines
games
university
0000ahc udcps3 cps
distribution
 
Concept 8:
first
000th career
hit
pitching
bad
posting
end
game
people
one
 
Concept 9:
article
game
morris
000th
years
make
team
last
games
teams
 
Concept 10:
posting host
years
player
university
time
players
year
uiuc
ball
home
 
Concept 11:
games
one
baseball
make
pitching
university
best
scott
000th career
back
 
Concept 12:
games

In [34]:
lsa.components_

array([[  1.95890025e-02,   6.19052039e-03,   1.02201609e-03, ...,
          1.05325239e-03,   1.05325239e-03,   1.05325239e-03],
       [ -1.46316224e-02,  -8.31854042e-04,  -6.49764966e-02, ...,
         -5.06942833e-04,  -5.06942833e-04,  -5.06942833e-04],
       [ -1.38644634e-01,  -6.95182950e-02,  -8.04220795e-02, ...,
         -7.00140727e-05,  -7.00140727e-05,  -7.00140727e-05],
       ..., 
       [  5.71951469e-02,   3.23725098e-02,  -4.70464415e-02, ...,
          8.00447200e-04,   8.00447200e-04,   8.00447200e-04],
       [ -6.81026606e-02,  -2.29743945e-02,   1.48614989e-01, ...,
         -4.60159379e-04,  -4.60159379e-04,  -4.60159379e-04],
       [ -6.67407855e-02,  -5.46814025e-02,   1.54714905e-01, ...,
         -1.48591247e-03,  -1.48591247e-03,  -1.48591247e-03]])

Analzing the above data we can interpret that they are taking about baseball game.