In [6]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from sklearn.datasets import fetch_20newsgroups

In [18]:
# Load the 20 Newsgroups posts for two categories (space vs. baseball).
# subset='all' requests the train and test splits together, and `remove`
# strips headers, footers and quoted replies from every post.
dataset = fetch_20newsgroups(
    subset='all',
    categories=['sci.space', 'rec.sport.baseball'],
    remove=('headers', 'footers', 'quotes'),
)

In [22]:
# The corpus to vectorize: the `data` attribute of the fetched dataset.
doc = dataset.data

In [26]:
# TF-IDF features for the corpus:
#   max_df=0.5        -> ignore terms present in more than half of the documents
#   max_features=1000 -> cap the vocabulary at 1000 terms
#   stop_words        -> drop the built-in English stop-word list
vectorizer = TfidfVectorizer(
    max_df=0.5,
    max_features=1000,
    stop_words='english',
)
# Learn the vocabulary and build the document-term matrix in one pass.
x = vectorizer.fit_transform(doc)

In [32]:
# Latent semantic analysis: truncated SVD of the TF-IDF matrix down to
# 100 components. random_state is pinned so re-runs give the same result.
lsa = TruncatedSVD(
    n_components=100,
    random_state=42,
)
# Fit the decomposition and project every document into the reduced space.
x_lsa = lsa.fit_transform(x)

In [34]:
# Rescale each document vector (row) to unit length using Normalizer's
# default L2 norm.
# The default copy=True is used on purpose: with copy=False the rows of
# x_lsa are rescaled in place, so x_lsa would silently stop holding the raw
# SVD projection and would alias x_lsa_normalized.
x_lsa_normalized = Normalizer().fit_transform(x_lsa)

In [46]:
# Vocabulary learned by the vectorizer; indexed below with the column
# indices of the LSA components to turn them back into words.
terms = vectorizer.get_feature_names_out()

In [48]:
# Print the ten highest-weighted terms of each of the first five components.
# Terms are ranked by signed weight (largest first), so terms with strongly
# negative weights are not listed.
for topic_idx, weights in enumerate(lsa.components_[:5]):
    descending = np.argsort(weights)[::-1]
    top_terms = terms[descending[:10]]
    print(f'Topic{topic_idx}:')
    print(','.join(top_terms))

Topic0:
space,year,don,like,think,just,time,game,know,good
Topic1:
space,nasa,shuttle,launch,orbit,program,moon,earth,mission,satellite
Topic2:
space,team,game,games,runs,shuttle,year,win,nasa,pitching
Topic3:
thanks,list,baseball,edu,mail,mailing,space,know,games,just
Topic4:
year,space,good,think,better,years,league,players,career,average


In [50]:
# Fraction of the TF-IDF matrix's variance captured by all retained
# components together (sum of the per-component ratios).
explained_variance = lsa.explained_variance_ratio_.sum()
# Report the component count from the estimator itself so the message cannot
# drift out of sync if n_components is changed where the model is built.
print(
    f"\nTotal explained variance by the first {lsa.n_components} components: "
    f"{explained_variance:.2f}"
)


Total explained variance by the first 100 components: 0.36


In [54]:
# Project the documents onto the components that `lsa` has already learned.
# Calling fit_transform here would repeat the whole SVD and overwrite the
# fitted state of an estimator whose components_ were already inspected;
# transform reuses the existing fit.
# NOTE(review): on older scikit-learn releases transform and fit_transform
# may differ by the SVD approximation error - confirm against the installed
# version if exact equality with the earlier projection matters.
lsa_matrix = lsa.transform(x)

In [56]:
# Show the projected document matrix as the cell output; NumPy abbreviates
# the repr of a large array with ellipses rather than printing every entry.
lsa_matrix

array([[ 0.37030025, -0.16242244, -0.08195824, ..., -0.02524   ,
        -0.00638388, -0.03246328],
       [ 0.09072127, -0.03903872, -0.02876184, ...,  0.0634347 ,
        -0.0728662 ,  0.03158739],
       [ 0.29049162, -0.13237961,  0.13677135, ..., -0.0122832 ,
         0.02511141, -0.06693441],
       ...,
       [ 0.09607232,  0.02213204, -0.03813851, ...,  0.01057824,
         0.00433295,  0.03088208],
       [ 0.08697255, -0.06783112,  0.00176349, ...,  0.01028695,
         0.01110754, -0.00518875],
       [ 0.27855426,  0.20460254, -0.0627861 , ..., -0.02298982,
        -0.0087584 ,  0.04936954]])