# Computational Linear Algebra for Coders

## 2. Topic Modeling with NMF and SVD 

### Set up data

In [7]:
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn import decomposition
from scipy import linalg
import matplotlib.pyplot as plt

In [8]:
# Render matplotlib figures inline, directly in the notebook output.
%matplotlib inline
# Print floats in fixed-point notation rather than scientific notation.
np.set_printoptions(suppress=True)

In [None]:
# Fetch the full, unfiltered test split (scikit-learn downloads it if it is
# not already cached locally). The trailing semicolon stops IPython from
# echoing the entire returned dataset object into the cell output.
fetch_20newsgroups(subset='test');

In [10]:
# Restrict the corpus to four newsgroups and strip headers, footers and
# quoted replies from each post before loading.
categories = ['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space']
remove = ('headers', 'footers', 'quotes')

# Load both splits with identical filtering; 'train' is fetched first, then 'test'.
newsgroups_train, newsgroups_test = (
    fetch_20newsgroups(subset=split, categories=categories, remove=remove)
    for split in ('train', 'test')
)

In [12]:
# Shapes of the per-document file paths and labels, shown as a pair.
tuple(field.shape for field in (newsgroups_train.filenames, newsgroups_train.target))

((2034,), (2034,))

In [15]:
# List the fields exposed by the dataset object returned by fetch_20newsgroups.
newsgroups_train.keys()

dict_keys(['data', 'filenames', 'target_names', 'target', 'DESCR', 'description'])

In [18]:
# Integer class label of every training document (key access; the same
# field is read as the attribute `.target` elsewhere in this notebook).
newsgroups_train['target']

array([1, 3, 2, ..., 1, 0, 1])

In [19]:
# Translate the first three integer labels into newsgroup names by indexing
# the array of names with an array of integer positions.
np.array(newsgroups_train.target_names)[newsgroups_train.target[:3]]

array(['comp.graphics', 'talk.religion.misc', 'sci.space'], dtype='<U18')

In [20]:
# Raw integer labels of the first three training documents.
newsgroups_train.target[:3]

array([1, 3, 2])

In [21]:
# Newsgroup names; an integer label is a position in this list.
newsgroups_train.target_names

['alt.atheism', 'comp.graphics', 'sci.space', 'talk.religion.misc']

In [22]:
# Spelled-out version of the label-to-name lookup: indexing an array with the
# list [1, 3, 2] selects the elements at those positions, in that order.
np.array(['alt.atheism', 'comp.graphics', 'sci.space', 'talk.religion.misc'])[[1, 3, 2]]

array(['comp.graphics', 'talk.religion.misc', 'sci.space'], dtype='<U18')

In [23]:
# Integer labels of the first ten training documents.
newsgroups_train.target[:10]

array([1, 3, 2, 0, 2, 0, 2, 1, 2, 1])

In [24]:
# Topic-model settings. NOTE(review): presumably the number of topics to
# extract and how many top words to show per topic -- the cells that consume
# these are outside this section.
num_topics = 6
num_top_words = 8

In [26]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [63]:
# Bag-of-words model: count term occurrences per document, ignoring
# scikit-learn's built-in English stop-word list.
vectorizer = CountVectorizer(stop_words='english')
# Learn the vocabulary from the training posts and build the document-term
# count matrix in one pass (one row per document, one column per term).
vectors = vectorizer.fit_transform(newsgroups_train.data)

In [64]:
# Inspect the container type returned by fit_transform.
type(vectors)

scipy.sparse.csr.csr_matrix

In [65]:
# Average number of stored non-zero entries per document row.
# True division already yields a float in Python 3, so no explicit cast is needed.
vectors.nnz / vectors.shape[0]

65.70009832841691

In [66]:
vectors = vectors.todense()

In [67]:
type(vectors)

numpy.matrixlib.defmatrix.matrix

In [68]:
# Sanity check: the number of training documents should equal the number of
# rows in `vectors`.
print(len(newsgroups_train.data), vectors.shape)

2034 (2034, 26576)


In [69]:
# Vocabulary terms as a NumPy string array, so they can be indexed with
# integer arrays later on.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# use `get_feature_names_out()` when available and fall back on older releases.
# `dtype=str` keeps a fixed-width unicode array in both cases
# (`get_feature_names_out()` returns an object-dtype array).
if hasattr(vectorizer, 'get_feature_names_out'):
    vocab = np.array(vectorizer.get_feature_names_out(), dtype=str)
else:
    vocab = np.array(vectorizer.get_feature_names(), dtype=str)

In [70]:
# Vocabulary size; should match the number of columns in `vectors`.
vocab.shape

(26576,)

In [71]:
# Peek at twenty consecutive vocabulary entries.
vocab[7000:7020]

array(['cosmonauts', 'cosmos', 'cosponsored', 'cost', 'costa', 'costar',
       'costing', 'costly', 'costruction', 'costs', 'cosy', 'cote',
       'couched', 'couldn', 'council', 'councils', 'counsel',
       'counselees', 'counselor', 'count'], dtype='<U80')

### Singular Value Decomposition (SVD)