In [1]:
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn import decomposition
from scipy import linalg
import matplotlib.pyplot as plt

In [2]:
%matplotlib inline
np.set_printoptions(suppress=True)

In [3]:
categories = ['comp.os.ms-windows.misc', 'rec.autos', 'sci.electronics', 'sci.space']
remove = ('headers', 'footers', 'quotes')
newsgroups_train = fetch_20newsgroups(subset='train', categories=categories, remove=remove)

In [4]:
newsgroups_test = fetch_20newsgroups(subset='test', categories=categories, remove=remove)

In [5]:
newsgroups_train.filenames.shape, newsgroups_test.filenames.shape

((2369,), (1577,))

In [6]:
newsgroups_train.filenames.shape[0]

2369

In [7]:
random_index = np.random.randint(0, newsgroups_train.filenames.shape[0]-1, 3)
random_index

array([1753, 2193, 1553])

In [8]:
print("\n\n#================================#\n\n".join(np.array(newsgroups_train.data)[random_index]))

  You can try defraging your disk more often. It definitely will
 help speed things up. A 2 megs smartdrv is also a good idea with
 the amount of memory you have, and use fastdisk (32bit access) if
 you not already. Hope that help..


C >Hi:
C >I have a 486DX2-66MHz computer to use with an A/D board
C >for data acquisition on an AT bus...I'm having problems.
C >The AT bus runs at 12.5 MHz - correct?  So there should
C >be no bus speed conflict. But I read somewhere that the
C >new 486DX2-66 MHz CPU runs on a 33 MHz bus - is that for
C >the local bus or the AT bus also - if so then I have a problem.
C >^^^^^^^^^^^^^^^^^^^^^^^^^^^
C >When I run on non-turbo-mode the speed goes to 8 MHz and the
C >A/D doesn't work.  Please mail your views!  Thanks.
C >Vincent
C >cyl5@musica.mcgill.ca
C >

The STANDARD AT bus (ISA) runs at 8MHz, not 12.5 MHz, but some 
non-stnadard ISA buses do have higher clock rates, but be careful, since 
some boards don't work with faster than standard rates.  For inst

In [9]:
np.array(newsgroups_train.target_names)[newsgroups_train.target[random_index]]

array(['comp.os.ms-windows.misc', 'sci.electronics', 'sci.space'],
      dtype='<U23')

In [10]:
newsgroups_train.target[:10]

array([0, 0, 0, 2, 0, 2, 2, 1, 1, 0])

In [11]:
num_topics, num_top_words = 6, 8

# Stop words

In [12]:
from sklearn.feature_extraction import stop_words
sorted(list(stop_words.ENGLISH_STOP_WORDS))[:20]

['a',
 'about',
 'above',
 'across',
 'after',
 'afterwards',
 'again',
 'against',
 'all',
 'almost',
 'alone',
 'along',
 'already',
 'also',
 'although',
 'always',
 'am',
 'among',
 'amongst',
 'amoungst']

# Stemming and Lemmatization

In [13]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /home/rprilepskiy/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [14]:
from nltk import stem

In [15]:
wnl = stem.WordNetLemmatizer()
porter = stem.porter.PorterStemmer()

In [16]:
word_list = ['feet', 'foot', 'foots', 'footing']

In [17]:
[wnl.lemmatize(word) for word in word_list]

['foot', 'foot', 'foot', 'footing']

In [18]:
[porter.stem(word) for word in word_list]

['feet', 'foot', 'foot', 'foot']

## Small tasks

In [19]:
def get_lem_stem(w_list):
    w_list_lemmas = [wnl.lemmatize(word) for word in w_list]
    w_list_stemmed = [porter.stem(word) for word in w_list]
    
    print("Word list: {}".format(w_list))
    print("Lemmas list: {}".format(w_list_lemmas))
    print("Stemmed list: {}".format(w_list_stemmed))
    
    return w_list_lemmas, w_list_stemmed

In [20]:
t1 = ["fly", "flies", "flying"]
t2 = ["organize", "organizes", "organizing"]
t3 = ["universe", "university"]

t = [t1, t2, t3]

In [21]:
for w_list in t:
    _, _ = get_lem_stem(w_list)
    print()

Word list: ['fly', 'flies', 'flying']
Lemmas list: ['fly', 'fly', 'flying']
Stemmed list: ['fli', 'fli', 'fli']

Word list: ['organize', 'organizes', 'organizing']
Lemmas list: ['organize', 'organizes', 'organizing']
Stemmed list: ['organ', 'organ', 'organ']

Word list: ['universe', 'university']
Lemmas list: ['universe', 'university']
Stemmed list: ['univers', 'univers']



# Spacy

In [22]:
# !python -m spacy download en_core_web_sm

In [23]:
import spacy

In [24]:
from spacy.lemmatizer import Lemmatizer

In [25]:
from spacy.language import Lookups

In [26]:
from spacy.lookups import Lookups

In [27]:
nlp = spacy.load("en_core_web_sm")

In [28]:
lookups = Lookups() #nlp.vocab.lookups.tables

In [29]:
# nlp.vocab.lookups.get_table('lemma_lookup')

In [30]:
lemmatizer = Lemmatizer(lookups)

In [31]:
[lemmatizer.lookup(word) for word in word_list]

['feet', 'foot', 'foots', 'footing']

In [32]:
sorted(list(nlp.Defaults.stop_words))[:20]

["'d",
 "'ll",
 "'m",
 "'re",
 "'s",
 "'ve",
 'a',
 'about',
 'above',
 'across',
 'after',
 'afterwards',
 'again',
 'against',
 'all',
 'almost',
 'alone',
 'along',
 'already',
 'also']

# stop words in spacy but not in sklearn

In [33]:
set(nlp.Defaults.stop_words) - set(stop_words.ENGLISH_STOP_WORDS)

{"'d",
 "'ll",
 "'m",
 "'re",
 "'s",
 "'ve",
 'ca',
 'did',
 'does',
 'doing',
 'just',
 'make',
 "n't",
 'n‘t',
 'n’t',
 'quite',
 'really',
 'regarding',
 'say',
 'unless',
 'used',
 'using',
 'various',
 '‘d',
 '‘ll',
 '‘m',
 '‘re',
 '‘s',
 '‘ve',
 '’d',
 '’ll',
 '’m',
 '’re',
 '’s',
 '’ve'}

# stop words in sklearn but not in spacy

In [34]:
set(stop_words.ENGLISH_STOP_WORDS) - set(nlp.Defaults.stop_words)

{'amoungst',
 'bill',
 'cant',
 'co',
 'con',
 'couldnt',
 'cry',
 'de',
 'describe',
 'detail',
 'eg',
 'etc',
 'fill',
 'find',
 'fire',
 'found',
 'hasnt',
 'ie',
 'inc',
 'interest',
 'ltd',
 'mill',
 'sincere',
 'system',
 'thick',
 'thin',
 'un'}

# Data Processing

In [35]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [36]:
vectorizer = CountVectorizer(stop_words='english')

In [37]:
vectors = vectorizer.fit_transform(newsgroups_train.data).todense()
vectors.shape

(2369, 50766)

In [38]:
print(len(newsgroups_train.data), vectors.shape)

2369 (2369, 50766)


In [39]:
vocab = np.array(vectorizer.get_feature_names())
vocab.shape

(50766,)

In [40]:
vocab[12000:12020]

array(['atmos', 'atmosphere', 'atmospheres', 'atmospheric',
       'atmospherics', 'atn', 'atng1', 'ato', 'atom', 'atomic', 'atomics',
       'atoms', 'atop', 'atr', 'atraction', 'atrophy', 'atru', 'ats',
       'att', 'attach'], dtype='<U81')

# SVD

In [42]:
%time U, s, Vh = linalg.svd(vectors, full_matrices=False)

CPU times: user 2min 58s, sys: 8.47 s, total: 3min 7s
Wall time: 59.1 s


In [43]:
print(U.shape, s.shape, Vh.shape)

(2369, 2369) (2369,) (2369, 50766)


In [44]:
print(vectors.shape)

(2369, 50766)


In [50]:
(U @ np.diag(s) @ Vh).shape

(2369, 50766)

In [51]:
reconstructed_vectors = U @ np.diag(s) @ Vh

In [63]:
np.linalg.norm(reconstructed_vectors - vectors)

7.503692934463036e-11