In [4]:
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn import decomposition
from scipy import linalg
import matplotlib.pyplot as plt

In [5]:
%matplotlib inline
np.set_printoptions(suppress=True)

In [6]:
categories = ['comp.os.ms-windows.misc', 'rec.autos', 'sci.electronics', 'sci.space']
remove = ('headers', 'footers', 'quotes')
newsgroups_train = fetch_20newsgroups(subset='train', categories=categories, remove=remove)

In [7]:
newsgroups_test = fetch_20newsgroups(subset='test', categories=categories, remove=remove)

In [8]:
newsgroups_train.filenames.shape, newsgroups_test.filenames.shape

((2369,), (1577,))

In [9]:
newsgroups_train.filenames.shape[0]

2369

In [10]:
random_index = np.random.randint(0, newsgroups_train.filenames.shape[0]-1, 3)
random_index

array([1734, 1261, 1727])

In [11]:
print("\n#----------#\n".join(np.array(newsgroups_train.data)[random_index]))


Careful now folks... Also consider the 90VAC+ @20Hz that is forced on ring
and tip when the phone's supposed to ring!  Even with a simple zener
and LED setup, you might end up with some carbon real quick.  Whatever
scheme you use, make sure you've got at least 200V-rated components on 
the frontend.

Also remember that, if I'm not mistaken, the phone line is a 600ohm
equivalent circuit.  Any current you draw from the 48V or so gets
dropped across that 600ohms.  That's fine until you're down to roughly
12V, when Ma Bell considers it to be off-hook.  But dropping it that
far down is probably a big no-no.

The easiest implementation to accomplish the above??

    tip  ------->|-----\/\/\/\-----+----------+
              rectifier  resistor  |          |
              diode                |          \ 
                                   V          /
                          zener  /---/        \  resistor
                                   |          /
                                   

In [12]:
np.array(newsgroups_train.target_names)[newsgroups_train.target[random_index]]

array(['sci.electronics', 'rec.autos', 'comp.os.ms-windows.misc'],
      dtype='<U23')

In [13]:
newsgroups_train.target[:10]

array([0, 0, 0, 2, 0, 2, 2, 1, 1, 0], dtype=int64)

In [14]:
num_topics, num_top_words = 6, 8

# Stop words

In [15]:
from sklearn.feature_extraction import stop_words
sorted(list(stop_words.ENGLISH_STOP_WORDS))[:20]

['a',
 'about',
 'above',
 'across',
 'after',
 'afterwards',
 'again',
 'against',
 'all',
 'almost',
 'alone',
 'along',
 'already',
 'also',
 'although',
 'always',
 'am',
 'among',
 'amongst',
 'amoungst']

# Stemming and Lemmatization

In [16]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\PrilepskiyR\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [17]:
from nltk import stem

In [18]:
wnl = stem.WordNetLemmatizer()
porter = stem.porter.PorterStemmer()

In [19]:
word_list = ['feet', 'foot', 'foots', 'footing']

In [20]:
[wnl.lemmatize(word) for word in word_list]

['foot', 'foot', 'foot', 'footing']

In [21]:
[porter.stem(word) for word in word_list]

['feet', 'foot', 'foot', 'foot']

## Small tasks

In [22]:
def get_lem_stem(w_list):
    w_list_lemmas = [wnl.lemmatize(word) for word in w_list]
    w_list_stemmed = [porter.stem(word) for word in w_list]
    
    print("Word list: {}".format(w_list))
    print("Lemmas list: {}".format(w_list_lemmas))
    print("Stemmed list: {}".format(w_list_stemmed))
    
    return w_list_lemmas, w_list_stemmed

In [23]:
t1 = ["fly", "flies", "flying"]
t2 = ["organize", "organizes", "organizing"]
t3 = ["universe", "university"]

t = [t1, t2, t3]

In [24]:
for w_list in t:
    _, _ = get_lem_stem(w_list)
    print()

Word list: ['fly', 'flies', 'flying']
Lemmas list: ['fly', 'fly', 'flying']
Stemmed list: ['fli', 'fli', 'fli']

Word list: ['organize', 'organizes', 'organizing']
Lemmas list: ['organize', 'organizes', 'organizing']
Stemmed list: ['organ', 'organ', 'organ']

Word list: ['universe', 'university']
Lemmas list: ['universe', 'university']
Stemmed list: ['univers', 'univers']



# Spacy

In [33]:
#!python -m spacy download en_core_web_sm

In [1]:
import spacy

In [10]:
from spacy.lemmatizer import Lemmatizer

In [None]:
from spacy.lang.en import 

In [17]:
from spacy.language import Lookups

In [20]:
Lookups.tables

<property at 0x1c5e77d8ea8>

In [2]:
nlp = spacy.load("en_core_web_sm")

In [11]:
lookups = nlp.vocab.lookups.tables

In [13]:
lookups

['lemma_lookup', 'lemma_rules', 'lemma_index', 'lemma_exc']

In [15]:
nlp.vocab.lookups.get_table('lemma_lookup')

Table([(2216152597797699947, 'because'),
       (11195369525898364697, 'would'),
       (1634435218575327026, 'them'),
       (5816706412157836887, 'will'),
       (8504683628951808721, 'be'),
       (13149186524847800786, 'and'),
       (16428057658620181782, 'have'),
       (3670149026463577134, 'until'),
       (2989924464908128948, 'have'),
       (12896981843026651581, 'A-bomb'),
       (4076275946414587309, 'AGM'),
       (14289757955295490859, 'Aborigine'),
       (9234253121641529843, 'Admiralty'),
       (6960077336727894223, 'Afghan'),
       (1537038473495469427, 'African'),
       (17536175136822184822, 'Afrikaner'),
       (16471051286479716561, 'Afro'),
       (11115041678086105894, 'Albanian'),
       (8871703708707614654, 'Algerian'),
       (2184881304099978502, 'Alsatian'),
       (13728823665023535747, 'Americanism'),
       (1381591446222365994, 'Americanize'),
       (18162407286073885813, 'Americanize'),
       (8057550804358256442, 'Americanize'),
       (1101150

In [12]:
lemmatizer = Lemmatizer(lookups)

ValueError: [E173] As of v2.2, the Lemmatizer is initialized with an instance of Lookups containing the lemmatization tables. See the docs for details: https://spacy.io/api/lemmatizer#init

In [28]:
[lemmatizer.lookup(word) for word in word_list]

NameError: name 'lemmatizer' is not defined

In [40]:
nlp = spacy.load("en_core_web_sm")

OSError: [E050] Can't find model 'en_core_web_sm'. It doesn't seem to be a shortcut link, a Python package or a valid path to a data directory.

In [None]:
sorted