In [179]:
import sklearn
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, HashingVectorizer
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.metrics import classification_report, f1_score, accuracy_score
from sklearn.svm import LinearSVC
import numpy as np
import spacy
from numpy import savetxt

In [207]:
data = fetch_20newsgroups(subset="all", remove=('headers', 'footers', 'quotes'))
#data = fetch_20newsgroups(subset="all")

In [215]:
data.target_names #categories

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [98]:
data.filenames.shape #file names

(18846,)

In [99]:
dir(data) #calls you can make on data

['DESCR', 'data', 'filenames', 'target', 'target_names']

In [204]:
data.data[5]

'From: tell@cs.unc.edu (Stephen Tell)\nSubject: Re: subliminal message flashing on TV\nOrganization: The University of North Carolina at Chapel Hill\nLines: 25\nNNTP-Posting-Host: rukbat.cs.unc.edu\n\nIn article <7480237@hpfcso.FC.HP.COM> myers@hpfcso.FC.HP.COM (Bob Myers) writes:\n>> Hi.  I was doing research on subliminal suggestion for a psychology\n>> paper, and I read that one researcher flashed hidden messages on the\n>> TV screen at 1/200ths of a second.  Is that possible?\n\n> Might\n>even be a vector ("strokewriter") display, in which case the lower limit\n>on image time is anyone\'s guess (and is probably phosphor-persistence limited).\n\nBack in high school I worked as a lab assistant for a bunch of experimental\npsychologists at Bell Labs.  When they were doing visual perception and\nmemory experiments, they used vector-type displays, with 1-millisecond\nrefresh rates common.\n\nSo your case of 1/200th sec is quite practical, and the experimenters were\nprobably sure that i

In [205]:
data.target[5] #groups each article belongs to

12

In [208]:
%%time
nlp = spacy.load("en_core_web_md") #load spacy engligh

CPU times: user 14.3 s, sys: 722 ms, total: 15 s
Wall time: 15.1 s


In [209]:
from spacy.lang.en.stop_words import STOP_WORDS
spacy_stopwords = spacy.lang.en.stop_words.STOP_WORDS
print('Number of stop words: %d' % len(spacy_stopwords))
print('First ten stop words: %s' % list(spacy_stopwords)[:10])

Number of stop words: 326
First ten stop words: ['and', 'everything', "'d", 'regarding', 'thus', 'your', 'whence', 'which', 'fifty', 'other']


In [210]:
customize_stop_words = [
    '\n', '\n\n', '\n\n\n', '\n\n\n\n'
]
for w in customize_stop_words:
    nlp.vocab[w].is_stop = True

In [211]:
#clean articles stop words, spaces, and puncuation removed
for i in range(len(data.data)):
    doc = nlp(data.data[i])

    data.data[i] = [token.text for token in doc if not token.is_stop |token.is_punct |token.is_space]
    # print('Original Article: %s' % (data.data[0]))
    # print()
    # print(tokens)

In [212]:
data.data[0] #example of article after being cleaned

['sure',
 'bashers',
 'Pens',
 'fans',
 'pretty',
 'confused',
 'lack',
 'kind',
 'posts',
 'recent',
 'Pens',
 'massacre',
 'Devils',
 'Actually',
 'bit',
 'puzzled',
 'bit',
 'relieved',
 'going',
 'end',
 'non',
 'PIttsburghers',
 'relief',
 'bit',
 'praise',
 'Pens',
 'Man',
 'killing',
 'Devils',
 'worse',
 'thought',
 'Jagr',
 'showed',
 'better',
 'regular',
 'season',
 'stats',
 'lot',
 'fo',
 'fun',
 'watch',
 'playoffs',
 'Bowman',
 'let',
 'JAgr',
 'lot',
 'fun',
 'couple',
 'games',
 'Pens',
 'going',
 'beat',
 'pulp',
 'Jersey',
 'disappointed',
 'Islanders',
 'lose',
 'final',
 'regular',
 'season',
 'game',
 'PENS',
 'RULE']

In [213]:
#changes articles back into string
clean_articles=list()
for i in range(len(data.data)):
    temp=' '.join(word for word in data.data[i])
    clean_articles.append(temp)

In [218]:
clean_articles[5]

'high school worked lab assistant bunch experimental psychologists Bell Labs visual perception memory experiments vector type displays 1-millisecond refresh rates common case 1/200th sec practical experimenters probably sure 5 milliseconds 4 6 Steve'

In [219]:
text = clean_articles
target = data["target"]
print("The following are the 20 topics that an article can belong to:")
print(data["target_names"])

The following are the 20 topics that an article can belong to:
['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']


In [220]:
X_train, X_test, y_train, y_test = train_test_split(text, target, random_state=0)

In [221]:
print(f"The training dataset contains {len(X_train)} articles.")
print(f"The test dataset contains {len(X_test)} articles.")

The training dataset contains 14134 articles.
The test dataset contains 4712 articles.


In [222]:
#vectorizes each article into a (300,1) vector
X_train_glove= np.array([nlp(text).vector for text in X_train])
X_test_glove= np.array([nlp(text).vector for text in X_test])

In [182]:
#above is the spacy glove tokenization of the words. could try many others to see how the results come out. going to pass this data to R and see how it does with clusting agoritms

savetxt('X_train.csv', X_train_glove, delimiter=',')

In [183]:
savetxt('X_test.csv', X_test_glove, delimiter=',')
savetxt('y_test.csv', y_test, delimiter=',')
savetxt('y_train.csv', y_train, delimiter=',')

In [217]:
X_test[0]


'Uh slight clarification printer driver c.itoh LIPS10 laser printer Thanks'

In [13]:
data.target.shape #target categories groups

(18846,)

In [20]:
%%time
# Use English stopwords and produce a BoW representation for the data using up to trigrams
# Save the vectorizer as counter and the transformed data as X_train_bow, and X_test_bow
# YOUR CODE HERE
counter = CountVectorizer(stop_words='english',ngram_range=(1, 3)).fit(X_train, y_train)
X_train_bow = counter.transform(X_train)
X_test_bow=counter.transform(X_test)

CPU times: user 17.8 s, sys: 483 ms, total: 18.3 s
Wall time: 18.3 s


In [38]:
test=X_test_bow.toarray()
test.shape

(4712, 2262337)

In [41]:
np.sum(test[0,:])

11

In [21]:
%%time
# Use the BoW representation you just created above to produce a TFIDF representation of the data
# Save the transformer to tfidfer and the transformed data as X_train_tfidf, and X_test_tfidf

# YOUR CODE HERE
tfidfer=TfidfTransformer()
X_train_tfidf=tfidfer.fit_transform(X_train_bow)
X_test_tfidf=tfidfer.transform(X_test_bow)

CPU times: user 1.11 s, sys: 96.1 ms, total: 1.2 s
Wall time: 1.2 s


scipy.sparse.csr.csr_matrix