In [1]:
# import basic perceptron
from sklearn.linear_model import Perceptron
# import dataset
from sklearn.datasets import fetch_20newsgroups

In [2]:
# limit categories
categories = ['alt.atheism', 'sci.med']
# obtain documents
train = fetch_20newsgroups(subset='train',categories=categories, shuffle=True)
train.data[0]

'From: sandvik@newton.apple.com (Kent Sandvik)\nSubject: Re: New Member\nOrganization: Cookamunga Tourist Bureau\nLines: 20\n\nIn article <C5HIEw.7s1@portal.hq.videocart.com>,\ndfuller@portal.hq.videocart.com (Dave Fuller) wrote:\n>   He is right. Just because an event was explained by a human to have been\n> done "in the name of religion", does not mean that it actually followed\n> the religion. He will always point to the "ideal" and say that it wasn\'t\n> followed so it can\'t be the reason for the event. There really is no way\n> to argue with him, so why bother. Sure, you may get upset because his \n> answer is blind and not supported factually - but he will win every time\n> with his little argument. I don\'t think there will be any postings from\n> me in direct response to one of his.\n\nHey! Glad to have some serious and constructive contributors in this\nnewsgroup. I agree 100% on the statement above, you might argue with\nBobby for eons, and he still does not get it, so the b

In [3]:
# define perceptron
perceptron = Perceptron(max_iter=100, verbose=0, random_state=None, fit_intercept=True)

In [4]:
# convert a collection of text documents to a matrix of token counts
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
X_train_counts = cv.fit_transform(train.data)
X_train_counts.shape

(1074, 22735)

In [8]:
# compute TFIDF representations of count vectors
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_tf = TfidfTransformer()
X_train_tfidf = tfidf_tf.fit_transform(X_train_counts)

In [9]:
# perceptron is trained on TFIDF vectors
perceptron.fit(X_train_tfidf,train.target)

Perceptron(max_iter=100, random_state=None)

In [10]:
# test data
test_docs = ['Religion is widespread, even in modern times', 'His kidney failed','The pope is a controversial leader', 'White blood cells fight off infections','The reverend had a heart attack in church']

# test data is vectorized
X_test_counts = cv.transform(test_docs)
X_test_tfidf = tfidf_tf.transform(X_test_counts)

In [12]:
# predict result
pred = perceptron.predict(X_test_tfidf)
for doc, category in zip(test_docs, pred):
    print('%r => %s' % (doc, train.target_names[category]))

'Religion is widespread, even in modern times' => alt.atheism
'His kidney failed' => sci.med
'The pope is a controversial leader' => alt.atheism
'White blood cells fight off infections' => sci.med
'The reverend had a heart attack in church' => sci.med
