In [1]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Training split of the 20 Newsgroups dataset; used below to fit the
# tf-idf vectorizer and the news-category classifier.
newsgroups_train = fetch_20newsgroups(subset='train')

In [3]:
# Test split of the same dataset; in this notebook it is only transformed
# and scored against, never fitted on.
newsgroups_test = fetch_20newsgroups(subset='test')

In [4]:
# categories: the 20 newsgroup names; the custom labels in `cats` follow
# this same order
list(newsgroups_train.target_names)

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [5]:
# custom categories: a reader-friendly label for every newsgroup.
# The entries are written in the same order as newsgroups_train.target_names,
# so cats[i] is the label for class index i -- do not reorder them.
_label_by_newsgroup = {
    'alt.atheism': 'Atheism',
    'comp.graphics': 'Graphics',
    'comp.os.ms-windows.misc': 'Microsoft',
    'comp.sys.ibm.pc.hardware': 'PC hardware',
    'comp.sys.mac.hardware': 'Apple',
    'comp.windows.x': 'Windows',
    'misc.forsale': 'For Sale',
    'rec.autos': 'Automobiles',
    'rec.motorcycles': 'Motorcycles',
    'rec.sport.baseball': 'Baseball',
    'rec.sport.hockey': 'Hockey',
    'sci.crypt': 'Security',
    'sci.electronics': 'Electronics',
    'sci.med': 'Medicine',
    'sci.space': 'Space',
    'soc.religion.christian': 'Religion',
    'talk.politics.guns': 'Weapons',
    'talk.politics.mideast': 'Middle East',
    'talk.politics.misc': 'Politics',
    'talk.religion.misc': 'Religious Talks',
}
# dicts keep insertion order, so this yields the labels in class-index order
cats = list(_label_by_newsgroup.values())

In [6]:
# number of posts in the training split
len(newsgroups_train.data)

11314

In [7]:
# number of posts in the test split
len(newsgroups_test.data)

7532

In [8]:
# peek at one raw training post; note the text still contains the mail
# headers and the signature, not just the message body
newsgroups_train.data[10]

'From: irwin@cmptrc.lonestar.org (Irwin Arnstein)\nSubject: Re: Recommendation on Duc\nSummary: What\'s it worth?\nDistribution: usa\nExpires: Sat, 1 May 1993 05:00:00 GMT\nOrganization: CompuTrac Inc., Richardson TX\nKeywords: Ducati, GTS, How much? \nLines: 13\n\nI have a line on a Ducati 900GTS 1978 model with 17k on the clock.  Runs\nvery well, paint is the bronze/brown/orange faded out, leaks a bit of oil\nand pops out of 1st with hard accel.  The shop will fix trans and oil \nleak.  They sold the bike to the 1 and only owner.  They want $3495, and\nI am thinking more like $3K.  Any opinions out there?  Please email me.\nThanks.  It would be a nice stable mate to the Beemer.  Then I\'ll get\na jap bike and call myself Axis Motors!\n\n-- \n-----------------------------------------------------------------------\n"Tuba" (Irwin)      "I honk therefore I am"     CompuTrac-Richardson,Tx\nirwin@cmptrc.lonestar.org    DoD #0826          (R75/6)\n-------------------------------------------

In [9]:
# label of that same post: its integer target is used as an index into cats
cats[newsgroups_train.target[10]]

'Motorcycles'

In [10]:
# tf-idf vectorizer with default settings; turns raw post text into the
# numeric feature vectors the classifier is trained on
tfidf = TfidfVectorizer()

In [11]:
# Fit the vectorizer on the training posts only, then reuse the fitted
# vectorizer (transform, not fit_transform) for the test posts so both
# matrices share the same feature columns.
train_vectors = tfidf.fit_transform(newsgroups_train.data)
test_vectors = tfidf.transform(newsgroups_test.data)

In [12]:
# (rows, columns): one row per training post, one column per feature
train_vectors.shape

(11314, 130107)

In [13]:
# average number of stored (non-zero) tf-idf entries per training post,
# i.e. how sparse each row is compared with the full feature count
train_vectors.nnz / float(train_vectors.shape[0])

157.9958458546933

In [14]:
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

In [15]:
# Multinomial naive Bayes classifier. alpha is its additive smoothing
# parameter; 0.1 is a hand-picked value (no tuning is shown in this notebook).
model = MultinomialNB(alpha=0.1)

In [16]:
# train on the tf-idf vectors, using the integer newsgroup ids as labels
model.fit(train_vectors, newsgroups_train.target)

MultinomialNB(alpha=0.1, class_prior=None, fit_prior=True)

In [17]:
# predicted class index for every post in the test split
pred = model.predict(test_vectors)

In [18]:
# share of test posts whose predicted class equals the true class
metrics.accuracy_score(newsgroups_test.target, pred)

0.8263409453000531

In [19]:
# A news snippet that is not part of the dataset, used as a manual sanity
# check of the trained classifier. Adjacent literals are concatenated by the
# parser, so this is a single string without any line breaks.
test_article = (
    "Gunmen stormed Kabul University on Monday as it hosted a book fair "
    "attended by the Iranian ambassador to Afghanistan, sparking an "
    "hours-long gun battle and leaving at least 19 dead and 22 wounded at "
    "the war-torn country's largest school. Three attackers involved in the "
    "assault were killed in the ensuing firefight in Kabul."
)

In [21]:
# Vectorize with the already-fitted tfidf (transform only) so the features
# line up with what the model was trained on. The article is wrapped in a
# list because transform is given a collection of documents, as above.
vector = tfidf.transform([test_article])

In [22]:
# predicted class index for the article (one entry per input document)
model.predict(vector)

array([17])

In [23]:
# Look the label up from the prediction itself instead of a hard-coded index
# copied from a displayed output, so the label stays correct if the article
# or the model changes. [0] selects the single document that was vectorized.
cats[model.predict(vector)[0]]

'Middle East'

In [24]:
from joblib import dump

In [25]:
# Persist the fitted vectorizer alongside the classifier: the model can only
# score vectors produced by this same fitted tfidf, so the classifier file
# alone is not enough to classify new text after reloading.
dump(tfidf, 'news_vectorizer.joblib')
# Kept as the last expression so the cell still displays the classifier path.
dump(model, 'news_classifier.joblib')

['news_classifier.joblib']