### Data Collection: Dataset from sklearn.datasets

In [1]:
# Import the loader for the 20 Newsgroups dataset.
from sklearn.datasets import fetch_20newsgroups

# Fetch the 20 Newsgroups training split.
# subset: selects which split to load (e.g. "test" or "train");
#         here we load the "train" split.
news_groups_train = fetch_20newsgroups(subset="train")

# Show the keys of the returned object. The ones used in this notebook are:
# data        : raw article texts
# target_names: category names
# target      : integer category label per article (an index into target_names)
news_groups_train.keys()

dict_keys(['data', 'filenames', 'target_names', 'target', 'DESCR'])

In [2]:
# Show the raw text of the first training article (headers and body).
news_groups_train.data[0]

"From: lerxst@wam.umd.edu (where's my thing)\nSubject: WHAT car is this!?\nNntp-Posting-Host: rac3.wam.umd.edu\nOrganization: University of Maryland, College Park\nLines: 15\n\n I was wondering if anyone out there could enlighten me on this car I saw\nthe other day. It was a 2-door sports car, looked to be from the late 60s/\nearly 70s. It was called a Bricklin. The doors were really small. In addition,\nthe front bumper was separate from the rest of the body. This is \nall I know. If anyone can tellme a model name, engine specs, years\nof production, where this car is made, history, or whatever info you\nhave on this funky looking car, please e-mail.\n\nThanks,\n- IL\n   ---- brought to you by your neighborhood Lerxst ----\n\n\n\n\n"

In [3]:
# Show the integer category label assigned to each training article.
news_groups_train.target

array([7, 4, 4, ..., 3, 1, 8])

In [4]:
# List the category names; a label in `target` is an index into this list.
news_groups_train.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

### Transforming text to vectors: TF-IDF vectorization

In [5]:
# Text-feature classes used in this cell:
# CountVectorizer  : converts raw documents into a matrix of token counts.
# TfidfTransformer : converts a count matrix into its tf-idf representation.
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

# Build both stages up front and keep them in named variables so the fitted
# objects can be applied to other documents afterwards.
count_vector = CountVectorizer()
tfidf_transformer = TfidfTransformer()

# Stage 1: build the vocabulary from the training articles and return the
# document-term count matrix.
X_train_counts = count_vector.fit_transform(news_groups_train.data)

# Stage 2: fit on the count matrix and return its tf-idf vectors.
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

### Create naive Bayes classifier model

In [6]:
# Build a multinomial naive Bayes model and train it in one step:
# features are the tf-idf vectors, targets are the integer category labels.
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB().fit(
    X_train_tfidf,
    news_groups_train.target,
)

### Run Classification & check results

In [7]:
# Hand-written sample documents to classify.
docs_new = [
    "God is love",
    "OpenGL on the GPU is fast",
    "United states goes to Iraq",
]

# Apply the already-fitted vectorizer and tf-idf transformer (transform only,
# no re-fitting) to turn the new documents into tf-idf vectors.
X_new_counts = count_vector.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

# Predict one category label per document.
predicted = clf.predict(X_new_tfidf)

# Print each document next to the name of its predicted category.
category_names = news_groups_train.target_names
for index, text in enumerate(docs_new):
    label = category_names[predicted[index]]
    print("{0} => {1}".format(text, label))

God is love => soc.religion.christian
OpenGL on the GPU is fast => rec.autos
United states goes to Iraq => talk.politics.mideast
