# News Group Classification with Naive Bayes

## 1. Import Dataset

In [2]:
from sklearn.datasets import fetch_20newsgroups
# Load the training split of the 20 Newsgroups corpus.
newsdata = fetch_20newsgroups(subset='train')

In [3]:
# List the fields exposed by the loaded dataset object.
newsdata.keys()

dict_keys(['data', 'filenames', 'target_names', 'target', 'DESCR'])

In [4]:
# Show the names of the newsgroup categories (the classes to predict).
newsdata.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [14]:
# Number of documents in the training split.
len(newsdata.data)

11314

In [5]:
# Inspect the first raw document; the text still contains the post headers
# (From, Subject, Organization, ...), not just the message body.
newsdata.data[0]

"From: lerxst@wam.umd.edu (where's my thing)\nSubject: WHAT car is this!?\nNntp-Posting-Host: rac3.wam.umd.edu\nOrganization: University of Maryland, College Park\nLines: 15\n\n I was wondering if anyone out there could enlighten me on this car I saw\nthe other day. It was a 2-door sports car, looked to be from the late 60s/\nearly 70s. It was called a Bricklin. The doors were really small. In addition,\nthe front bumper was separate from the rest of the body. This is \nall I know. If anyone can tellme a model name, engine specs, years\nof production, where this car is made, history, or whatever info you\nhave on this funky looking car, please e-mail.\n\nThanks,\n- IL\n   ---- brought to you by your neighborhood Lerxst ----\n\n\n\n\n"

## 2. Data Preprocessing

In [6]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

#### tokenize for NB

DTM (Document-Term Matrix)

In [7]:
# Build the document-term matrix (DTM): learn the vocabulary from the
# training texts and count term occurrences per document in one step.
dtmvector = CountVectorizer()
X_train_dtm = dtmvector.fit_transform(raw_documents=newsdata.data)

In [11]:
# DTM shape: 11314 rows (documents) x 130107 columns (distinct vocabulary terms)
X_train_dtm.shape

(11314, 130107)

TF-IDF (apply term weighting -> improves performance)

In [15]:
# Re-weight the raw counts with TF-IDF. The transformer is fitted on the
# training DTM only, so it can be reused as-is on the test data later.
tfidf_transformer = TfidfTransformer()
tfidv = tfidf_transformer.fit_transform(X=X_train_dtm)

In [17]:
# The TF-IDF matrix has the same shape as the DTM it was computed from.
tfidv.shape

(11314, 130107)

## 3. Modeling

In [19]:
# Multinomial Naive Bayes classifier trained on the TF-IDF features.
mod = MultinomialNB(
    alpha=1.0,         # alpha=1 : Laplace Smoothing
    class_prior=None,
    fit_prior=True,
)
mod.fit(X=tfidv, y=newsdata.target)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [20]:
# Load the held-out test split.
newsdata_test = fetch_20newsgroups(subset='test', shuffle=True)

# Reuse the vectorizer and TF-IDF transformer fitted on the training data:
# only `transform` is called here, so nothing is re-fitted on the test set.
X_test_dtm = dtmvector.transform(raw_documents=newsdata_test.data)
tfidv_test = tfidf_transformer.transform(X=X_test_dtm)

In [21]:
# Predict a class index for every test document.
predicted = mod.predict(X=tfidv_test)

In [22]:
# Peek at the predicted class indices.
predicted

array([ 7, 11,  0, ...,  9,  3, 15])

In [23]:
# Ground-truth class indices of the test documents, for comparison with `predicted`.
newsdata_test.target

array([ 7,  5,  0, ...,  9,  6, 15])

In [24]:
# Fraction of test documents whose predicted class matches the true class.
accuracy_score(y_true=newsdata_test.target, y_pred=predicted)

0.7738980350504514