# 20 뉴스 그룹 분류

In [112]:
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_20newsgroups 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
import re

In [2]:
news = fetch_20newsgroups(subset='all', random_state=2021)

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


### 데이터 탐색

In [3]:
news.keys()

dict_keys(['data', 'filenames', 'target_names', 'target', 'DESCR'])

In [5]:
news.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [7]:
type(news.target), len(news.target)

(numpy.ndarray, 18846)

In [8]:
pd.Series(news.target).value_counts().sort_index()

0     799
1     973
2     985
3     982
4     963
5     988
6     975
7     990
8     996
9     994
10    999
11    991
12    984
13    990
14    987
15    997
16    910
17    940
18    775
19    628
dtype: int64

In [12]:
print(news.data[0])

From: dagibbs@quantum.qnx.com (David Gibbs)
Subject: Re: Countersteering sans Hands
Organization: QNX Software Systems, Ltd.
Lines: 22

In article <1993Apr20.203344.8417@cs.cornell.edu> karr@cs.cornell.edu (David Karr) writes:
>In article <Clarke.6.735328328@bdrc.bd.com> Clarke@bdrc.bd.com (Richard Clarke) writes:
>>So how do I steer when my hands aren't on the bars? (Open Budweiser in left 
>>hand, Camel cigarette in the right, no feet allowed.) 
>
>>If I lean, and the 
>>bike turns, am I countersteering?
>
>No, the bars would turn only *toward* the direction of turn in
>no-hands steering.

Just in case the original poster was looking for a serious answer,
I'll supply one.

Yes, even when steering no hands you do something quite similar
to countersteering.  Basically to turn left, you to a quick wiggle
of the bike to the right first, causing a counteracting lean to
occur to the left.  It is a lot more difficult to do on a motorcycle
than a bicycle though, because of the extra weight. 

## 훈련/테스트용 데이터 추출

In [84]:
train_news = fetch_20newsgroups(subset='train', random_state=2021, remove=['headers','footers','quotes'])
len(train_news.data)

11314

In [85]:
test_news = fetch_20newsgroups(subset='test', random_state=2021, remove=['headers','footers','quotes'])
len(test_news.data)

7532

In [86]:
print(train_news.data[0])


Stop! Hold it! You have a few problems here. Official history says that 
the first accusations of homosexuality in the SA came from OUTSIDE of the Nazi 
party, long BEFORE the Nazis ever came to power. So this objection is a red
herring, even if established history is wrong on this point. Moreover, none of 
the histories I've read ever made mention of Hitler or anyone else ever using 
homosexuality as a pretext for purging Roehm. A point I saw reiterated was that
Hitler and the party covered up these accusations. If you are going to accuse
official history of being a fabrication, you should at least get your facts
right. The pretext for purging Roehm was that he was planning to use the SA in
a coup against Hitler. Nowhere is there mention of using allegations of
homosexuality as a pretext for the purge, nor as a justification afterwards (it
is possible that the histories I've read have not mentioned this, but I doubt
it - would it be in Hitler's best interest to admit to the world tha

## 텍스트 데이터 전처리

In [102]:
train_df = pd.DataFrame({'article' : train_news.data})
test_df = pd.DataFrame({'article' : test_news.data})

In [103]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11314 entries, 0 to 11313
Data columns (total 1 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   article  11314 non-null  object
dtypes: object(1)
memory usage: 88.5+ KB


In [104]:
train_df['article'] = train_df['article'].astype(str).apply(lambda x : re.sub('r[^A-Za-z0-9 ]', '', x))
test_df['article'] = test_df['article'].astype(str).apply(lambda x : re.sub('r[^A-Za-z0-9 ]', '', x))

In [105]:
train_df.article[0]

'\nStop! Hold it! You have a few problems here. Official history says that \nthe first accusations of homosexuality in the SA came from OUTSIDE of the Nazi \nparty, long BEFORE the Nazis ever came to powe So this objection is a red\nherring, even if established history is wrong on this point. Moreove none of \nthe histories I\'ve read ever made mention of Hitler or anyone else ever using \nhomosexuality as a pretext for purging Roehm. A point I saw reiterated was that\nHitler and the party covered up these accusations. If you are going to accuse\nofficial history of being a fabrication, you should at least get your facts\nright. The pretext for purging Roehm was that he was planning to use the SA in\na coup against Hitle Nowhere is there mention of using allegations of\nhomosexuality as a pretext for the purge, nor as a justification afterwards (it\nis possible that the histories I\'ve read have not mentioned this, but I doubt\nit - would it be in Hitles best interest to admit to the w

In [98]:
# train_df['article'] = train_df.article.apply(lambda x : ' '.join([w for w in x.split() if len(w) > 3]))

In [100]:
# train_df['article'] = train_df.article.apply(lambda x : x.lower())

## 텍스트 변환

In [107]:
tvect = TfidfVectorizer(stop_words='english')
tvect.fit(train_df.article)
X_train = tvect.transform(train_df.article)
X_test = tvect.transform(test_df.article)

In [108]:
y_train = train_news.target
y_test = test_news.target

In [111]:
svc = SVC()
svc.fit(X_train, y_train)


SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [113]:
pred = svc.predict(X_test)
score = accuracy_score(y_test, pred)

In [114]:
score

0.6759160913436006