In [12]:
from sklearn.datasets import fetch_20newsgroups

news_data = fetch_20newsgroups(subset = 'all', random_state = 156)

#어떤 key값을 가지고 있는지 확인
print(news_data.keys())

dict_keys(['data', 'filenames', 'target_names', 'target', 'DESCR'])


In [13]:
import pandas as pd

print("target 클래스의 값과 분포도 : \n",pd.Series(news_data.target).value_counts().sort_index())
print("target 클래스의 이름들 : \n",news_data.target_names)

target 클래스의 값과 분포도 : 
 0     799
1     973
2     985
3     982
4     963
5     988
6     975
7     990
8     996
9     994
10    999
11    991
12    984
13    990
14    987
15    997
16    910
17    940
18    775
19    628
dtype: int64
target 클래스의 이름들 : 
 ['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']


In [14]:
print(news_data.data[0])

From: egreen@east.sun.com (Ed Green - Pixel Cruncher)
Subject: Re: Observation re: helmets
Organization: Sun Microsystems, RTP, NC
Lines: 21
Distribution: world
Reply-To: egreen@east.sun.com
NNTP-Posting-Host: laser.east.sun.com

In article 211353@mavenry.altcit.eskimo.com, maven@mavenry.altcit.eskimo.com (Norman Hamer) writes:
> 
> The question for the day is re: passenger helmets, if you don't know for 
>certain who's gonna ride with you (like say you meet them at a .... church 
>meeting, yeah, that's the ticket)... What are some guidelines? Should I just 
>pick up another shoei in my size to have a backup helmet (XL), or should I 
>maybe get an inexpensive one of a smaller size to accomodate my likely 
>passenger? 

If your primary concern is protecting the passenger in the event of a
crash, have him or her fitted for a helmet that is their size.  If your
primary concern is complying with stupid helmet laws, carry a real big
spare (you can put a big or small head in a big helmet, bu

In [15]:
#학습용 데이터의 내용만 추출
train_news = fetch_20newsgroups(subset = 'train', remove=('headers','footers','quotes'),random_state=156)

X_train = train_news.data
y_train = train_news.target

In [16]:
#테스트 데이터의 내용만 추출
test_news = fetch_20newsgroups(subset = 'test',remove=('headers','footers','quotes'),random_state=156)

X_test = test_news.data
y_test = test_news.target

print("학습 데이터 크기 : {}".format(len(train_news.data)))
print("테스트 데이터 크기 : {}".format(len(test_news.data)))

학습 데이터 크기 : 11314
테스트 데이터 크기 : 7532


## 피처 벡터화 변환과 머신러닝 모델 학습/예측/평가

In [17]:
from sklearn.feature_extraction.text import CountVectorizer

#피처 벡터화 변환 수행
cnt_vect = CountVectorizer()
cnt_vect.fit(X_train)
X_train_cnt_vect = cnt_vect.transform(X_train)

#학습데이터로 생성된 CountVectirizer를 이용해 테스트 데이터 피처 벡터화 변환
X_test_cnt_vect = cnt_vect.transform(X_test)


print("학습 데이터 CountVectorizer Shape : ",X_train_cnt_vect.shape)

학습 데이터 CountVectorizer Shape :  (11314, 101631)


#### Count 기반

In [18]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

#로지스틱회귀분석으로 학습
lr = LogisticRegression()
lr.fit(X_train_cnt_vect,y_train)

#예측
lr_pred = lr.predict(X_test_cnt_vect)

#평가 
print('CountVectorized Logistic Regression 예측 정확도 : {0:3f}'.format(accuracy_score(y_test,lr_pred)))

CountVectorized Logistic Regression 예측 정확도 : 0.607408


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


#### TF-IDF 기반

In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer

#TF-IDF 벡터화 적용
tfidf_vect = TfidfVectorizer()
tfidf_vect.fit(X_train)
X_train_tfidf_vect = tfidf_vect.transform(X_train)
X_test_tfidf_vect = tfidf_vect.transform(X_test)

In [20]:
#로지스틱회귀 적용
lr = LogisticRegression()
lr.fit(X_train_tfidf_vect,y_train)
pred = lr.predict(X_test_tfidf_vect)
print('TF-IDF Logistic Regression 예측 정확도 : {0:3f}'.format(accuracy_score(y_test,pred)))

TF-IDF Logistic Regression 예측 정확도 : 0.673659


In [21]:
#stop words 필터링 추가 후, n_gram을 (1,2)로 변경
tfidf_vect = TfidfVectorizer(stop_words='english',ngram_range = (1,2),max_df = 300)
tfidf_vect.fit(X_train)
X_train_tfidf_vect = tfidf_vect.transform(X_train)
X_test_tfidf_vect = tfidf_vect.transform(X_test)

lr = LogisticRegression()
lr.fit(X_train_tfidf_vect,y_train)

pred = lr.predict(X_test_tfidf_vect)
print("TF-IDF 로지스틱 회귀의 예측 정확도 : {0:.3f}".format(accuracy_score(y_test,pred)))

TF-IDF 로지스틱 회귀의 예측 정확도 : 0.692


#### GridSearch

In [22]:
from sklearn.model_selection import GridSearchCV

#최적 C값 도출 튜닝 수행 및 CV는 3 폴드 셋 진행

params = {'C':[0.01,0.1,1,5,10]}
grid_cv_lr = GridSearchCV(lr,param_grid = params,cv=3, scoring = 'accuracy',verbose=1)
grid_cv_lr.fit(X_train_tfidf_vect,y_train)
print("로지스틱 회귀의 best C Parameter : ",grid_cv_lr.best_params_)

#최적 C값으로 학습된 grid_cv로 예측 및 정확도 평가
pred = grid_cv_lr.predict(X_test_tfidf_vect)
print("TF-IDF Vectorized Logistic Regression의 정확도 : {0:3f}".format(accuracy_score(y_test,pred)))

Fitting 3 folds for each of 5 candidates, totalling 15 fits


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

로지스틱 회귀의 best C Parameter :  {'C': 10}
TF-IDF Vectorized Logistic Regression의 정확도 : 0.701009


## 사이킷런 파이프라인 사용 및 GridSearchCV와의 결합

In [26]:
from sklearn.pipeline import Pipeline

#TfidfVecorizer 객체를 tfidf_vect로, 로지스틱 회귀 객체를 lr로 생성하는 pipeline

pipeline = Pipeline([('tfidf_vect',TfidfVectorizer(stop_words = 'english',ngram_range = (1,2),max_df=300)),
                    ('lr',LogisticRegression(C=10))])

In [27]:
#별도의 TfidfVectorizer객체의 fit, transform과 LogisticRegression의 fit,predict가 필요없음
#pipeline의 fit과 predict만으로 한번에 피처 벡터화와 ML 학습/예측 가능

pipeline.fit(X_train,y_train)
pred = pipeline.predict(X_test)
print("pipeline을 통한 Logistic Regression의 예측 정확도 : {0:.3f}".format(accuracy_score(y_test,pred)))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


pipeline을 통한 Logistic Regression의 예측 정확도 : 0.701


In [None]:
from sklearn.pipeline import Pipeline

pipeline = Pipeline([('tfidf_vect',TfidfVectorizer(stop_words='english')),
                    ('lr',LogisticRegression())])

#Pipeline의 각각의 객체 변수에 언더바(_) 2개를 연달아 붙여 GridSearchCV에 사용될 파라미터,하이퍼파라미터 이름과 값 설정
params = {'tfidf_vect__ngram_range':[(1,1),(1,2),(1,3)],
         'tfidf_vect__max_df': [100,300,700],
         'lr__C':[1,5,10]}


#GridSearchCV의 생성자에 Estimator가 아닌 Pipeline객체 입력
grid_cv_pipe = GridSearchCV(pipeline,param_grid=params,cv=3,scoring='accuracy',verbose=1)
grid_cv_pipe.fit(X_train,y_train)
print(grid_cv_pipe.best_params_,grid_cv_pipe.best_score_)

pred = grid_cv_pipe.predict(X_test)
print('Pipeline을 통한 Logistic Regression의 예측 정확도 : {0:.3f}'.format(accuracy_score(y_test,pred)))

Fitting 3 folds for each of 27 candidates, totalling 81 fits
