In [1]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn import metrics
from sklearn.model_selection import StratifiedKFold, KFold, GridSearchCV

# 0. Data Loading and processing

In [2]:
# Load training set and test set
categories = ['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space']
newsgroups_train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'),
                                      categories=categories)
newsgroups_test  = fetch_20newsgroups(subset='test', remove=('headers', 'footers', 'quotes'),
                                      categories=categories)
X_train = newsgroups_train.data
Y_train = newsgroups_train.target
X_test  = newsgroups_test.data
Y_test  = newsgroups_test.target

In [3]:
# Declare two vectorizers
count_vectorizer = CountVectorizer(min_df=40)
tfidf_vectorizer = TfidfVectorizer(min_df=40)

In [4]:
# Fitting vectorizers to the training set
count_vectorizer = count_vectorizer.fit(X_train)
tfidf_vectorizer = tfidf_vectorizer.fit(X_train)

In [5]:
# Transform X_train and X_test using 2 vectorizers
X_train_count = count_vectorizer.transform(X_train)
X_train_tfidf = tfidf_vectorizer.transform(X_train)
X_test_count  = count_vectorizer.transform(X_test)
X_test_tfidf  = tfidf_vectorizer.transform(X_test)

In [6]:
X_train_count.shape

(2034, 758)

In [7]:
X_train_tfidf.shape

(2034, 758)

In [8]:
print(X_train_count[0,:])

  (0, 18)	1
  (0, 19)	1
  (0, 29)	1
  (0, 35)	1
  (0, 50)	1
  (0, 55)	2
  (0, 61)	2
  (0, 78)	1
  (0, 87)	1
  (0, 113)	1
  (0, 195)	2
  (0, 244)	6
  (0, 253)	1
  (0, 256)	2
  (0, 260)	1
  (0, 272)	1
  (0, 292)	1
  (0, 301)	1
  (0, 316)	2
  (0, 321)	3
  (0, 329)	2
  (0, 337)	3
  (0, 342)	1
  (0, 354)	1
  (0, 374)	1
  (0, 420)	1
  (0, 453)	1
  (0, 455)	1
  (0, 472)	1
  (0, 503)	1
  (0, 533)	1
  (0, 556)	1
  (0, 559)	1
  (0, 647)	2
  (0, 648)	7
  (0, 649)	1
  (0, 657)	1
  (0, 662)	1
  (0, 671)	4
  (0, 703)	1
  (0, 720)	1
  (0, 729)	1
  (0, 733)	1
  (0, 755)	3
  (0, 756)	1


In [9]:
print(X_train_tfidf[0,:])

  (0, 756)	0.0721155370883
  (0, 755)	0.14831743135
  (0, 733)	0.0552321132408
  (0, 729)	0.0869038419929
  (0, 720)	0.0779953088406
  (0, 703)	0.0899136950299
  (0, 671)	0.144021224217
  (0, 662)	0.0499187101099
  (0, 657)	0.0631328198798
  (0, 649)	0.0804180650654
  (0, 648)	0.23180983825
  (0, 647)	0.0836810249613
  (0, 559)	0.093868888692
  (0, 556)	0.136566528013
  (0, 533)	0.096000080666
  (0, 503)	0.12460572882
  (0, 472)	0.0753113616121
  (0, 455)	0.106203923252
  (0, 453)	0.0524819891906
  (0, 420)	0.13397450603
  (0, 374)	0.0705501151537
  (0, 354)	0.0720485235024
  (0, 342)	0.0423196121842
  (0, 337)	0.122195764007
  (0, 329)	0.210611295123
  (0, 321)	0.122719069678
  (0, 316)	0.109162363642
  (0, 301)	0.117608096826
  (0, 292)	0.0509033565847
  (0, 272)	0.105976622957
  (0, 260)	0.0599235329153
  (0, 256)	0.252864456363
  (0, 253)	0.0458215515453
  (0, 244)	0.675533177371
  (0, 195)	0.153717311712
  (0, 113)	0.0527484422433
  (0, 87)	0.0492024228549
  (0, 78)	0.106433094352

# 1. Fitting classifiers with count vectorizer

In [10]:
# Pre-define options
num_folds = 5
num_instances = len(X_train)
seed = 1234
scoring = 'accuracy'

## 1.1. Logistic Regression
다음과 같은 파라미터를 컨트롤하여 모델링해봅시다.
- regulatization: L1, L2
- C

In [11]:
model = LogisticRegression()

penalty_set = ['l1', 'l2']
C_set = [1, 10]
param_grid = dict(penalty=penalty_set, C=C_set)

In [12]:
# Using count vectorizer
clf = GridSearchCV(estimator=model, param_grid=param_grid, scoring=scoring, cv=num_folds, n_jobs=-1, verbose=1)
clf.fit(X_train_count, Y_train)

Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Done  18 out of  20 | elapsed:   36.4s remaining:    4.0s
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:   39.3s finished


GridSearchCV(cv=5, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'penalty': ['l1', 'l2'], 'C': [1, 10]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='accuracy', verbose=1)

In [13]:
clf.cv_results_

{'mean_fit_time': array([  0.6053515 ,   0.57298069,  34.24835596,   0.85566149]),
 'mean_score_time': array([ 0.00112472,  0.00116949,  0.00052738,  0.0006897 ]),
 'mean_test_score': array([ 0.70550639,  0.70698132,  0.68682399,  0.68485742]),
 'mean_train_score': array([ 0.92834308,  0.95341819,  0.97209889,  0.97308252]),
 'param_C': masked_array(data = [1 1 10 10],
              mask = [False False False False],
        fill_value = ?),
 'param_penalty': masked_array(data = ['l1' 'l2' 'l1' 'l2'],
              mask = [False False False False],
        fill_value = ?),
 'params': ({'C': 1, 'penalty': 'l1'},
  {'C': 1, 'penalty': 'l2'},
  {'C': 10, 'penalty': 'l1'},
  {'C': 10, 'penalty': 'l2'}),
 'rank_test_score': array([2, 1, 3, 4], dtype=int32),
 'split0_test_score': array([ 0.72303922,  0.73284314,  0.70098039,  0.70343137]),
 'split0_train_score': array([ 0.92558426,  0.95448954,  0.97109471,  0.97170972]),
 'split1_test_score': array([ 0.70588235,  0.70098039,  0.70343137,  0.

In [14]:
print("Best params: ", clf.best_params_)
print("Best test", scoring, ': ', clf.best_score_)

Best params:  {'C': 1, 'penalty': 'l2'}
Best test accuracy :  0.706981317601


In [15]:
best_logistic_count = clf.best_estimator_

## 1.2. MLPClssifier
은닉층의 사이즈를 다음과 같이 조절해봅시다.
- 은닉층 1개 (노드 수 = 100)
- 은닉층 2개 (노드 수 = 100)

In [16]:
model = MLPClassifier(learning_rate_init=0.01, max_iter=300)

hidden_layer_sizes_set = [(100,), (100, 100)]
param_grid = dict(hidden_layer_sizes=hidden_layer_sizes_set)

In [17]:
# Using count vectorizer
clf = GridSearchCV(estimator=model, param_grid=param_grid, scoring=scoring, cv=num_folds, n_jobs=-1, verbose=1)
clf.fit(X_train_count, Y_train)

Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    7.3s finished


GridSearchCV(cv=5, error_score='raise',
       estimator=MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.01, max_iter=300, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'hidden_layer_sizes': [(100,), (100, 100)]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='accuracy', verbose=1)

In [18]:
clf.cv_results_

{'mean_fit_time': array([ 5.21074781,  5.4507545 ]),
 'mean_score_time': array([ 0.01862831,  0.01564145]),
 'mean_test_score': array([ 0.71337266,  0.69419862]),
 'mean_train_score': array([ 0.96817161,  0.95083109]),
 'param_hidden_layer_sizes': masked_array(data = [(100,) (100, 100)],
              mask = [False False],
        fill_value = ?),
 'params': ({'hidden_layer_sizes': (100,)},
  {'hidden_layer_sizes': (100, 100)}),
 'rank_test_score': array([1, 2], dtype=int32),
 'split0_test_score': array([ 0.70833333,  0.7254902 ]),
 'split0_train_score': array([ 0.97416974,  0.97539975]),
 'split1_test_score': array([ 0.73529412,  0.67892157]),
 'split1_train_score': array([ 0.9698647 ,  0.96494465]),
 'split2_test_score': array([ 0.70761671,  0.63390663]),
 'split2_train_score': array([ 0.97541487,  0.86170867]),
 'split3_test_score': array([ 0.7044335 ,  0.70935961]),
 'split3_train_score': array([ 0.97481572,  0.97481572]),
 'split4_test_score': array([ 0.71111111,  0.72345679]),
 '

In [19]:
print("Best params: ", clf.best_params_)
print("Best test", scoring, ': ', clf.best_score_)

Best params:  {'hidden_layer_sizes': (100,)}
Best test accuracy :  0.7133726647


In [20]:
best_mlp_count = clf.best_estimator_

## 1.3. 두 모델의 비교
Logistic regression에서 가장 성능이 좋은 모델과 MLP에서 가장 성능이 좋은 모델을 선택하여 테스트 데이터에 대한 성능 비교

In [22]:
best_models_count = []
best_models_count.append(('LogisticRegression', best_logistic_count))
best_models_count.append(('MLPClassifier', best_mlp_count))

In [23]:
results = []
scores  = []
names   = []
for name, model in best_models_count:
    Y_test_hat = model.predict(X_test_count)
    results.append(metrics.confusion_matrix(Y_test, Y_test_hat))
    scores.append(metrics.accuracy_score(Y_test, Y_test_hat))
    names.append(name)

In [24]:
for name, score, cm in list(zip(names, scores, results)):
    print('\n[%s]' % name)
    print('- test accuracy: %f' % score)
    print('- confusion matrix :\n', cm)


[LogisticRegression]
- test accuracy: 0.660754
- confusion matrix :
 [[170  17  42  90]
 [ 20 310  46  13]
 [ 40  38 298  18]
 [ 88  19  28 116]]

[MLPClassifier]
- test accuracy: 0.645972
- confusion matrix :
 [[174  20  32  93]
 [ 25 307  44  13]
 [ 40  41 271  42]
 [ 97  17  15 122]]


# 2. Fitting classifiers with tf-idf vectorizer

In [25]:
# Pre-define options
num_folds = 5
num_instances = len(X_train)
seed = 1234
scoring = 'accuracy'

## 2.1. Logistic Regression
다음과 같은 파라미터를 컨트롤하여 모델링해봅시다.
- regulatization: L1, L2
- C

In [45]:
model = LogisticRegression()

penalty_set = ['l1', 'l2']
C_set = [0.1, 1, 10, 100]
param_grid = dict(penalty=penalty_set, C=C_set)

In [27]:
# Using count vectorizer
clf = GridSearchCV(estimator=model, param_grid=param_grid, scoring=scoring, cv=num_folds, n_jobs=-1, verbose=1)
clf.fit(X_train_tfidf, Y_train)

Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Done  18 out of  20 | elapsed:    0.3s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:    0.3s finished


GridSearchCV(cv=5, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'penalty': ['l1', 'l2'], 'C': [1, 10]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='accuracy', verbose=1)

In [28]:
clf.cv_results_

{'mean_fit_time': array([ 0.09383607,  0.11490035,  0.11884079,  0.11128392]),
 'mean_score_time': array([ 0.00117226,  0.00099206,  0.00084963,  0.00052657]),
 'mean_test_score': array([ 0.69469027,  0.73107178,  0.71976401,  0.73254671]),
 'mean_train_score': array([ 0.78822366,  0.85533489,  0.97197657,  0.94997574]),
 'param_C': masked_array(data = [1 1 10 10],
              mask = [False False False False],
        fill_value = ?),
 'param_penalty': masked_array(data = ['l1' 'l2' 'l1' 'l2'],
              mask = [False False False False],
        fill_value = ?),
 'params': ({'C': 1, 'penalty': 'l1'},
  {'C': 1, 'penalty': 'l2'},
  {'C': 10, 'penalty': 'l1'},
  {'C': 10, 'penalty': 'l2'}),
 'rank_test_score': array([4, 2, 3, 1], dtype=int32),
 'split0_test_score': array([ 0.70588235,  0.75490196,  0.73284314,  0.75      ]),
 'split0_train_score': array([ 0.78474785,  0.85670357,  0.96924969,  0.94649446]),
 'split1_test_score': array([ 0.68382353,  0.71323529,  0.71568627,  0.7303

In [29]:
print("Best params: ", clf.best_params_)
print("Best test", scoring, ': ', clf.best_score_)

Best params:  {'C': 10, 'penalty': 'l2'}
Best test accuracy :  0.732546705998


In [30]:
best_logistic_tfidf = clf.best_estimator_

## 2.2. MLPClssifier
은닉층의 사이즈를 다음과 같이 조절해봅시다.
- 은닉층 1개 (노드 수 = 100)
- 은닉층 2개 (노드 수 = 100)

In [31]:
model = MLPClassifier(learning_rate_init=0.01, max_iter=300)

hidden_layer_sizes_set = [(100,), (100, 100)]
param_grid = dict(hidden_layer_sizes=hidden_layer_sizes_set)

In [32]:
# Using count vectorizer
clf = GridSearchCV(estimator=model, param_grid=param_grid, scoring=scoring, cv=num_folds, n_jobs=-1, verbose=1)
clf.fit(X_train_tfidf, Y_train)

Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    5.5s finished


GridSearchCV(cv=5, error_score='raise',
       estimator=MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.01, max_iter=300, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'hidden_layer_sizes': [(100,), (100, 100)]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='accuracy', verbose=1)

In [33]:
clf.cv_results_

{'mean_fit_time': array([ 4.67469234,  3.7807632 ]),
 'mean_score_time': array([ 0.00812488,  0.01117382]),
 'mean_test_score': array([ 0.72173058,  0.72271386]),
 'mean_train_score': array([ 0.97603266,  0.97603266]),
 'param_hidden_layer_sizes': masked_array(data = [(100,) (100, 100)],
              mask = [False False],
        fill_value = ?),
 'params': ({'hidden_layer_sizes': (100,)},
  {'hidden_layer_sizes': (100, 100)}),
 'rank_test_score': array([2, 1], dtype=int32),
 'split0_test_score': array([ 0.73039216,  0.73284314]),
 'split0_train_score': array([ 0.97539975,  0.97539975]),
 'split1_test_score': array([ 0.70588235,  0.70588235]),
 'split1_train_score': array([ 0.97785978,  0.97785978]),
 'split2_test_score': array([ 0.71253071,  0.72481572]),
 'split2_train_score': array([ 0.9760295,  0.9760295]),
 'split3_test_score': array([ 0.72660099,  0.71428571]),
 'split3_train_score': array([ 0.97420147,  0.97420147]),
 'split4_test_score': array([ 0.73333333,  0.73580247]),
 'sp

In [34]:
print("Best params: ", clf.best_params_)
print("Best test", scoring, ': ', clf.best_score_)

Best params:  {'hidden_layer_sizes': (100, 100)}
Best test accuracy :  0.722713864307


In [35]:
best_mlp_tfidf = clf.best_estimator_

## 2.3. 두 모델의 비교
Logistic regression에서 가장 성능이 좋은 모델과 MLP에서 가장 성능이 좋은 모델을 선택하여 테스트 데이터에 대한 성능 비교

In [39]:
best_models = []
best_models.append(('LogisticRegression', best_logistic_tfidf))
best_models.append(('MLPClassifier', best_mlp_tfidf))

In [40]:
results = []
scores  = []
names   = []
for name, model in best_models:
    Y_test_hat = model.predict(X_test_tfidf)
    results.append(metrics.confusion_matrix(Y_test, Y_test_hat))
    scores.append(metrics.accuracy_score(Y_test, Y_test_hat))
    names.append(name)

In [41]:
for name, score, cm in list(zip(names, scores, results)):
    print('\n[%s]' % name)
    print('- test accuracy: %f' % score)
    print('- confusion matrix :\n', cm)


[LogisticRegression]
- test accuracy: 0.691796
- confusion matrix :
 [[178  15  44  82]
 [ 12 319  47  11]
 [ 32  32 313  17]
 [ 83  13  29 126]]

[MLPClassifier]
- test accuracy: 0.667406
- confusion matrix :
 [[158  20  26 115]
 [ 16 318  40  15]
 [ 30  35 278  51]
 [ 72  14  16 149]]
