In [1]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn import metrics
from sklearn.model_selection import StratifiedKFold, KFold, GridSearchCV

In [2]:
# Load training set and test set
categories = ['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space']
newsgroups_train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'),
                                      categories=categories)
newsgroups_test  = fetch_20newsgroups(subset='test', remove=('headers', 'footers', 'quotes'),
                                      categories=categories)
X_train = newsgroups_train.data
Y_train = newsgroups_train.target
X_test  = newsgroups_test.data
Y_test  = newsgroups_test.target

In [3]:
print(X_test[0:3])
print(Y_test[0:3])

['TRry the SKywatch project in  Arizona.', 'The Vatican library recently made a tour of the US.\n Can anyone help me in finding a FTP site where this collection is \n available.', 'Hi there,\n\nI am here looking for some help.\n\nMy friend is a interior decor designer. He is from Thailand. He is\ntrying to find some graphics software on PC. Any suggestion on which\nsoftware to buy,where to buy and how much it costs ? He likes the most\nsophisticated \nsoftware(the more features it has,the better)']
[2 1 1]


In [4]:
# Declare two vectorizers
count_vectorizer = CountVectorizer(min_df=40)
tfidf_vectorizer = TfidfVectorizer(min_df=40)

In [5]:
# Fitting vectorizers to the training set
count_vectorizer = count_vectorizer.fit(X_train)
tfidf_vectorizer = tfidf_vectorizer.fit(X_train)

In [6]:
# Transform X_train and X_test using 2 vectorizers
X_train_count = count_vectorizer.transform(X_train)
X_train_tfidf = tfidf_vectorizer.transform(X_train)
X_test_count  = count_vectorizer.transform(X_test)
X_test_tfidf  = tfidf_vectorizer.transform(X_test)

In [7]:
X_train_count.shape

(2034, 758)

In [8]:
X_train_tfidf.shape

(2034, 758)

In [9]:
print(X_train_count[0,:])

  (0, 18)	1
  (0, 19)	1
  (0, 29)	1
  (0, 35)	1
  (0, 50)	1
  (0, 55)	2
  (0, 61)	2
  (0, 78)	1
  (0, 87)	1
  (0, 113)	1
  (0, 195)	2
  (0, 244)	6
  (0, 253)	1
  (0, 256)	2
  (0, 260)	1
  (0, 272)	1
  (0, 292)	1
  (0, 301)	1
  (0, 316)	2
  (0, 321)	3
  (0, 329)	2
  (0, 337)	3
  (0, 342)	1
  (0, 354)	1
  (0, 374)	1
  (0, 420)	1
  (0, 453)	1
  (0, 455)	1
  (0, 472)	1
  (0, 503)	1
  (0, 533)	1
  (0, 556)	1
  (0, 559)	1
  (0, 647)	2
  (0, 648)	7
  (0, 649)	1
  (0, 657)	1
  (0, 662)	1
  (0, 671)	4
  (0, 703)	1
  (0, 720)	1
  (0, 729)	1
  (0, 733)	1
  (0, 755)	3
  (0, 756)	1


In [10]:
print(X_train_tfidf[0,:])

  (0, 756)	0.0721155370883
  (0, 755)	0.14831743135
  (0, 733)	0.0552321132408
  (0, 729)	0.0869038419929
  (0, 720)	0.0779953088406
  (0, 703)	0.0899136950299
  (0, 671)	0.144021224217
  (0, 662)	0.0499187101099
  (0, 657)	0.0631328198798
  (0, 649)	0.0804180650654
  (0, 648)	0.23180983825
  (0, 647)	0.0836810249613
  (0, 559)	0.093868888692
  (0, 556)	0.136566528013
  (0, 533)	0.096000080666
  (0, 503)	0.12460572882
  (0, 472)	0.0753113616121
  (0, 455)	0.106203923252
  (0, 453)	0.0524819891906
  (0, 420)	0.13397450603
  (0, 374)	0.0705501151537
  (0, 354)	0.0720485235024
  (0, 342)	0.0423196121842
  (0, 337)	0.122195764007
  (0, 329)	0.210611295123
  (0, 321)	0.122719069678
  (0, 316)	0.109162363642
  (0, 301)	0.117608096826
  (0, 292)	0.0509033565847
  (0, 272)	0.105976622957
  (0, 260)	0.0599235329153
  (0, 256)	0.252864456363
  (0, 253)	0.0458215515453
  (0, 244)	0.675533177371
  (0, 195)	0.153717311712
  (0, 113)	0.0527484422433
  (0, 87)	0.0492024228549
  (0, 78)	0.106433094352

# 1. Fitting classifiers with count vectorizer

In [11]:
# Pre-define options
num_folds = 5
num_instances = len(X_train)
seed = 1234
scoring = 'accuracy'

## 1.1. Logistic Regression
다음과 같은 파라미터를 컨트롤하여 모델링해봅시다.
- regulatization: L1, L2
- C

In [12]:
model = LogisticRegression()

penalty_set = ['l1', 'l2']
C_set = [1, 10]
param_grid = dict(penalty=penalty_set, C=C_set)

In [13]:
# Using count vectorizer

clf = GridSearchCV(estimator=model, param_grid=param_grid, scoring=scoring, cv=num_folds, n_jobs=-1, verbose=1)
# clf = GridSearchCV(estimator=model, param_grid=param_grid, scoring=scoring, cv=num_folds, n_jobs=4, verbose=1)
clf.fit(X_train_count, Y_train)

Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=4)]: Done  20 out of  20 | elapsed:  5.2min finished


GridSearchCV(cv=5, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=4,
       param_grid={'penalty': ['l1', 'l2'], 'C': [1, 10]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='accuracy', verbose=1)

In [14]:
clf.cv_results_

{'mean_fit_time': array([   2.3607635 ,    1.71480956,  227.77055197,    1.27476921]),
 'mean_score_time': array([ 0.00256724,  0.0089066 ,  0.00244021,  0.00088606]),
 'mean_test_score': array([ 0.70648968,  0.70698132,  0.6833825 ,  0.68731563]),
 'mean_train_score': array([ 0.92723698,  0.95341819,  0.97209881,  0.97308252]),
 'param_C': masked_array(data = [1 1 10 10],
              mask = [False False False False],
        fill_value = ?),
 'param_penalty': masked_array(data = ['l1' 'l2' 'l1' 'l2'],
              mask = [False False False False],
        fill_value = ?),
 'params': ({'C': 1, 'penalty': 'l1'},
  {'C': 1, 'penalty': 'l2'},
  {'C': 10, 'penalty': 'l1'},
  {'C': 10, 'penalty': 'l2'}),
 'rank_test_score': array([2, 1, 4, 3], dtype=int32),
 'split0_test_score': array([ 0.72303922,  0.73284314,  0.69117647,  0.70343137]),
 'split0_train_score': array([ 0.92558426,  0.95448954,  0.9704797 ,  0.97170972]),
 'split1_test_score': array([ 0.70098039,  0.70098039,  0.69852941,

In [15]:
print("Best params: ", clf.best_params_)
print("Best test", scoring, ': ', clf.best_score_)

Best params:  {'C': 1, 'penalty': 'l2'}
Best test accuracy :  0.706981317601


In [16]:
best_logistic_count = clf.best_estimator_

## 1.2. MLPClssifier
은닉층의 사이즈를 조절
- 은닉층 1개 (노드 수 = 100)
- 은닉층 2개 (노드 수 = 100)

In [17]:
model = MLPClassifier(learning_rate_init=0.01, max_iter=300)

hidden_layer_sizes_set = [(100,), (100, 100)]
param_grid = dict(hidden_layer_sizes=hidden_layer_sizes_set)

In [19]:
# Using count vectorizer
clf = GridSearchCV(estimator=model, param_grid=param_grid, scoring=scoring, cv=num_folds, n_jobs=-1, verbose=1)
clf.fit(X_train_count, Y_train)

Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   11.5s finished


GridSearchCV(cv=5, error_score='raise',
       estimator=MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.01, max_iter=300, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'hidden_layer_sizes': [(100,), (100, 100)]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='accuracy', verbose=1)

In [20]:
clf.cv_results_

{'mean_fit_time': array([ 1.60793633,  2.81322122]),
 'mean_score_time': array([ 0.00458269,  0.01007352]),
 'mean_test_score': array([ 0.71042281,  0.71927237]),
 'mean_train_score': array([ 0.97271533,  0.97480514]),
 'param_hidden_layer_sizes': masked_array(data = [(100,) (100, 100)],
              mask = [False False],
        fill_value = ?),
 'params': ({'hidden_layer_sizes': (100,)},
  {'hidden_layer_sizes': (100, 100)}),
 'rank_test_score': array([2, 1], dtype=int32),
 'split0_test_score': array([ 0.70588235,  0.73529412]),
 'split0_train_score': array([ 0.97539975,  0.97539975]),
 'split1_test_score': array([ 0.72794118,  0.71323529]),
 'split1_train_score': array([ 0.97724477,  0.97724477]),
 'split2_test_score': array([ 0.71498771,  0.71253071]),
 'split2_train_score': array([ 0.97111248,  0.97787339]),
 'split3_test_score': array([ 0.69211823,  0.72906404]),
 'split3_train_score': array([ 0.96621622,  0.97420147]),
 'split4_test_score': array([ 0.71111111,  0.70617284]),
 '

In [21]:
print("Best params: ", clf.best_params_)
print("Best test", scoring, ': ', clf.best_score_)

Best params:  {'hidden_layer_sizes': (100, 100)}
Best test accuracy :  0.719272369715


In [22]:
best_mlp_count = clf.best_estimator_

## 1.3. 두 모델의 비교
Logistic regression에서 가장 성능이 좋은 모델과 MLP에서 가장 성능이 좋은 모델을 선택하여 테스트 데이터에 대한 성능 비교

In [23]:
best_models_count = []
best_models_count.append(('LogisticRegression', best_logistic_count))
best_models_count.append(('MLPClassifier', best_mlp_count))

In [24]:
results = []
scores  = []
names   = []
for name, model in best_models_count:
    Y_test_hat = model.predict(X_test_count)
    results.append(metrics.confusion_matrix(Y_test, Y_test_hat))
    scores.append(metrics.accuracy_score(Y_test, Y_test_hat))
    names.append(name)

In [25]:
for name, score, cm in list(zip(names, scores, results)):
    print('\n[%s]' % name)
    print('- test accuracy: %f' % score)
    print('- confusion matrix :\n', cm)


[LogisticRegression]
- test accuracy: 0.660754
- confusion matrix :
 [[170  17  42  90]
 [ 19 310  47  13]
 [ 40  38 298  18]
 [ 88  19  28 116]]

[MLPClassifier]
- test accuracy: 0.642276
- confusion matrix :
 [[212   9  28  70]
 [ 21 295  52  21]
 [ 62  28 270  34]
 [134  13  12  92]]


# 2. Fitting classifiers with tf-idf vectorizer

In [26]:
# Pre-define options
num_folds = 5
num_instances = len(X_train)
seed = 1234
scoring = 'accuracy'

## 2.1. Logistic Regression
다음과 같은 파라미터를 컨트롤하여 모델링해봅시다.
- regulatization: L1, L2
- C

In [27]:
model = LogisticRegression()

penalty_set = ['l1', 'l2']
C_set = [1, 10]
param_grid = dict(penalty=penalty_set, C=C_set)

In [28]:
# Using count vectorizer

clf = GridSearchCV(estimator=model, param_grid=param_grid, scoring=scoring, cv=num_folds, n_jobs=-1, verbose=1)
clf.fit(X_train_count, Y_train)

Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:  3.2min finished


GridSearchCV(cv=5, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'penalty': ['l1', 'l2'], 'C': [1, 10]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='accuracy', verbose=1)

In [29]:
clf.cv_results_

{'mean_fit_time': array([  0.55976553,   0.41338439,  71.22979593,   0.62612696]),
 'mean_score_time': array([ 0.0009469 ,  0.00082145,  0.000807  ,  0.00080428]),
 'mean_test_score': array([ 0.70698132,  0.70698132,  0.68682399,  0.68731563]),
 'mean_train_score': array([ 0.92797431,  0.95341819,  0.97209881,  0.97308252]),
 'param_C': masked_array(data = [1 1 10 10],
              mask = [False False False False],
        fill_value = ?),
 'param_penalty': masked_array(data = ['l1' 'l2' 'l1' 'l2'],
              mask = [False False False False],
        fill_value = ?),
 'params': ({'C': 1, 'penalty': 'l1'},
  {'C': 1, 'penalty': 'l2'},
  {'C': 10, 'penalty': 'l1'},
  {'C': 10, 'penalty': 'l2'}),
 'rank_test_score': array([1, 1, 4, 3], dtype=int32),
 'split0_test_score': array([ 0.7254902 ,  0.73284314,  0.69852941,  0.70343137]),
 'split0_train_score': array([ 0.92681427,  0.95448954,  0.9704797 ,  0.97170972]),
 'split1_test_score': array([ 0.70833333,  0.70098039,  0.69607843,  0.

In [30]:
print("Best params: ", clf.best_params_)
print("Best test", scoring, ': ', clf.best_score_)

Best params:  {'C': 1, 'penalty': 'l1'}
Best test accuracy :  0.706981317601


In [31]:
best_logistic_count = clf.best_estimator_

## 2.2. MLPClssifier
은닉층의 사이즈를 조절
- 은닉층 1개 (노드 수 = 100)
- 은닉층 2개 (노드 수 = 100)

In [32]:
model = MLPClassifier(learning_rate_init=0.01, max_iter=300)

hidden_layer_sizes_set = [(100,), (100, 100)]
param_grid = dict(hidden_layer_sizes=hidden_layer_sizes_set)

In [33]:
# Using count vectorizer
clf = GridSearchCV(estimator=model, param_grid=param_grid, scoring=scoring, cv=num_folds, n_jobs=-1, verbose=1)
clf.fit(X_train_tfidf, Y_train)

Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:   12.2s finished


GridSearchCV(cv=5, error_score='raise',
       estimator=MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.01, max_iter=300, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False),
       fit_params={}, iid=True, n_jobs=4,
       param_grid={'hidden_layer_sizes': [(100,), (100, 100)]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='accuracy', verbose=1)

In [34]:
clf.cv_results_

{'mean_fit_time': array([ 4.26198258,  4.68381529]),
 'mean_score_time': array([ 0.00815945,  0.01067963]),
 'mean_test_score': array([ 0.72222222,  0.72271386]),
 'mean_train_score': array([ 0.96976285,  0.97578726]),
 'param_hidden_layer_sizes': masked_array(data = [(100,) (100, 100)],
              mask = [False False],
        fill_value = ?),
 'params': ({'hidden_layer_sizes': (100,)},
  {'hidden_layer_sizes': (100, 100)}),
 'rank_test_score': array([2, 1], dtype=int32),
 'split0_test_score': array([ 0.72058824,  0.74754902]),
 'split0_train_score': array([ 0.97293973,  0.97539975]),
 'split1_test_score': array([ 0.74264706,  0.71078431]),
 'split1_train_score': array([ 0.97293973,  0.97785978]),
 'split2_test_score': array([ 0.6977887 ,  0.72235872]),
 'split2_train_score': array([ 0.95328826,  0.97787339]),
 'split3_test_score': array([ 0.71674877,  0.71674877]),
 'split3_train_score': array([ 0.97420147,  0.97174447]),
 'split4_test_score': array([ 0.73333333,  0.71604938]),
 '

In [35]:
print("Best params: ", clf.best_params_)
print("Best test", scoring, ': ', clf.best_score_)

Best params:  {'hidden_layer_sizes': (100, 100)}
Best test accuracy :  0.722713864307


In [36]:
best_mlp_count = clf.best_estimator_

## 2.3. 두 모델의 비교
Logistic regression에서 가장 성능이 좋은 모델과 MLP에서 가장 성능이 좋은 모델을 선택하여 테스트 데이터에 대한 성능 비교

In [37]:
best_models_count = []
best_models_count.append(('LogisticRegression', best_logistic_count))
best_models_count.append(('MLPClassifier', best_mlp_count))

In [38]:
results = []
scores  = []
names   = []
for name, model in best_models_count:
    Y_test_hat = model.predict(X_test_count)
    results.append(metrics.confusion_matrix(Y_test, Y_test_hat))
    scores.append(metrics.accuracy_score(Y_test, Y_test_hat))
    names.append(name)

In [39]:
for name, score, cm in list(zip(names, scores, results)):
    print('\n[%s]' % name)
    print('- test accuracy: %f' % score)
    print('- confusion matrix :\n', cm)


[LogisticRegression]
- test accuracy: 0.669623
- confusion matrix :
 [[166  16  46  91]
 [ 18 317  45   9]
 [ 37  29 313  15]
 [ 97  21  23 110]]

[MLPClassifier]
- test accuracy: 0.658537
- confusion matrix :
 [[181  25  29  84]
 [ 14 315  48  12]
 [ 31  53 286  24]
 [102  28  12 109]]
