In [5]:
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

In [8]:
from sklearn.model_selection import KFold, train_test_split

In [6]:
# Work with a four-class subset of 20 Newsgroups to keep the examples fast.
categories = ['alt.atheism', 'soc.religion.christian', 'comp.graphics', 'sci.med']

# shuffle=True with a fixed random_state gives a reproducible ordering of the posts.
twenty_train = fetch_20newsgroups(subset='train', categories=categories, shuffle=True, random_state=42)

# Store the raw documents as an object array: it still supports the
# integer-array indexing used by the fold loops, but avoids NumPy's
# fixed-width unicode dtype, which would pad every document to the length of
# the longest post in the corpus.
x = np.array(twenty_train.data, dtype=object)
y = np.array(twenty_train.target)

In [7]:
# Text-classification pipeline: raw documents -> token counts -> TF-IDF
# weights -> multinomial naive Bayes. Keeping the feature extraction inside
# the pipeline means it is fitted together with the classifier on whatever
# data is passed to text_clf.fit().
text_clf = Pipeline(
    steps=[
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', MultinomialNB()),
    ]
)

In [11]:
# Ten consecutive, non-shuffled folds; the corpus was already shuffled (with a
# fixed seed) when it was loaded, so no extra shuffling is requested here.
kf = KFold(n_splits=10, shuffle=False)

In [26]:
# Manual K-fold cross-validation: fit the pipeline on each training split and
# record its accuracy on both the training and the held-out validation split.
scores = []
for train_index, val_index in kf.split(x):
    # Carve this fold's training and validation subsets out of the corpus.
    x_train, y_train = x[train_index], y[train_index]
    x_val, y_val = x[val_index], y[val_index]

    # Fit on the training portion only, then score both portions.
    text_clf.fit(x_train, y_train)
    training_score = text_clf.score(x_train, y_train)
    validation_score = text_clf.score(x_val, y_val)

    fold_result = (training_score, validation_score)
    print("Training Score: {:0.3f}       Validation Score: {:0.3f}".format(*fold_result))
    scores.append(fold_result)

# One row per fold: column 0 holds training scores, column 1 validation scores.
scores = np.array(scores)
print("Training Mean: {:0.3f}     Validation Mean: {:0.3f}".format(scores[:, 0].mean(), scores[:, 1].mean()))

Training Score: 0.961       Validation Score: 0.894
Training Score: 0.965       Validation Score: 0.912
Training Score: 0.966       Validation Score: 0.947
Training Score: 0.965       Validation Score: 0.925
Training Score: 0.961       Validation Score: 0.898
Training Score: 0.966       Validation Score: 0.925
Training Score: 0.956       Validation Score: 0.938
Training Score: 0.963       Validation Score: 0.916
Training Score: 0.957       Validation Score: 0.902
Training Score: 0.961       Validation Score: 0.929
Training Mean: 0.962     Validation Mean: 0.918


### Cross Validation Method in Sklearn

In [1]:
from sklearn.model_selection import cross_val_score

In [11]:
# Let scikit-learn run the 10-fold loop: one validation score per fold.
# Per the scikit-learn docs, an integer `cv` with a classifier uses stratified
# folds, so the splits are not the same as the plain KFold splits used in the
# manual loop.
validation_scores = cross_val_score(estimator=text_clf, X=x, y=y, cv=10)
validation_scores

array([ 0.89427313,  0.91629956,  0.92951542,  0.92951542,  0.90222222,
        0.90666667,  0.92      ,  0.94222222,  0.92888889,  0.93303571])

In [12]:
# Average validation score across the ten folds.
np.mean(validation_scores)

0.92026392385147882

### Exercises

#### 1) Perform the cross validation on the movie review dataset. What is the validation score?

#### 2) Instead of K-fold cross validation, what is the validation score when using ShuffleSplit?
####    What is the difference between ShuffleSplit and K-fold cross validation?

In [13]:
from sklearn.model_selection import ShuffleSplit

#### 3) What is the validation score on the movie review dataset when using ShuffleSplit?