In [3]:
from sklearn import svm
from sklearn import datasets
import pickle
from sklearn.externals import joblib
import numpy as np
from sklearn import random_projection
from sklearn.svm import SVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
#http://scikit-learn.org/stable/tutorial/basic/tutorial.html
#http://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html

In [4]:
iris = datasets.load_iris()
digits = datasets.load_digits()

In [5]:
print(digits.data)

[[ 0.  0.  5. ...  0.  0.  0.]
 [ 0.  0.  0. ... 10.  0.  0.]
 [ 0.  0.  0. ... 16.  9.  0.]
 ...
 [ 0.  0.  1. ...  6.  0.  0.]
 [ 0.  0.  2. ... 12.  0.  0.]
 [ 0.  0. 10. ... 12.  1.  0.]]


In [6]:
digits.target

array([0, 1, 2, ..., 8, 9, 8])

Learning and predicting

In [7]:
#an estimator for classification is a Python object that implements the methods fit(X, y) and predict(T).
clf = svm.SVC(gamma=0.001, C=100.)
#In this example we set the value of gamma manually. It is possible to automatically find good values for 
#the parameters by using tools such as grid search and cross validation.

In [8]:
clf.fit(digits.data[:-1], digits.target[:-1])

SVC(C=100.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.001, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [9]:
#Now you can predict new values, in particular, we can ask to the classifier what is the digit 
#of our last image in the digits dataset, which we have not used to train the classifier:
clf.predict(digits.data[-1:])

array([8])

Model persistence

In [10]:
#It is possible to save a model in the scikit by using Python’s built-in persistence model, namely pickle
clf = svm.SVC()
iris = datasets.load_iris()
X, y = iris.data, iris.target
clf.fit(X, y)  

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [11]:
s = pickle.dumps(clf)
clf2 = pickle.loads(s)
clf2.predict(X[0:1])

array([0])

In [12]:
y[0]

0

In [13]:
#In the specific case of the scikit, it may be more interesting to use joblib’s replacement of pickle 
#(joblib.dump & joblib.load), 
#which is more efficient on big data, but can only pickle to the disk and not to a string:

In [14]:
joblib.dump(clf, 'filename.pkl') 

['filename.pkl']

In [15]:
clf = joblib.load('filename.pkl')

Conventions

In [16]:
#input will be cast to float64:
rng = np.random.RandomState(0)
X = rng.rand(10, 2000)
X = np.array(X, dtype='float32')
X.dtype

dtype('float32')

In [17]:
transformer = random_projection.GaussianRandomProjection()
X_new = transformer.fit_transform(X)
X_new.dtype

dtype('float64')

In [18]:
iris = datasets.load_iris()
clf = SVC()
clf.fit(iris.data, iris.target)  

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [19]:
list(clf.predict(iris.data[:3]))

[0, 0, 0]

In [20]:
clf.fit(iris.data, iris.target_names[iris.target])

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [21]:
list(clf.predict(iris.data[:3]))
#Here, the first predict() returns an integer array, since iris.target (an integer array) was used in fit. 
#The second predict() returns a string array, since iris.target_names was for fitting.

['setosa', 'setosa', 'setosa']

In [22]:
rng = np.random.RandomState(0)
X = rng.rand(100, 10)
y = rng.binomial(1, 0.5, 100)
X_test = rng.rand(5, 10)

In [23]:
clf = SVC()
clf.set_params(kernel='linear').fit(X, y)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [24]:
clf.predict(X_test)

array([1, 0, 1, 1, 0])

In [25]:
clf.set_params(kernel='rbf').fit(X, y)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [26]:
clf.predict(X_test)

array([0, 0, 0, 1, 0])

In [27]:
#When using multiclass classifiers, the learning and prediction task that 
#is performed is dependent on the format of the target data fit upon:
X = [[1, 2], [2, 4], [4, 5], [3, 2], [3, 1]]
y = [0, 0, 1, 1, 2]

classif = OneVsRestClassifier(estimator=SVC(random_state=0))
classif.fit(X, y).predict(X)

array([0, 0, 1, 1, 2])

In [28]:
#In the above case, the classifier is fit on a 1d array of multiclass labels and the predict() method therefore 
#provides corresponding multiclass predictions. It is also possible to fit upon a 2d array of binary label indicators:
y = LabelBinarizer().fit_transform(y)
classif.fit(X, y).predict(X)

array([[1, 0, 0],
       [1, 0, 0],
       [0, 1, 0],
       [0, 0, 0],
       [0, 0, 0]])

In [29]:
y = [[0, 1], [0, 2], [1, 3], [0, 2, 3], [2, 4]]
y = MultiLabelBinarizer().fit_transform(y)
classif.fit(X, y).predict(X)

array([[1, 1, 0, 0, 0],
       [1, 0, 1, 0, 0],
       [0, 1, 0, 1, 0],
       [1, 0, 1, 0, 0],
       [1, 0, 1, 0, 0]])

Loading the 20 newsgroups dataset

In [36]:
categories = ['alt.atheism', 'soc.religion.christian',
              'comp.graphics', 'sci.med']

In [38]:
twenty_train = fetch_20newsgroups(subset='train',
    categories=categories, shuffle=True, random_state=42)

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


In [39]:
twenty_train.target_names

['alt.atheism', 'comp.graphics', 'sci.med', 'soc.religion.christian']

In [40]:
#The files themselves are loaded in memory in the data attribute. For reference the filenames are also available:
len(twenty_train.data)

2257

In [41]:
len(twenty_train.filenames)

2257

In [42]:
#Let’s print the first lines of the first loaded file:
print("\n".join(twenty_train.data[0].split("\n")[:3]))

From: sd345@city.ac.uk (Michael Collier)
Subject: Converting images to HP LaserJet III?
Nntp-Posting-Host: hampton


In [43]:
print(twenty_train.target_names[twenty_train.target[0]])

comp.graphics


In [44]:
twenty_train.target[:10]

array([1, 1, 3, 3, 3, 3, 3, 2, 2, 2], dtype=int64)

In [45]:
#It is possible to get back the category names as follows:
for t in twenty_train.target[:10]:
    print(twenty_train.target_names[t])

comp.graphics
comp.graphics
soc.religion.christian
soc.religion.christian
soc.religion.christian
soc.religion.christian
soc.religion.christian
sci.med
sci.med
sci.med


Extracting features from text files

In [1]:
categories = ['alt.atheism', 'soc.religion.christian',
              'comp.graphics', 'sci.med']

In [4]:
twenty_train= fetch_20newsgroups(subset = 'train', categories = categories, shuffle = True, random_state=42)

In [6]:
twenty_train.target_names

['alt.atheism', 'comp.graphics', 'sci.med', 'soc.religion.christian']

In [7]:
len(twenty_train.data)

2257

In [8]:
print("\n".join(twenty_train.data[0].split("\m")[:3]))

From: sd345@city.ac.uk (Michael Collier)
Subject: Converting images to HP LaserJet III?
Nntp-Posting-Host: hampton
Organization: The City University
Lines: 14

Does anyone know of a good way (standard PC application/PD utility) to
convert tif/img/tga files into LaserJet III format.  We would also like to
do the same, converting to HPGL (HP plotter) files.

Please email any response.

Is this the correct group?

Thanks in advance.  Michael.
-- 
Michael Collier (Programmer)                 The Computer Unit,
Email: M.P.Collier@uk.ac.city                The City University,
Tel: 071 477-8000 x3769                      London,
Fax: 071 477-8565                            EC1V 0HB.



In [10]:
print(twenty_train.target_names[twenty_train.target[0]])

comp.graphics


In [11]:
twenty_train.target[:10]

array([1, 1, 3, 3, 3, 3, 3, 2, 2, 2], dtype=int64)

In [12]:
for t in twenty_train.target[:10]:
    print(twenty_train.target_names[t])

comp.graphics
comp.graphics
soc.religion.christian
soc.religion.christian
soc.religion.christian
soc.religion.christian
soc.religion.christian
sci.med
sci.med
sci.med


In [13]:
count_vect = CountVectorizer()

In [14]:
x_train_counts = count_vect.fit_transform(twenty_train.data)

In [15]:
x_train_counts.shape

(2257, 35788)

In [16]:
count_vect.vocabulary_.get(u'algorithm')

4690

In [23]:
tfidf_transformer = TfidfTransformer()

In [24]:
x_train_tfidf = tfidf_transformer.fit_transform(x_train_counts)

In [25]:
x_train_tfidf.shape

(2257, 35788)

In [None]:
clf = MultinomialNB

In [None]:


   par = {'vect__ngram_range': [(1, 1), (1, 2)],}
    search = GridSearchCV(pipe, par, n_jobs=-1)
    search.fit(docs_train, y_train)
    
    n_candidates = len(grid_search.cv_results_['params'])
    for i in range(n_candidates):
        print(i, 'params - %s; mean - %0.2f; std - %0.2f'% (search.cv_results_['params'][i],
                    search.cv_results_['mean_test_score'][i],search.cv_results_['std_test_score'][i]))

y_predicted = grid_search.predict(docs_test)