# Classify
Supervised machine learning example: linear classification with `scikit-learn` and `pandas`.

In [1]:
from classify import Classifier
from tools import *

## get example data
Load the iris data, normalize the feature columns to z-scores, and partition the rows into training and testing datasets.

In [2]:
# Load the iris example data and standardize it in a single step.
data = zscores(irisdata())

# Partition the normalized rows into training and testing sets.
# NOTE(review): 100 looks like the number of training rows (the test confusion
# matrices in this notebook total 50) — confirm against datasplit.
# NOTE(review): if datasplit samples rows randomly, seed it so the outputs in
# this notebook are reproducible — confirm.
trainrows, testrows = datasplit(data, 100)

In [3]:
# Last five training rows: z-scored feature columns plus the 'species' label.
trainrows.tail()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
144,1.034539,0.556746,1.100097,1.706379,virginica
145,1.034539,-0.131539,0.816859,1.443994,virginica
147,0.793012,-0.131539,0.816859,1.050416,virginica
148,0.430722,0.786174,0.930154,1.443994,virginica
149,0.068433,-0.131539,0.760211,0.788031,virginica


In [4]:
# Last five held-out test rows, in the same column layout as the training rows.
testrows.tail()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
132,0.672249,-0.590395,1.04345,1.312801,virginica
135,2.242172,-0.131539,1.326688,1.443994,virginica
136,0.551486,0.786174,1.04345,1.575187,virginica
138,0.189196,-0.131539,0.590269,0.788031,virginica
146,0.551486,-1.27868,0.703564,0.919223,virginica


## train a Classifier object
Input training data and the name of the column to predict.

In [5]:
# Constructing a Classifier also trains it. With no `model` argument the
# default, LogisticRegression, is used.
classy = Classifier(trainrows, 'species')
classy

Classifier(LogisticRegression)

In [6]:
# Fitted coefficients as a table: one row per class, one column per feature.
classy.coefs

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
setosa,-1.004604,1.206704,-1.656655,-1.549015
versicolor,0.391944,-0.33849,-0.332337,-0.652197
virginica,0.61266,-0.868214,1.988992,2.201212


In [7]:
# Class labels the trained model can predict.
classy.classes

['setosa', 'versicolor', 'virginica']

In [8]:
# Feature column names; here, every column of trainrows except the target.
classy.features

['sepal_length', 'sepal_width', 'petal_length', 'petal_width']

In [9]:
# NOTE(review): presumably the underlying scikit-learn estimator object; no
# output for this cell is captured here — confirm.
classy.model

In [10]:
# Parameter settings of the model, as a dict keyed by parameter name.
classy.params

{'C': 1.0,
 'class_weight': None,
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'l1_ratio': None,
 'max_iter': 100,
 'multi_class': 'auto',
 'n_jobs': None,
 'penalty': 'l2',
 'random_state': None,
 'solver': 'lbfgs',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}

In [11]:
# Name of the column the model was trained to predict.
classy.target

'species'

## predict classes
Call with new data to assign each row a predicted class.

In [12]:
# Calling the trained Classifier returns a Series named 'predicted' with
# dtype 'category', indexed like the input rows.
cats = classy(testrows)
cats.tail()

132    virginica
135    virginica
136    virginica
138    virginica
146    virginica
Name: predicted, dtype: category
Categories (3, object): ['setosa', 'versicolor', 'virginica']

## predict class probabilities
*Caution:* Not all models can do this. For example, scikit-learn's `RidgeClassifier` does not provide probability estimates.

In [13]:
# One probability column per class, one row per input row.
probs = classy.probs(testrows)
# Round for display only; `probs` itself keeps full precision.
probs.round(2).tail()

Unnamed: 0,setosa,versicolor,virginica
132,0.0,0.04,0.96
135,0.0,0.01,0.99
136,0.0,0.04,0.96
138,0.01,0.43,0.56
146,0.0,0.17,0.83


## test with different models and parameters
Show a [confusion matrix] comparing the predicted classes with the true classes of the test data. Rows are the true `species`; columns are the predicted classes.

[confusion matrix]: https://en.wikipedia.org/wiki/Confusion_matrix

In [14]:
# Baseline: the default model (LogisticRegression) with default parameters.
Classifier(trainrows, 'species').confusion(testrows)

predicted,setosa,versicolor,virginica
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
setosa,16,0,0
versicolor,0,16,1
virginica,0,1,16


In [15]:
# Same default model, but extra keyword arguments (here the solver) are
# passed through to the scikit-learn estimator.
Classifier(trainrows, 'species', solver='liblinear').confusion(testrows)

predicted,setosa,versicolor,virginica
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
setosa,15,1,0
versicolor,0,17,0
virginica,0,1,16


In [16]:
# Select a different sklearn.linear_model estimator by name via `model`.
Classifier(trainrows, 'species', model='RidgeClassifier').confusion(testrows)

predicted,setosa,versicolor,virginica
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
setosa,15,1,0
versicolor,0,15,2
virginica,0,1,16


In [17]:
# Settings for a logistic model trained by stochastic gradient descent with
# elastic-net regularization. 'model' selects the estimator by name; the
# remaining keys are passed through to that estimator as keyword arguments.
params = {
    'model': 'SGDClassifier',
    'loss': 'log_loss',
    'penalty': 'elasticnet',
    'l1_ratio': 0.5,
    # SGD shuffles the training data after each epoch; pin the seed so the
    # confusion matrix is the same on every Restart & Run All.
    'random_state': 0,
}
Classifier(trainrows, 'species', **params).confusion(testrows)

predicted,setosa,versicolor,virginica
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
setosa,16,0,0
versicolor,4,9,4
virginica,0,0,17


## help

In [18]:
# Print the Classifier class documentation: constructor and call inputs.
help(Classifier)

Help on class Classifier in module classify:

class Classifier(builtins.object)
 |  Classifier(data, target, model='LogisticRegression', **kwargs)
 |  
 |  Use a scikit-learn classifier with pandas DataFrames.
 |  Input training data to create and train a model.
 |  Call with new feature data to predict classes.
 |  Output is a Series with datatype 'category'.
 |  
 |  Constructor inputs:
 |      data    DataFrame: observations to use for training
 |      target  string: name of column to predict
 |      model   optional str: name of an sklearn.linear_model
 |      kwargs  are passed to the selected sklearn.linear_model
 |  
 |  Call inputs:
 |      data    DataFrame: features to use for prediction
 |  
 |  Methods defined here:
 |  
 |  __call__(self, data)
 |      Call self as a function.
 |  
 |  __init__(self, data, target, model='LogisticRegression', **kwargs)
 |      Initialize self.  See help(type(self)) for accurate signature.
 |  
 |  __repr__(self)
 |      Return repr(self).
