# Extending `pandas` to other ecosystems

In [1]:
    import pandas

In [7]:
    import sklearn.datasets, sklearn.discriminant_analysis, sklearn.model_selection, sklearn.metrics, sklearn.tree
    from toolz.curried import *; from toolz.curried.operator import *

Let's use the iris example.  _I'm sorry 😳._

In [3]:
    iris = sklearn.datasets.load_iris()

Create a tidy dataframe with testing and training indexes.

In [4]:
    df = pandas.DataFrame(
        iris['data'],
        pandas.CategoricalIndex(
            pandas.Index(iris['target']).map(
                iris['target_names'].__getitem__
            ), iris['target_names'], name='targets'),
        iris['feature_names']
    ).pipe(
        lambda df: pandas.concat(dict(zip(('train', 'test'), sklearn.model_selection.train_test_split(df))))
    )
    df.sample(2)

Unnamed: 0_level_0,Unnamed: 1_level_0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
Unnamed: 0_level_1,targets,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
train,virginica,5.8,2.7,5.1,1.9
test,versicolor,5.9,3.0,4.2,1.5


Let's initialize some models.

In [8]:
    tree = sklearn.tree.DecisionTreeClassifier()
    lda = sklearn.discriminant_analysis.LinearDiscriminantAnalysis()

Train and predict the classes.

In [54]:
    df = df.pipe(
        do(lambda df: lda.fit(df.loc['train'], df.loc['train'].index))
    ).pipe(
        do(lambda df: tree.fit(df.loc['train'], df.loc['train'].index))
    ).pipe(
        lambda df: df
        .set_index(pandas.Index(tree.predict(df), name=tree), append=True)
        .set_index(pandas.Index(lda.predict(df), name=lda), append=True)
    )

In [59]:
    import IPython

In [61]:
    for model in (tree, lda):
        print(model)
        df.loc['test'].index.to_frame().pipe(
            lambda df: pandas.DataFrame(
                sklearn.metrics.confusion_matrix(df['targets'], df[model]),
                iris['target_names'], iris['target_names']
            )
        ).pipe(IPython.display.display)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')


Unnamed: 0,setosa,versicolor,virginica
setosa,17,0,0
versicolor,0,10,1
virginica,0,1,9


LinearDiscriminantAnalysis(n_components=None, priors=None, shrinkage=None,
              solver='svd', store_covariance=False, tol=0.0001)


Unnamed: 0,setosa,versicolor,virginica
setosa,17,0,0
versicolor,0,10,1
virginica,0,0,10


Using ipwidgets to make your code interactive.

In [9]:
    from ipywidgets import interact

In [72]:
    @interact
    def _(model=['tree','lda'], type=['test', 'train']):
        model = globals().get(model)
        df.loc[type].index.to_frame().pipe(
            lambda df: pandas.DataFrame(
                sklearn.metrics.confusion_matrix(df['targets'], df[model]),
                iris['target_names'], iris['target_names']
            )
        ).pipe(IPython.display.display)

interactive(children=(Dropdown(description='model', options=('tree', 'lda'), value='tree'), Dropdown(descripti…

In [73]:
    import pandas

In [77]:
    pandas.util.testing.makeDataFrame()

Unnamed: 0,A,B,C,D
URMVvHzTA6,1.265084,0.722351,-0.586344,-0.603416
6XtCOmUfOt,-1.481678,1.260293,-0.595652,-0.372425
cu0fBbN14A,-0.310017,0.824252,-0.059986,1.242138
DIfzzttFNA,-0.303413,-0.056332,-0.382758,-2.030718
W9Xmr5MKbO,-0.414613,0.671362,0.197046,0.503501
50z2kFRn6v,-0.617809,-0.359157,1.695765,0.393641
rkGXRAXn6r,-0.987923,1.091318,-1.280359,-0.509785
Bf6WytpGpf,0.068635,-0.085423,0.087884,-0.702358
vUAGFDBvUa,1.093728,-0.219484,1.169825,-0.197037
UhkijDe1QB,0.765616,0.041184,1.055348,0.503691


In [79]:
    random_data = pandas.concat([pandas.util.testing.makeDataFrame() for _ in range(10)])

In [84]:
    import scipy.spatial.distance

In [None]:
    D = pandas.DataFrame(
        scipy.spatial.distance.squareform(scipy.spatial.distance.pdist(random_data)),
        random_data.index, random_data.index
    )

In [None]:
    D.stack().hist()

In [82]:
    random_data.stack() - random_data.unstack()

0FNzMw7Ify  A   NaN
            B   NaN
            C   NaN
            D   NaN
0Z2UvcjZhv  A   NaN
            B   NaN
            C   NaN
            D   NaN
0Z4vQ908jm  A   NaN
            B   NaN
            C   NaN
            D   NaN
0eIe9BHTHr  A   NaN
            B   NaN
            C   NaN
            D   NaN
0jXrfjE8xO  A   NaN
            B   NaN
            C   NaN
            D   NaN
0juxdBnTXx  A   NaN
            B   NaN
            C   NaN
            D   NaN
0s0Am2Q7TM  A   NaN
            B   NaN
            C   NaN
            D   NaN
1IYmooa0lo  A   NaN
            B   NaN
                 ..
xY8wH3lWfh  C   NaN
            D   NaN
yFf7dVblyx  A   NaN
            B   NaN
            C   NaN
            D   NaN
yIEM5G6QQh  A   NaN
            B   NaN
            C   NaN
            D   NaN
yo1Je2ut3n  A   NaN
            B   NaN
            C   NaN
            D   NaN
yqmmr3wzsL  A   NaN
            B   NaN
            C   NaN
            D   NaN
z3kkTGLjl7  A   NaN


In [8]:
    import ibis

    db = ibis.sqlite.connect('idiomatic_pandas.sqlite')

    db.list_tables()
    db.table('urls')['value'].execute()
    db.table('responses')['key'].execute()a
        

  'for %s' % platform.system()


0    1e796e69aee7d5caaa8302de99522689d773d858bdc425...
1    ac00bfb030a3bb94b6bcb6b4e57cc7025d644a6673d15c...
2    f69c4ed7f42635e24d8b0d6e7cf4e15bd2edd8dc46c647...
Name: key, dtype: object