In [None]:
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.manifold import TSNE

%matplotlib inline

### Import data from csv

In [None]:
# `pd.DataFrame.from_csv` was deprecated and later removed from pandas.
# `read_csv` with these two arguments reproduces its defaults: the first
# column becomes the index and date-like index values are parsed.
deck_data = pd.read_csv('data.csv', index_col=0, parse_dates=True)

In [None]:
# Preview the first few rows to confirm the CSV loaded with the expected columns.
deck_data.head()

In [None]:
# Summary statistics of the loaded data (rendered with the default notebook styling).
deck_data.describe()

In [None]:
# Count how often each card title occurs once, then show both ends of the
# ranking: the most frequent titles first, the least frequent last.
title_counts = deck_data['title'].value_counts()
print(title_counts.head())
print(title_counts.tail())

In [None]:
# Per-title summary of the `quantity_x` column, limited to the first 25 titles.
quantity_by_title = deck_data.groupby('title')['quantity_x']
quantity_by_title.describe().head(25)

Kind of ugly -- let's make our stuff a little prettier.

This portion is taken from Chris Moffitt's "Tips for Customizing Your IPython and Pandas Display" (http://pbpython.com/ipython-pandas-display-tips.html).

In [None]:
# Stylesheet for the notebook page and for pandas' HTML tables (which carry the
# `dataframe` class). It is only defined here; a later cell injects it into the
# page by wrapping it in a <style> tag.
CSS = """
body {
    margin: 0;
    font-family: Helvetica;
}
table.dataframe {
    border-collapse: collapse;
    border: none;
}
table.dataframe tr {
    border: none;
}
table.dataframe td, table.dataframe th {
    margin: 0;
    border: 1px solid white;
    padding-left: 0.25em;
    padding-right: 0.25em;
}
table.dataframe th:not(:empty) {
    background-color: #cfebfd ;
    text-align: left;
    font-weight: normal;
}
table.dataframe tr:nth-child(2) th:empty {
    border-left: none;
    border-right: 1px dashed #888;
}
table.dataframe td {
    border: 2px solid #ccf;
    background-color: #f4f4ff;
}
"""

In [None]:
# Import from the public `IPython.display` namespace (the same one the Image
# import further down uses) rather than the internal `IPython.core.display`.
from IPython.display import HTML

# Returning the HTML object as the cell's last expression makes the notebook
# render the <style> tag, which applies CSS to the tables on the page.
HTML('<style>{}</style>'.format(CSS))

In [None]:
# Same summary as earlier, shown again so the effect of the injected CSS is visible.
deck_data.describe()

We're going to transform the data shortly, so let's first set up some sane pandas display defaults.

In [None]:
# Cap rendered DataFrames at 10 rows and 10 columns so the wide matrices built
# below do not flood the notebook output.
for option_name in ("display.max_rows", "display.max_columns"):
    pd.set_option(option_name, 10)

In [None]:
# Display the frame; with the options set above, pandas truncates the output
# to 10 rows and 10 columns.
deck_data

### Data Transformation for tf-idf

Step 1: Prep data for transformation

In [None]:
# Pivot the long table into a deck x card matrix: one row per `deck_id`, one
# column per `title`, each cell holding the largest `quantity_x` recorded for
# that pair. Cards absent from a deck come out of `unstack` as NaN and are set
# to 0.
tf_idf = (
    deck_data
    .groupby(['deck_id', 'title'])['quantity_x']
    .max()
    .unstack()
    .fillna(0)
)
tf_idf

Step 2: Create Tfidf Transformer and transform data

In [None]:
# Re-weight the deck x card count matrix with tf-idf, using the transformer's
# default settings.
tfidf_transformer = TfidfTransformer()
transformed = tfidf_transformer.fit_transform(tf_idf)
# Convert to a dense array purely so the values can be displayed.
transformed.toarray()

Quick dip into the data

In [None]:
# Wrap the tf-idf weights in a DataFrame that reuses the card-title columns.
# No index is passed, so rows get a default positional index rather than the
# `deck_id` index of `tf_idf`; rows still line up with `tf_idf` by position.
tf_idf_transformed = pd.DataFrame(transformed.toarray(), columns = tf_idf.columns)

Let's compare two cards, one commonly seen in decks (Parasite) and the other not (Xanadu)

In [None]:
from IPython.display import Image
# Card art fetched from netrunnerdb.com (needs network access).
# NOTE(review): presumably the Parasite card, given the text above -- confirm.
Image('http://netrunnerdb.com/bundles/netrunnerdbcards/images/cards/en/01012.png')

In [None]:
# Card art fetched from netrunnerdb.com (needs network access).
# NOTE(review): presumably the Xanadu card, given the text above -- confirm.
Image('http://netrunnerdb.com/bundles/netrunnerdbcards/images/cards/en/02082.png')

In [None]:
# Distribution of Parasite's tf-idf weight across all decks.
tf_idf_transformed['Parasite'].describe()

In [None]:
# The same card before weighting: raw per-deck quantities (0 where the deck lacks it).
tf_idf['Parasite'].describe()

In [None]:
# Distribution of Xanadu's tf-idf weight across all decks.
tf_idf_transformed['Xanadu'].describe()

In [None]:
# The same card before weighting: raw per-deck quantities (0 where the deck lacks it).
tf_idf['Xanadu'].describe()

In [None]:
# Rows (decks) with the highest tf-idf weight for Xanadu, all columns shown.
tf_idf_transformed.sort_values(by='Xanadu', ascending=False).head()

In [None]:
# Rows (decks) with the highest tf-idf weight for Parasite, that column only.
tf_idf_transformed.sort_values(by='Parasite', ascending=False)['Parasite'].head()

In [None]:
# Raw card quantities for the deck at row position 84.
# NOTE(review): 84 is hard-coded, presumably read off one of the sorted outputs
# above -- confirm it still points at the intended deck if the data changes.
tf_idf.iloc[84]

In [None]:
# Raw card quantities for the deck at row position 471.
# NOTE(review): 471 is hard-coded, presumably read off one of the sorted outputs
# above -- confirm it still points at the intended deck if the data changes.
tf_idf.iloc[471]

### Apply K-means clustering

In [None]:
# Cluster the tf-idf vectors into 6 groups.
# k-means starts from random centroids, so without a fixed seed the cluster ids
# (and everything derived from them below) change on every run. `random_state`
# makes the notebook reproducible; `n_init` is given explicitly so the number
# of restarts does not depend on the installed scikit-learn version's default.
kmeans = KMeans(n_clusters=6, n_init=10, random_state=42, verbose=10)
kmeans.fit(transformed.toarray())

We can inspect the cluster label that k-means assigned to each deck

In [None]:
# Cluster id assigned to each row of the fitted matrix, in row order.
kmeans.labels_

In [None]:
# Sanity check: number of labels produced (compare with the row count of tf_idf).
len(kmeans.labels_)

In [None]:
# Sanity check: number of decks (rows) in the matrix; should equal the label count.
len(tf_idf)

In [None]:
# Attach each deck's cluster id to its raw card quantities.
# `reset_index()` turns `deck_id` into an ordinary column and gives the rows a
# positional index, which is what the label frame is joined on.
cluster_labels = pd.DataFrame(kmeans.labels_, columns=['group'])
labeled_data = tf_idf.reset_index().join(cluster_labels)
labeled_data

In [None]:
# Number of decks that landed in each cluster.
labeled_data['group'].value_counts()

### Generate graph to visualize

In [None]:
# Column count: `deck_id`, one column per card title, and `group` -- far too
# many dimensions to plot directly.
len(labeled_data.columns)

In [None]:
# Embed an illustrative image hosted on imgur (needs network access; relies on
# `Image` having been imported in an earlier cell).
Image('http://i.imgur.com/VLXVbsx.jpg')

Use t-SNE (https://lvdmaaten.github.io/tsne/) to project the tf-idf vectors down to two dimensions so they can be plotted

In [None]:
# Project the tf-idf vectors down to 2 dimensions for plotting.
# t-SNE is stochastic, so a fixed `random_state` is needed for the layout (and
# the plots below) to come out the same on every run.
tsne = TSNE(n_components=2, random_state=42, verbose=10)
tsne_transformed = tsne.fit_transform(tf_idf_transformed)
tsne_transformed

In [None]:
# Name the two t-SNE output dimensions so they can be referenced when plotting.
embedding_columns = ['X', 'Y']
graphing_data = pd.DataFrame(data=tsne_transformed, columns=embedding_columns)
graphing_data

In [None]:
# Plain scatter of the 2-D embedding, before any cluster colouring.
# Work with the Axes that pandas returns instead of pyplot's "current axes".
scatter_ax = graphing_data.plot.scatter(x='X', y='Y', figsize=(20, 16))
# The t-SNE coordinates carry no units, so hide the axes.
scatter_ax.axis('off')

Let's see it with the groupings!

In [None]:
# Add each deck's cluster id next to its 2-D coordinates (aligned on the
# shared positional index).
# Selecting ['X', 'Y'] first keeps the cell idempotent: joining onto a frame
# that already has a `group` column (i.e. running this cell twice) would
# otherwise raise a "columns overlap" ValueError.
graphing_data = graphing_data[['X', 'Y']].join(labeled_data['group'])
graphing_data

In [None]:
# One colour per cluster. `zip` stops at the shorter input, so any cluster
# beyond the number of colours listed here would silently be left off the plot.
colors = ['red', 'orange', 'yellow',
          'green', 'blue', 'purple',
          'black']

fig, ax = plt.subplots(figsize=(10, 8))

# Sort the cluster ids so the group -> colour mapping and the legend order do
# not depend on the order in which the groups happen to appear in the data.
for group, color in zip(sorted(labeled_data['group'].unique()), colors):
    members = graphing_data.loc[graphing_data['group'] == group]
    ax.scatter(members['X'], members['Y'],
               c=color,
               label='Group #%s' % group)

# Build the legend once, after every series has been added.
ax.legend()
# The t-SNE coordinates carry no units, so hide the axes.
ax.axis('off')

In [None]:
# A card is reported for a cluster when decks in that cluster run at least
# this many copies of it on average.
MIN_AVERAGE_COPIES = 2.0

# Drop the bookkeeping columns *before* averaging, so no mean is ever computed
# over `deck_id` / `group` (which would fail outright for non-numeric ids).
card_columns = [column for column in labeled_data.columns
                if column not in ('group', 'deck_id')]

# One grouped pass instead of re-filtering the frame for every (group, card)
# pair. sort=False keeps the groups in order of first appearance.
group_means = labeled_data.groupby('group', sort=False)[card_columns].mean()

for group, card_means in group_means.iterrows():
    print(group)
    for column, mean in card_means[card_means >= MIN_AVERAGE_COPIES].items():
        print(column, ' -- average card inclusion: ', mean)

For fun, let's try to use a couple of libraries to do this in D3

One option is nvd3 -- documentation: http://python-nvd3.readthedocs.org/en/latest/index.html (see also: http://dataviztalk.blogspot.com/2016/01/make-great-looking-d3js-charts-in.html)

In [None]:
# python-nvd3 is an optional third-party dependency used only for this demo.
import nvd3
# use_remote=True -- presumably loads the D3/NVD3 JavaScript from a remote
# host rather than local files, which would need network access; confirm
# against the python-nvd3 docs.
nvd3.ipynb.initialize_javascript(use_remote=True)
# Empty 300x300 scatter chart; the data series are added in the next cell.
chart = nvd3.scatterChart(name='scatterChart', height=300, width=300)

In [None]:
# Add one series per cluster so each cluster is drawn as its own set of points.
for group in graphing_data['group'].unique():
    # Select this cluster's rows once and reuse them for both coordinates.
    members = graphing_data.loc[graphing_data['group'] == group]
    chart.add_serie(
        name='group %s' % group,
        y=members['Y'].values,
        x=members['X'].values,
        size=1,
    )
# Last expression of the cell: renders the chart in the notebook.
chart

## Note: You *can* do D3 in Jupyter, but it's a bit fiddly