In [1]:
import os, json
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Folder that receives one `<row label>.json` file per kept document (created on demand below).
OUTPUT_FOLDER = 'data'

# Process the 20 Newsgroups Dataset

In [3]:
from sklearn.datasets import fetch_20newsgroups

# Load the 20 Newsgroups corpus through scikit-learn. The raw posts (`data.data`),
# the per-post label indices (`data.target`) and the label names
# (`data.target_names`) are all used further down in this cell.
data = fetch_20newsgroups()

def parse_document(s):
    """Split a raw newsgroup post into its sender, subject and body.

    The post is expected to consist of a header block, a blank line and then
    the body. The first header line is taken as the sender (``From:``) and
    the second as the subject (``Subject:``); any further header lines are
    ignored.

    Parameters
    ----------
    s : str
        Raw text of one post.

    Returns
    -------
    dict
        ``{'source': ..., 'title': ..., 'content': ...}``. The ``From:`` and
        ``Subject:`` labels are removed and surrounding whitespace is
        stripped from every value. A part that is missing from the post
        comes back as an empty string instead of raising.
    """
    def strip_label(line, label):
        # Only drop the label when it leads the line, so that a subject such
        # as "Re: Subject: ..." keeps its text intact.
        line = line.strip()
        if line.startswith(label):
            line = line[len(label):]
        return line.strip()

    # partition() never raises: a post without a blank line is all header.
    header, _, content = s.partition('\n\n')
    header_lines = header.split('\n')
    source = header_lines[0]
    title = header_lines[1] if len(header_lines) > 1 else ''

    return {'source': strip_label(source, 'From:'),
            'title': strip_label(title, 'Subject:'),
            'content': content.strip()}

# Coarse topic label for every newsgroup that is kept; each topic lists the
# newsgroups folded into it. Newsgroups absent from this table receive the
# 'None' placeholder and are filtered out when the DataFrame is built.
_newsgroups_by_topic = {
    'Software': ['comp.os.ms-windows.misc', 'comp.windows.x'],
    'Hardware': ['comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware'],
    'For sale': ['misc.forsale'],
    'Autos':    ['rec.autos', 'rec.motorcycles'],
    'Sports':   ['rec.sport.baseball', 'rec.sport.hockey'],
    'Medicine': ['sci.med'],
    'Space':    ['sci.space'],
}

# Invert into the newsgroup -> topic lookup used by the rest of the notebook.
targets = {
    newsgroup: topic
    for topic, newsgroups in _newsgroups_by_topic.items()
    for newsgroup in newsgroups
}

# Newsgroup name of every post, in the same order as `data.data`.
newsgroup_of_post = np.asarray(data.target_names)[data.target]

df = pd.DataFrame([parse_document(post) for post in data.data])
# The string 'None' marks posts whose newsgroup has no entry in `targets`.
df['group'] = [targets.get(name, 'None') for name in newsgroup_of_post]
# Keep only the mapped posts. Row labels are not reset, so the index has gaps.
df = df[df['group'] != 'None']
df.head()

Unnamed: 0,source,title,content,group
0,lerxst@wam.umd.edu (where's my thing),WHAT car is this!?,I was wondering if anyone out there could enli...,Autos
1,guykuo@carson.u.washington.edu (Guy Kuo),SI Clock Poll - Final Call,A fair number of brave souls who upgraded thei...,Hardware
2,twillis@ec.ecn.purdue.edu (Thomas E Willis),PB questions...,"well folks, my mac plus finally gave up the gh...",Hardware
4,jcm@head-cfa.harvard.edu (Jonathan McDowell),Re: Shuttle Launch Question,"From article <C5owCB.n3p@world.std.com>, by to...",Space
6,bmdelane@quads.uchicago.edu (brian manning de...,Brain Tumor Treatment (thanks),There were a few people who responded to my re...,Medicine


# Add text documents to local storage

In [4]:
# Make sure the destination exists before any file is written.
os.makedirs(OUTPUT_FOLDER, exist_ok=True)

# One plain dict per row, in row order; reused later as the pipeline input.
docs = df.to_dict(orient='records')

# Each file is named after the DataFrame row label. Files that already exist
# are left untouched, so re-running the cell does not rewrite them.
for doc_id, record in zip(df.index, docs):
    target_path = os.path.join(OUTPUT_FOLDER, f'{doc_id}.json')
    if os.path.exists(target_path):
        continue
    with open(target_path, 'w') as out_file:
        json.dump(record, out_file)

# Create a Newsgroups application

# Fit a model

In [5]:
from sklearn.pipeline import Pipeline
from sklearn.base import TransformerMixin
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
from sklearn.preprocessing import Normalizer, StandardScaler

import nltk
from nltk.stem.porter import PorterStemmer

# Fetch NLTK's 'punkt' data up front. Presumably this is what nltk.word_tokenize
# (called in JSONTfidfVectorizer.stem) relies on — TODO confirm for the installed
# NLTK version.
nltk.download('punkt')

class JSONTfidfVectorizer(TfidfVectorizer):
    """TF-IDF vectorizer for dict-shaped documents, with Porter-stem merging.

    Each input document is a mapping (for example a parsed JSON record); the
    text to vectorize is read from ``document[field]``. Tokens that share a
    Porter stem are collapsed onto the first surface form seen for that stem,
    so the vocabulary holds readable words rather than truncated stems.

    Parameters
    ----------
    field : str, default 'content'
        Key of the text inside each document.
    **kwargs
        Forwarded to ``TfidfVectorizer``. ``preprocessor`` and ``tokenizer``
        are supplied by this class and must not be passed as well.

    Attributes
    ----------
    stem_table : dict
        Maps a Porter stem to the first token seen with that stem. Reset at
        the start of every ``fit`` / ``fit_transform``.

    Notes
    -----
    NOTE(review): the forwarded options are not named in the ``__init__``
    signature, so scikit-learn's ``get_params`` / ``clone`` may not round-trip
    them — verify before using this class in a grid search or a cached
    pipeline.
    """

    def __init__(self, field='content', **kwargs):
        self.field = field
        self.stemmer = PorterStemmer()

        # Route both text extraction and tokenization through this class.
        super().__init__(**kwargs,
                         preprocessor=self.extract,
                         tokenizer=self.stem
                        )

    def fit(self, X, y=None):
        """Learn vocabulary and IDF weights from ``X``, starting from an empty stem table."""
        self.stem_table = {}
        super().fit(X, y)
        return self

    def fit_transform(self, X, y=None):
        """Fit to ``X`` and return its TF-IDF matrix.

        Overridden so the stem table is reset on this entry point as well.
        The documents are analysed once by the parent's ``fit_transform``
        instead of once in ``fit`` and a second time in ``transform``; the
        stem table keeps the first form seen, so the emitted tokens are the
        same either way.
        """
        self.stem_table = {}
        return super().fit_transform(X, y)

    def stem(self, text):
        """Tokenizer: split ``text`` into alphanumeric tokens merged by stem.

        Returns a list with, for every alphanumeric token, the first surface
        form recorded for that token's Porter stem.
        """
        merged = []
        for token in nltk.word_tokenize(text):
            if not token.isalnum():
                continue
            # Supplying a custom preprocessor bypasses TfidfVectorizer's own
            # lowercasing step, so the `lowercase` option is honoured here.
            # Without it the emitted form depends on which casing was seen
            # first, and capitalised stop words escape the stop-word filter.
            if self.lowercase:
                token = token.lower()
            merged.append(self.stem_table.setdefault(self.stemmer.stem(token), token))
        return merged

    def extract(self, obj):
        """Preprocessor: return the text stored under ``self.field`` in ``obj``."""
        return obj[self.field]

# Text -> TF-IDF -> 30 NMF topic weights per document -> L1-normalised rows.
TextPipeline = Pipeline([
    ('tfidf', JSONTfidfVectorizer(stop_words='english', min_df=5, max_df=.2, sublinear_tf=True)),
    # random_state pins NMF's randomised initialisation so that
    # Restart & Run All reproduces the same topics.
    ('the topic model!!!', NMF(n_components=30, random_state=0)),
    ('norm', Normalizer('l1')),
])

# Fit the whole pipeline on the parsed documents. Rows are labelled with the
# DataFrame index so they can be matched back to `df`.
topic_weights = TextPipeline.fit_transform(docs)
X = pd.DataFrame(topic_weights, index=df.index)

[nltk_data] Downloading package punkt to /Users/aren438/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Visualize results
If the model worked correctly, we should see something with more structure than one big blob.

In [6]:
# Sanity check: read one exported document back from disk. The path is built
# from OUTPUT_FOLDER so this cell keeps working if the export folder changes.
with open(os.path.join(OUTPUT_FOLDER, '0.json')) as fp:
    obj = json.load(fp)

obj

{'source': " lerxst@wam.umd.edu (where's my thing)",
 'title': 'WHAT car is this!?',
 'content': 'I was wondering if anyone out there could enlighten me on this car I saw\nthe other day. It was a 2-door sports car, looked to be from the late 60s/\nearly 70s. It was called a Bricklin. The doors were really small. In addition,\nthe front bumper was separate from the rest of the body. This is \nall I know. If anyone can tellme a model name, engine specs, years\nof production, where this car is made, history, or whatever info you\nhave on this funky looking car, please e-mail.\n\nThanks,\n- IL\n   ---- brought to you by your neighborhood Lerxst ----',
 'group': 'Autos'}

In [7]:
from umap import UMAP
import chissl

# NOTE(review): presumably the widget locates each document as
# prefix + <row label> + suffix, matching the exported JSON files — confirm
# against the chissl documentation.
widget_options = dict(
    features=df[['group']],
    component='NewsgroupsComponent',
    prefix='/files/chissl-widget/notebooks/twenty-newsgroups/data/',
    suffix='.json',
)

# `self` is an unusual name for a notebook-level widget handle, but a later
# cell displays it by that name, so it is kept.
self = chissl.ChisslWidget(X, **widget_options)

self

ChisslWidget(component='ChisslWidget', props={'parents': [8358, 10660, 7193, 9332, 9295, 9484, 7374, 8489, 803…

In [10]:
# Display the same widget again. NOTE(review): the saved output of this cell
# carries `predictions`, unlike the first display, so it was presumably run
# after interacting with the widget — confirm.
self

ChisslWidget(component='ChisslWidget', predictions={'previousClasses': [None, None, None, None, None, None, No…

In [9]:
# notes
# generalizability? or just for phishing
# prodigy recipe
# progress fixing cookiecutter

# better email vis
# genx
# jupyter hub