In [1]:
import os, json
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
OUTPUT_FOLDER = 'data'

# Process the 20 Newsgroups Dataset

In [3]:
from sklearn.datasets import fetch_20newsgroups

data = fetch_20newsgroups()

def parse_document(s):
    header, content = s.split('\n\n', 1)
    source, title, _ = header.split('\n', 2)
    return {'source': source.replace('From:', ''),
            'title': title.replace('Subject: ', ''),
            'content': content.strip()}

targets = {
 'comp.os.ms-windows.misc'  : 'Software',
 'comp.windows.x'           : 'Software',
 'comp.sys.ibm.pc.hardware' : 'Hardware',
 'comp.sys.mac.hardware'    : 'Hardware',
 'misc.forsale'             : 'For sale',
 'rec.autos'                : 'Autos',
 'rec.motorcycles'          : 'Autos',
 'rec.sport.baseball'       : 'Sports',
 'rec.sport.hockey'         : 'Sports',
 'sci.med'                  : 'Medicine',
 'sci.space'                : 'Space'
}

df = pd.DataFrame(list(map(parse_document, data.data)))
df['group'] = [targets.get(x, 'None') for x in np.array(data.target_names)[data.target]]
df = df[df.group != 'None']
df.head()

Unnamed: 0,source,title,content,group
0,lerxst@wam.umd.edu (where's my thing),WHAT car is this!?,I was wondering if anyone out there could enli...,Autos
1,guykuo@carson.u.washington.edu (Guy Kuo),SI Clock Poll - Final Call,A fair number of brave souls who upgraded thei...,Hardware
2,twillis@ec.ecn.purdue.edu (Thomas E Willis),PB questions...,"well folks, my mac plus finally gave up the gh...",Hardware
4,jcm@head-cfa.harvard.edu (Jonathan McDowell),Re: Shuttle Launch Question,"From article <C5owCB.n3p@world.std.com>, by to...",Space
6,bmdelane@quads.uchicago.edu (brian manning de...,Brain Tumor Treatment (thanks),There were a few people who responded to my re...,Medicine


# Add text documents to local storage

In [4]:
os.makedirs(OUTPUT_FOLDER, exist_ok=True)

docs = df.to_dict(orient='records')

for i, obj in zip(df.index, docs):
    fname = os.path.join(OUTPUT_FOLDER, f'{i}.json')
    if not os.path.exists(fname):
        with open(fname, 'w') as fp:
            json.dump(obj, fp)

# Create a Newsgroups application

# Fit a model

In [5]:
from sklearn.pipeline import Pipeline
from sklearn.base import TransformerMixin
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import NMF
from sklearn.preprocessing import Normalizer, StandardScaler

import nltk
from nltk.stem.porter import PorterStemmer

nltk.download('punkt')

class JSONTfidfVectorizer(TfidfVectorizer):
    def __init__(self, field=None, **kwargs):
        self.field = field
        self.stemmer = PorterStemmer()
        
        super().__init__(**kwargs,
                         preprocessor=self.extract,
                         tokenizer=self.stem
                        )
        
    def fit(self, X, y=None):
        self.stem_table = {}
        super().fit(X, y)
        return self
    
    def fit_transform(self, X, y=None):
        return self.fit(X, y).transform(X)
    
    def stem(self, text):
        return [self.stem_table.setdefault(self.stemmer.stem(item), item)
                for item in nltk.word_tokenize(text)
                if item.isalnum()]
        
    def extract(self, obj):
        if self.field is None:
            return obj
        return obj[self.field]

text_pipeline = Pipeline([
    ('tfidf', JSONTfidfVectorizer(stop_words='english', min_df=5, max_df=.2, sublinear_tf=True)),
    ('nmf', NMF(n_components=30)),
    ('norm', Normalizer('l1')),
    ('clf', LogisticRegression())
])

[nltk_data] Error loading punkt: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     self signed certificate in certificate chain
[nltk_data]     (_ssl.c:1123)>


In [7]:
from sklearn.model_selection import train_test_split

df_train, df_test = train_test_split(df, train_size=.25)

text_pipeline.fit(df_train.content, df_train.group)

def predict(ser):
    return dict(
        pred_group=text_pipeline.predict(ser),
        confidence=text_pipeline.predict_proba(ser).max(axis=1)
    )

result = pd.concat((
    df_train.assign(**predict(df_train.content), train='yes'),
    df_test.assign(**predict(df_test.content), train='no')
))

result



Unnamed: 0,source,title,content,group,pred_group,confidence,train
4219,Organization: Penn State University,From: <ACM108@psuvm.psu.edu>,I'm thinking about upgrading my 030 50MHz to t...,Hardware,Hardware,0.308506,yes
7311,jfc@athena.mit.edu (John F Carr),Re: Gamma Ray Bursters. WHere are they.,"If gamma ray bursters are extragalactic, would...",Space,Space,0.400546,yes
2623,klute@tommy.informatik.uni-dortmund.de (Raine...,Re: imake - help needed,In article <1993Apr20.101306.21536@def.bae.co....,Software,Software,0.805104,yes
10283,dmaluso@MtHolyoke.edu (Diane Maluso),Quadra 800 configurations??,I've noticed some of you mentioning owning a Q...,Hardware,Hardware,0.681051,yes
7425,alung@megatest.com (Aaron Lung),"Re: Changing oil by self.rist, another dealer ...",In article <1qk5m9$pbe@news.ysu.edu> ak296@yfn...,Autos,Autos,0.940929,yes
...,...,...,...,...,...,...,...
9611,nsmca@aurora.alaska.edu,Space Design Movies?,Is there a few Grasp pictures of space related...,Space,Space,0.627885,no
11138,prb@access.digex.com (Pat),Re: NAVSTAR positions,C-3's bird may be flaking out and expecting to...,Space,Space,0.695932,no
7518,robert@pest (Robert Merlicek),ati GUP and Vpic,Could someone tell me if the ATI graphic ultra...,Hardware,Hardware,0.467670,no
4546,manish@uclink.berkeley.edu (Manish Vij),Shipping a bike,Can someone recommend how to ship a motorcycle...,Autos,Autos,0.777592,no


In [8]:
def yes_or_no(ser):
    return ser.replace({1: 'yes', 0: 'no', False: 'no', True: 'yes'})

result = result.assign(
    correct=yes_or_no(result.group == result.pred_group),
    response=yes_or_no(result.title.str.startswith('Re:')),
    length=result.content.apply(len)
)

result

Unnamed: 0,source,title,content,group,pred_group,confidence,train,correct,response,length
4219,Organization: Penn State University,From: <ACM108@psuvm.psu.edu>,I'm thinking about upgrading my 030 50MHz to t...,Hardware,Hardware,0.308506,yes,yes,no,389
7311,jfc@athena.mit.edu (John F Carr),Re: Gamma Ray Bursters. WHere are they.,"If gamma ray bursters are extragalactic, would...",Space,Space,0.400546,yes,yes,yes,330
2623,klute@tommy.informatik.uni-dortmund.de (Raine...,Re: imake - help needed,In article <1993Apr20.101306.21536@def.bae.co....,Software,Software,0.805104,yes,yes,yes,727
10283,dmaluso@MtHolyoke.edu (Diane Maluso),Quadra 800 configurations??,I've noticed some of you mentioning owning a Q...,Hardware,Hardware,0.681051,yes,yes,no,1063
7425,alung@megatest.com (Aaron Lung),"Re: Changing oil by self.rist, another dealer ...",In article <1qk5m9$pbe@news.ysu.edu> ak296@yfn...,Autos,Autos,0.940929,yes,yes,yes,1209
...,...,...,...,...,...,...,...,...,...,...
9611,nsmca@aurora.alaska.edu,Space Design Movies?,Is there a few Grasp pictures of space related...,Space,Space,0.627885,no,yes,no,507
11138,prb@access.digex.com (Pat),Re: NAVSTAR positions,C-3's bird may be flaking out and expecting to...,Space,Space,0.695932,no,yes,yes,240
7518,robert@pest (Robert Merlicek),ati GUP and Vpic,Could someone tell me if the ATI graphic ultra...,Hardware,Hardware,0.467670,no,yes,no,479
4546,manish@uclink.berkeley.edu (Manish Vij),Shipping a bike,Can someone recommend how to ship a motorcycle...,Autos,Autos,0.777592,no,yes,no,239


In [9]:
import crosscheck as cc

cc.HistogramHeatmap(
    result,
    by=['group', 'pred_group', 'response'],
    component='NewsgroupsComponent',
    prefix=cc.get_jupyter_url_from_local_path('data'),
    suffix='.json'
)

HistogramHeatmap(component='HistogramHeatmap', props={'rows': 'group', 'cols': 'pred_group', 'values': 'respon…

In [10]:
cc.HistogramHeatmap(
    result,
    by=['group', 'pred_group', 'confidence'],
    component='NewsgroupsComponent',
    prefix=cc.get_jupyter_url_from_local_path('data'),
    suffix='.json'
)

HistogramHeatmap(component='HistogramHeatmap', props={'rows': 'group', 'cols': 'pred_group', 'values': 'confid…