In [1]:
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.datasets import fetch_20newsgroups
from sklearn.pipeline import make_pipeline
from skater.core.local_interpretation.text_interpreter import vectorize_as_tf_idf, \
get_feature_names, topk_tfidf_features_by_class, topk_tfidf_features_overall, topk_tfidf_features_in_doc, \
dataframe_to_dict
from skater.core.visualizer.relevance_visualizer import build_html, generate_word_cloud, show_in_notebook
#import build_html, generate_word_cloud, show_in_notebook
from IPython.core.debugger import set_trace

In [2]:
categories = [
    'rec.autos',
    'rec.motorcycles',
    'rec.sport.baseball',
    'rec.sport.hockey',
    'sci.crypt',
    'sci.electronics',
    'sci.med',
    'sci.space',
]
remove = ('headers', 'footers', 'quotes')

data_train = fetch_20newsgroups(subset='train', categories=categories,
                                     shuffle=True, random_state=0,
                                     remove=remove)

In [3]:
X_train = data_train.data
y_train = data_train.target

In [4]:
X_train[0]

"Does anyone have a rear wheel for a PD they'd like to part with?\n\nDoes anyone know where I might find one salvage?\n\nAs long as I'm getting the GIVI luggage for Brunnhilde and have\nthe room, I thought I'd carry a spare.\n\nRide Free,\n\nBill\n___________________________________________________________________             \njohnsw@wsuvm1.csc.wsu.edu  prez=BIMC  KotV KotRR                                \nDoD #00314  AMA #580924   SPI = 7.18   WMTC #0002  KotD #0001             \nYamabeemer fj100gs1200pdr650 Special and a Volvo.  What more could anyone ask? "

In [5]:
data_test = fetch_20newsgroups(subset='test', categories=categories,
                                     shuffle=True, random_state=0,
                                     remove=remove)
X_text = data_test.data
y_test = data_test.target

In [6]:
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer


nltk.download('stopwords')
stop = stopwords.words('english')

def tokenizer_stem_nostop(text):
    porter = PorterStemmer()
    return [porter.stem(w) for w in re.split('\s+', text.strip()) \
            if w not in stop and re.match('[a-zA-Z]+', w)]

print(tokenizer_stem_nostop('runners like running and thus they run'))

[nltk_data] Downloading package stopwords to /home/ubuntu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
['runner', 'like', 'run', 'thu', 'run']


In [7]:
from bs4 import BeautifulSoup

def preprocessor(text):
    # remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()
    
    # regex for matching emoticons, keep emoticons, ex: :), :-P, :-D
    r = '(?::|;|=|X)(?:-)?(?:\)|\(|D|P)'
    emoticons = re.findall(r, text)
    text = re.sub(r, '', text)
    
    # convert to lowercase and append all emoticons behind (with space in between)
    # replace('-','') removes nose of emoticons
    text = re.sub('[\W]+', ' ', text.lower()) + ' ' + ' '.join(emoticons).replace('-','')
    return text

In [8]:
X_train = [preprocessor(t) for t in X_train]
print(X_train[0])

does anyone have a rear wheel for a pd they d like to part with does anyone know where i might find one salvage as long as i m getting the givi luggage for brunnhilde and have the room i thought i d carry a spare ride free bill ___________________________________________________________________ johnsw wsuvm1 csc wsu edu prez bimc kotv kotrr dod 00314 ama 580924 spi 7 18 wmtc 0002 kotd 0001 yamabeemer fj100gs1200pdr650 special and a volvo what more could anyone ask  


In [9]:
# define param_dict for TfIdf Vectorizer
param_dict = {
    'sublinear_tf':True, 
    'max_df': 0.5,
    'stop_words': 'english', 
    'smooth_idf': True,
    'ngram_range':(1, 3)   
}
vectorizer, X_train = vectorize_as_tf_idf(X_train, **param_dict)
print("n_samples: %d, n_features: %d" % X_train.shape)

n_samples: 4762, n_features: 738849


In [10]:
# Names of all the features extracted
feature_names =  get_feature_names(vectorizer_inst=vectorizer)
print(len(feature_names))
print(feature_names[1000:1003])

738849
['0001 idea', '0001 idea motorcycle', '0001 yamabeemer']


In [11]:
# import pandas as pd

# def top_k_with_feature_selection(X, y, feature_names, top_k):
#     ch2 = SelectKBest(chi2, top_k)
#     X_new = ch2.fit_transform(X, y)
#     selected_feature = [(feature_names[i], X_new[i]) for i in ch2.get_support(indices=True)]
#     return ch2, X_new, selected_feature


# def _default_feature_selection(X, feature_names, top_k):
#     arg_sort = lambda r, k: np.argsort(r)[::-1][:k]
#     top_k_index = arg_sort(X, top_k)
#     top_features = [(feature_names[i], X[i]) for i in top_k_index]
#     return None, None, top_features


# def top_k_tfidf_features(X, features, feature_selection_type='default', top_k=25):
#     """ Computes top 'k' tf-idf values in a row.
#     chi-square statistical test for feature selection helps in weeding out the features that are most likely independent
#     of the categorization class or label
#     Parameters
#     __________
#     each_row:
#     features:
#     top_k:

#     Returns
#     _______
#     df : pandas.DataFrame

#     """
#     fs_choice_dict = {
#     'default': _default_feature_selection,
# #     'chi2': _feature_selection
#     }

#     type_inst, new_x, top_features = fs_choice_dict[feature_selection_type](X, features, top_k)
#     df = pd.DataFrame(top_features)
#     df.columns = ['features', 'tf_idf']
#     return df


# def topk_tfidf_features_in_doc(data, features, top_k=25):
#     """ Compute top tf-idf features for each document in the corpus

#     Returns
#     _______
#     pandas.DataFrame with columns 'features', 'tf_idf'
#     """
#     row = np.squeeze(data.toarray())
#     return top_k_tfidf_features(row, features, top_k)


# # Lamda for converting data-frame to a dictionary
# dataframe_to_dict = lambda key_column_name, value_column_name, df: df.set_index(key_column_name).to_dict()[value_column_name]


# def topk_tfidf_features_overall(data, feature_list, min_tfidf=0.1, feature_selection='default',
#                                 summarizer_type='mean', top_k=25):
#     """
#     """
#     d = data.toarray()
#     d[d < min_tfidf] = 0
#     summarizer_default = lambda x: np.sum(x, axis=0)
#     summarizer_mean = lambda x: np.mean(x, axis=0)
#     summarizer_median = lambda x: np.median(x, axis=0)
#     summarizer_choice_dict = {
#         'sum': summarizer_default,
#         'mean': summarizer_mean,
#         'median': summarizer_median
#     }

#     tfidf_summarized = summarizer_choice_dict[summarizer_type](d)
#     return top_k_tfidf_features(tfidf_summarized, feature_list, feature_selection, top_k)


# def topk_tfidf_features_class(X, y, feature_names, class_index, feature_selection='default',
#                                  summarizer_type='mean', topk_features=25, min_tfidf=0.1):
#     """
#     """
#     labels = np.unique(y)
#     ids_by_class = list(map(lambda label: np.where(y==label), labels))
#     feature_df = topk_tfidf_features_overall(data=X[ids_by_class[class_index]], feature_list=feature_names, 
#                                              min_tfidf=0.1, feature_selection='default',
#                                              summarizer_type=summarizer_type, top_k=topk_features)
#     feature_df.label = ids_by_class[class_index]
#     return feature_df

In [14]:
# 1. First Query: What are the top TF-IDF features in the training corpus
print("Class:{}".format(categories[1]))
topk_tfidf_features_by_class(X=X_train, y=y_train, feature_names=feature_names, class_index=0, feature_selection='default',
                                 summarizer_type='mean', topk_features=10, min_tfidf=0.1)

Class:rec.motorcycles


Unnamed: 0,features,tf_idf
0,car,0.002384
1,stuff deleted,0.002317
2,gt,0.002284
3,ites,0.002204
4,ford,0.001978
5,auto,0.001966
6,saturn,0.001959
7,deleted,0.001897
8,dealer,0.001819
9,vw,0.001818


In [13]:
print("Class:{}".format(categories[1]))
topk_tfidf_features_by_class(X=X_train, y=y_train, feature_names=feature_names, class_index=1, 
                             feature_selection='default', summarizer_type='mean', topk_features=10, min_tfidf=0.1)

Class:rec.motorcycles


Unnamed: 0,features,tf_idf
0,helmet,0.003295
1,bmw,0.002836
2,bike,0.002757
3,motorcycle,0.002623
4,dog,0.002615
5,shaft,0.002574
6,bikes,0.002327
7,dod,0.002184
8,honda,0.001939
9,harley,0.00193


In [15]:
from random import randint
row_index = randint(0, 3000)
print("Target Catgeory:{}".format(categories[y_train[1]]))
feature_df = topk_tfidf_features_in_doc(data=X_train[1], features=feature_names, 
                                        feature_selection_choice='default', top_k=10)
print(feature_df)

Target Catgeory:rec.sport.baseball
            features    tf_idf
0        ball caught  0.171343
1               ball  0.157728
2               base  0.144721
3             forced  0.132493
4            dropped  0.124181
5             caught  0.114289
6    run decide stay  0.109877
7       ball dropped  0.109877
8         run decide  0.109877
9  stay ball dropped  0.109877


In [20]:
print(data_train.data[1])
feature_relevance_wts = dataframe_to_dict('features', 'tf_idf', feature_df)
build_html(data_train.data[1], feature_relevance_wts, highlight_oov=True)
show_in_notebook('./rendered.html')






I'm not sure I understand this question. When the IF rule is invoked,
the batter is automatically out. This relieves the runners from being
forced to advance to the next base if the ball is not caught. Other
than that, isn't it just the same as any situation in which a runner on
a base is not forced to the next base on a dropped fly ball? That is,
if the ball is caught he can tag up and run (or decide to stay), and
if the ball is dropped he can have left the base at any time.

