In [98]:
from __future__ import print_function, division
%run ../basics.ipynb

Populating the interactive namespace from numpy and matplotlib


In [99]:
df = pd.read_csv('../data/auto_tagging.csv', encoding='utf-8')
df['tag'] = df['tag'].apply(lambda x: ' '.join(x.split(' | ')))
display(df.head(2))

print(df.columns)
print(df.count()/len(df)*100)

Unnamed: 0,id,description,title,subtitle,tag,syllabus,fold_num
0,http://videolectures.net/acml2013_herbrich_rea...,The last ten years have seen a tremendous grow...,"Distributed, Real-Time Bayesian Learning in On...",,machine_learning video in_depth,,2
1,http://videolectures.net/acml2013_lin_cost_sen...,Classification is an important problem in mach...,Cost-sensitive Classification: Algorithms and ...,,machine_learning video in_depth,,2


Index([u'id', u'description', u'title', u'subtitle', u'tag', u'syllabus',
       u'fold_num'],
      dtype='object')
id             100.000000
description     83.535109
title           99.757869
subtitle        28.087167
tag            100.000000
syllabus        11.138015
fold_num       100.000000
dtype: float64


In [100]:
#df = df[df[['title', 'subtitle', 'description', 'syllabus']].apply(lambda x: len((' '.join(map(str,x))).split()), axis=1) >= 100]

In [101]:
class Vectorizer(BaseEstimator, TransformerMixin):
    def __init__(self, n_topics=20, tfidf_max_df=None, tfidf_min_df=None,
                 ngram_range=None, nmf=True, columns=None,
                 stop_words=None, threshold=.5, max_depth=None):
        self.n_topics = n_topics
        self.stop_words = stop_words
        self.ngram_range = ngram_range
        self.tfidf_min_df = tfidf_min_df
        self.tfidf_max_df = tfidf_max_df
        
        self.use_nmf = nmf
        self.nmf = nmf
        self.topic_dict = None
        self.columns = columns
    
    def fit(*args, **kwargs):
        self.fit_transform(*args, **kwargs)
        
    def fit_transform(self, X, y=None):
        viz=False
        self.vectorizer = TfidfVectorizer(stop_words=self.stop_words,
                                  max_df=self.tfidf_max_df,
                                  min_df=self.tfidf_min_df,
                                  ngram_range=self.ngram_range)
        if self.use_nmf:
            self.nmf = NMF(n_components=self.n_topics, random_state=2016)
        else:
            self.nmf = None
        orig_X = X.copy()
        X = self.prepare_X(X)
        X = self.vectorize_X(X, fit=True, viz=viz)
        if viz:
            try:
                tsne_plot(X, orig_X[[orig_X.columns[0]]], fit=True)
            except:
                print(traceback.format_exc())
        return X
    
    def transform(self, X):
        X = self.prepare_X(X)
        return self.vectorize_X(X, fit=False, viz=False)
    
    def prepare_X(self, X):
        if isinstance(X, pd.DataFrame):
            if self.columns is not None:
                X = X[self.columns]
            concat_x = pd.Series(index=X.index, data='')
            for i in X.columns:
                concat_x += ' ' + X[i].apply(text_cleanup)
            X = concat_x
        else:
            assert isinstance(X, pd.Series)
        #X = X.apply(text_cleanup)
        return X
        
    def vectorize_X(self, X, fit=False, viz=False):
        if fit:
            tfidf = self.vectorizer.fit_transform(X.values)
        else:
            tfidf = self.vectorizer.transform(X.values)
        #tfidf, words = filter_word_rep(tfidf, self.vectorizer.get_feature_names())
        words = self.vectorizer.get_feature_names()
        #print('tfidf shape', tfidf.shape)
        # tfidf = (tfidf > 0.).astype('float')
        if self.nmf is not None:
            if fit:
                nmf = self.nmf.fit_transform(tfidf)
                feature_names = words
                n_top_words = 50
                self.topic_dict = dict()
                for topic_idx, topic in enumerate(self.nmf.components_):
                    topic_top_words = [feature_names[i]
                                    for i in topic.argsort()[:-n_top_words - 1:-1]]
                    topic_top_words_vals = sorted(topic.ravel(), reverse=True)[:n_top_words]
                    text_freq = zip(topic_top_words, topic_top_words_vals)
                    if viz:
                        print("v v v v \tTopic #%d\t" % topic_idx, end='v v v v v')
                        wordcloud(text_freq)
                    topic_top_words = ", ".join(topic_top_words)
                    #print(topic_top_words)
                    self.topic_dict[topic_idx] = text_freq
                if viz:
                    viz_nmf_output(nmf)
            else:
                nmf = self.nmf.transform(tfidf)
            #print('nmf shape:', nmf.shape)
            return nmf
        else:
            return tfidf.todense()

In [102]:
class CLF(BaseEstimator):
    def __init__(self, class_weight=None, bootstrap=True,
                 n_estimators=10, stop_words=None, threshold=.5, max_depth=None):
        
        self.n_estimators = n_estimators
        self.bootstrap = bootstrap
        self.class_weight = class_weight
        self.vectorizer = None
        self.threshold = threshold
        self.max_depth = max_depth
        
        self.rf = None

    def fit(self, X, y):
        viz=False
        self.tag_vectorizer = CountVectorizer()
        self.rf = OneVsRestClassifier(RandomForestClassifier(n_estimators=self.n_estimators,
                                                             bootstrap=self.bootstrap,
                                                             class_weight=self.class_weight, 
                                                             max_depth=self.max_depth,
                                                             random_state=2016))

        y = self.vectorize_y(y, fit=True)
        filt = np.array(y.sum(axis=1) > 0).flatten() # don't train on untagged resources
        if sum(filt) != y.shape[0]:
            #print('excluding untagged resources for clf training')
            #print(y.shape[0], end=' -> ')
            y = y[filt, :]
            X = X[filt, :]
            #print(y.shape[0])
        self.rf.fit(X, y)
        
    def predict(self, X):
        return self.rf.predict_proba(X)
    
    def plot_word_and_tag_clouds(self, X, y, fit=False, th=0., viz=False):
        y = self.vectorize_y(y, fit=fit)
        topic_tags = defaultdict(lambda: np.zeros(y.shape[1]))
        topic_n_docs = defaultdict(int)
        for x_s, y_s in tqdm(zip(X, y)):
            x_s = np.array(x_s).flatten()
            y_s = np.array(y_s).flatten()
            for i, x_i in enumerate(x_s):
                if x_i >= th:
                    topic_tags[i] += (x_i * y_s)
                    topic_n_docs[i] += 1
        topic_freq = {key: zip(self.get_tag_names(), val) for key, val in topic_tags.items()}
        for key, text_freq in self.topic_dict.items():
            print('-' * 80)
            print('# resources:', topic_n_docs[key])
            wordcloud(text_freq)
            wordcloud(topic_freq[key])
    
    def vectorize_y(self, y, fit=False):
        if fit:
            y = y.fillna(' ')
            y = self.tag_vectorizer.fit_transform(y.values)
            #if self.tag_vectorizer.stop_words_:
            #    print('ignored tags:')
            #    print(' ,'.join(self.tag_vectorizer.stop_words_))
            #print('considered tags: (tag, resources tagged in %)')
            #print(' ,'.join(map(str, sorted(zip(self.tag_vectorizer.get_feature_names(), 
            #                  np.array(y.sum(axis=0)).flatten()/y.shape[0]*100, 
            #                  np.array(y.sum(axis=0)).flatten()), key=itemgetter(1), reverse=True))))
        else:
            y = self.tag_vectorizer.transform(y.values)
        y = (y > 0).astype('float').todense()
        #print('y shape', y.shape)
        return y
    
    def get_tag_names(self):
        return self.tag_vectorizer.get_feature_names()
    
    def score_all(self, X, y):
        y = self.vectorize_y(y, fit=False)
        y_pred = self.predict(X)
        tag_names = self.get_tag_names()
        precision = defaultdict(float)
        recall = defaultdict(float)
        f1 = defaultdict(float)
        roc_auc = defaultdict(float)
        support = defaultdict(float)
        for tdx, tag in enumerate(tag_names):
            y_t = np.array(y[:, tdx]).ravel()
            n_support = y_t.sum()
            y_p = np.array(y_pred[:, tdx]).ravel()
            if n_support > 0:
                roc_auc[tag] = roc_auc_score(y_t, y_p)
            y_p = y_p >= self.threshold
            precision[tag] = precision_score(y_t, y_p)
            recall[tag] = recall_score(y_t, y_p)
            f1[tag] = f1_score(y_t, y_p)
            support[tag] = n_support
        return {'recall': recall, 'precision': precision, 'f1':f1, 'roc_auc': roc_auc, 'support':support}
    
    def score(self, X, y):
        scores = self.score_all(X, y)
        f1, support = scores['f1'], scores['support']
        f1, weights = zip(*[(val, support[key]) for key, val in f1.items()])
        return np.average(f1, weights=weights) 

In [103]:
pipe = Pipeline(steps=[('vect', Vectorizer()), ('rf', CLF())])

In [104]:
num_topics = 30
#clf = CLF(n_topics=num_topics, tfidf_max_df=.6, 
#          tfidf_min_df=5, 
#          columns=['title', 'description', 'syllabus'],
#          nmf=True, n_estimators=6, class_weight=None, stop_words='english', ngram_range=(1,2))

pipe.set_params(**{'vect__nmf': True, 'vect__stop_words': 'english', 'rf__bootstrap': False, 'rf__n_estimators': 5, 
             'vect__ngram_range': (1, 2), 'rf__threshold': 0.5, 'vect__n_topics': 30, 'vect__tfidf_max_df': 0.6, 
           'vect__tfidf_min_df': 8, 'vect__columns': ['title', 'subtitle', 'description', 'syllabus'], 
             'rf__class_weight': None, 'rf__max_depth': 5})

print('# resources:', len(df))
print('tagged resources:', (df['tag'].apply(len) > 0).sum())
pipe.fit(df[['title', 'subtitle', 'description', 'syllabus']], df['tag'])
#clf.plot_word_and_tag_clouds(df[['title', 'subtitle', 'description', 'syllabus']], df['tag'], th=.01)

# resources: 413
tagged resources: 413


Pipeline(steps=[('vect', Vectorizer(columns=['title', 'subtitle', 'description', 'syllabus'],
      max_depth=None, n_topics=30, ngram_range=(1, 2),
      nmf=NMF(alpha=0.0, beta=1, eta=0.1, init=None, l1_ratio=0.0, max_iter=200,
  n_components=30, nls_max_iter=2000, random_state=2016, shuffle=False,
  solve...bootstrap=False, class_weight=None, max_depth=5, n_estimators=5,
  stop_words=None, threshold=0.5))])

In [64]:
y_pred = pipe.predict(df[['title', 'subtitle', 'description', 'syllabus']])

In [65]:
pipe.named_steps['rf']

CLF(bootstrap=False, class_weight=None, max_depth=5, n_estimators=5,
  stop_words=None, threshold=0.5)

In [54]:
pos_th = 0.5
neg_th = 0.5
tags_rec = True
tags_rev = True
recs = list()
revs = list()
y_true = pipe.named_steps['rf'].vectorize_y(df['tag'])
tags = np.array(pipe.named_steps['rf'].get_tag_names())

print(classification_report(y_true, y_pred > .5, target_names=tags))

for idx, title in enumerate(df['title']):
    y_pred_row = np.array(y_pred[idx, :]).flatten()
    y_true_row = np.array(y_true[idx, :]).flatten()
    recommended_tags = (y_pred_row >= pos_th) & (y_true_row < 0.5)
    recommended_tags = zip(tags[recommended_tags], y_pred_row[recommended_tags])
    
    revise_tags = (y_pred_row <= neg_th) & (y_true_row > 0.5)
    revise_tags = zip(tags[revise_tags], y_pred_row[revise_tags])
    
    if (len(recommended_tags) > 0 and tags_rec) or (len(revise_tags) > 0 and tags_rev):
        print(title)
        df_entry = df.iloc[idx]
        c_id = df_entry['id']
        print(c_id)
        if len(recommended_tags) > 0 and tags_rec:
            print('  recommended tags:')
            for i, p in sorted(recommended_tags, key=itemgetter(1), reverse=True):
                print('\t', i, '(%.2f' % (p * 100), '% )')
                recs.append((c_id, i, p * 100))
        if len(revise_tags) > 0 and tags_rev:
            print('  revise tags:')
            for i, p in sorted(revise_tags, key=itemgetter(1)):
                print('\t', i, '(%.2f' % ((1. - p) * 100), '% )')
                revs.append((c_id, i, (1. - p) * 100))
        print('-' * 80)

                               precision    recall  f1-score   support

      artificial_intelligence       1.00      1.00      1.00         5
    association_rule_learning       1.00      1.00      1.00         1
                         bd2k       1.00      1.00      1.00         9
                     big_data       1.00      0.71      0.83        24
                      biology       1.00      1.00      1.00         3
           business_analytics       1.00      0.88      0.93        57
                   case_study       1.00      0.16      0.27        32
               classification       1.00      1.00      1.00         1
                   clustering       1.00      0.67      0.80         6
             computer_science       1.00      1.00      1.00         6
                data_cleaning       1.00      0.86      0.92         7
              data_collection       1.00      0.61      0.76        23
             data_integration       1.00      0.56      0.72        16
     

In [None]:
rec_df = pd.DataFrame(columns=['id', 'tag', 'probability'], data=recs)
rec_df.to_csv('tag_recommendations.csv', encoding='utf-8', index=False, quoting=QUOTE_ALL)

In [None]:
rev_df = pd.DataFrame(columns=['id', 'tag', 'probability'], data=revs)
rev_df.to_csv('tag_revision.csv', encoding='utf-8', index=False, quoting=QUOTE_ALL)

In [None]:
pd.DataFrame((y_pred > 0.5).sum(axis=1)).plot(kind='hist', bins=20)

In [None]:
pd.DataFrame((y_true > 0.5).sum(axis=1)).plot(kind='hist', bins=20)

In [None]:
#df['fold'] = 0
#for fidx, (train_idx, test_idx) in enumerate(KFold(len(df), n_folds=5, shuffle=True)):
#    df['fold'].iloc[test_idx] = fidx
#print(pd.unique(df['fold']))

In [107]:
for i in ['title', 'subtitle', 'description', 'syllabus']:
    df[i] = df[i].apply(text_cleanup)

#%run ../basics.ipynb
print('# resources:', len(df))
print('tagged resources:', (df['tag'].apply(len) > 0).sum())
tag_idx = df[df['tag'].apply(len) > 0].index
all_idx = set(df.index)
all_y_pred = list()
all_y_true = list()
tag_names = list()

param_grid = dict()
param_grid['vect__n_topics'] = [20, 30, 40, 50]
param_grid['vect__tfidf_max_df'] = [.4, .6, .8, .9]
param_grid['vect__tfidf_min_df'] = [3, 5, 8, 10]
param_grid['vect__ngram_range'] = [(1,1), (1,2), (1,3)]
param_grid['vect__nmf'] = [False, True]
param_grid['vect__columns'] = [['title', 'subtitle', 'description'], 
                         ['title', 'subtitle', 'description', 'syllabus'],  
                         ['title', 'description', 'syllabus'], 
                        ]
param_grid['vect__stop_words'] = [None, 'english']

param_grid['rf__class_weight'] = [None, 'balanced']
param_grid['rf__bootstrap'] = [False, True]
param_grid['rf__n_estimators'] = [1, 3, 4, 5, 6, 8, 10, 20]

param_grid['rf__threshold'] = [.1, .25, .5] #, .75, .9]
param_grid['rf__max_depth'] = [1, 3, 6, 8, 10, 12, 15]

gcv = DaskGridSearchCV(pipe, param_grid, refit=True, cv=LeaveOneLabelOut(df['fold_num']))
pbar = ProgressBar()
pbar.register()
gcv.fit(df[['title', 'subtitle', 'description', 'syllabus']], df['tag'])

# resources: 413
tagged resources: 413


KeyboardInterrupt: 

In [None]:
print('best f1:', gcv.best_score_)
best_params = gcv.best_params_
print('best params:', gcv.best_params_)

In [None]:
for param_of_interest in gcv.best_params_.keys():
    scores = list()
    for i in gcv.grid_scores_:
        params = i.parameters
        score = i.mean_validation_score
        param_val = params[param_of_interest]
        if param_val is None:
            param_val = 'None'
        if isinstance(param_val, list):
            param_val = tuple(param_val)
        scores.append((param_val, score))
    sns.violinplot(data=pd.DataFrame(columns=[param_of_interest, 'f1 score'], data=scores), x=param_of_interest, y='f1 score')
    if not os.path.isdir('grid_search_cv/'):
        os.makedirs('grid_search_cv/')
    plt.savefig('grid_search_cv/' + param_of_interest.replace(" ", "_") + '.png', dpi=150)
    plt.show()

In [None]:
data = list()
columns = ['precision', 'recall', 'f1', 'roc_auc','support']
for train_idx, test_idx in tqdm(LeaveOneLabelOut(df['fold_num']), total=len(pd.unique(df['fold_num']))):
    fold_clf = CLF(**gcv.best_params_)
    fold_clf.fit(df.iloc[train_idx], df['tag'].iloc[train_idx])
    fold_scores = fold_clf.score_all(df.iloc[test_idx], df['tag'].iloc[test_idx])
    for tag in fold_clf.get_tag_names():
        d = tuple([tag] + [fold_scores[i][tag] for i in columns])
        data.append(d)

cf_results = pd.DataFrame(columns=['tag'] + columns, data=data)

In [None]:
def average_tag_score(gdf):
    d = list()
    try:
        for i in columns[:-1]:
            d.append(np.average(gdf[i], weights=gdf['support']))
    except ZeroDivisionError:
        d = [-.1] * len(columns[:-1])
    d.append(gdf['support'].sum())
    return pd.DataFrame(columns=columns, data=[tuple(d)])

scores_per_tag = cf_results.groupby(by='tag').apply(average_tag_score)
scores_per_tag.sort_values(by='precision', inplace=True, ascending=False)
pd.set_option('display.max_rows', len(scores_per_tag))
display(scores_per_tag)
pd.reset_option('display.max_rows')

In [None]:
scores_per_tag['x'] = range(len(scores_per_tag))
scores_per_tag.plot(x='x', y=['f1', 'precision', 'recall', 'roc_auc', 'support'], secondary_y=['support'])
plt.show()