In [1]:
from __future__ import print_function, division
from graph_tool.all import *
%pylab inline
import numpy as np
import pandas as pd
import seaborn as sns
from bokeh.plotting import *
import random
from csv import QUOTE_ALL
import datetime

from bokeh.io import output_notebook
output_notebook()
from bokeh.charts import Scatter
from bokeh.models import HoverTool, ColumnDataSource 
from collections import Counter, defaultdict
from operator import itemgetter
from textstat.textstat import textstat
from termcolor import colored
import os
import copy
import re
import joblib

os.environ["KERAS_BACKEND"] = "tensorflow"
from keras.models import Sequential
from keras.layers.core import Activation, Dropout, Dense, Flatten, Lambda
from keras.layers.advanced_activations import LeakyReLU
from keras.layers.normalization import BatchNormalization
from keras.layers.embeddings import Embedding
from keras.layers.core import Reshape
from keras.layers.pooling import MaxPooling2D, AveragePooling2D, MaxPooling1D
from keras.engine import Input
from keras.layers import Merge
from keras.layers import merge
from keras.layers import LSTM, GRU
from keras.models import Model
from keras.utils.np_utils import to_categorical
from keras.layers.convolutional import Convolution2D, Convolution1D
from keras.regularizers import WeightRegularizer
from keras.optimizers import RMSprop, SGD
from keras.regularizers import l2, activity_l2, activity_l1, l1l2, l1
from keras.callbacks import EarlyStopping
from keras import backend as K

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import make_scorer, precision_score, recall_score, f1_score, roc_auc_score, classification_report
from sklearn.metrics import accuracy_score, hamming_loss, mean_squared_error, label_ranking_average_precision_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import StratifiedShuffleSplit, StratifiedKFold, LabelKFold, KFold
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.manifold import TSNE
from sklearn.decomposition import NMF
from sklearn.multiclass import OneVsRestClassifier, OneVsOneClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler
from sklearn.base import BaseEstimator
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import normalize

from nltk.stem.lancaster import LancasterStemmer
from nltk.stem.porter import PorterStemmer

from tqdm import tqdm, tqdm_pandas
from pprint import pprint
import string
allowed_chars = set(string.ascii_lowercase) | {' '}
from wordcloud import WordCloud
from IPython.display import display, HTML
import traceback
import warnings
warnings.filterwarnings('ignore')

Populating the interactive namespace from numpy and matplotlib


`%matplotlib` prevents importing * from pylab and numpy
  "\n`%matplotlib` prevents importing * from pylab and numpy"


Using TensorFlow backend.


In [None]:
def remove_text_inside_brackets(text):
    """Return ``text`` with every ``[...]`` span removed, brackets included.

    A span runs from a ``[`` up to the first following ``]``. An opening
    bracket that is never closed is left untouched.
    """
    bracketed_span = re.compile(r'\[[^\]]*\]')
    return bracketed_span.sub('', text)

In [None]:
def text_cleanup(text, filter_w=None, min_len=None):
    """Normalise a piece of free text to lowercase words separated by single spaces.

    Steps, in order: lowercase, blank out the site name and the literal
    ``nan`` token, drop ``[...]`` spans, replace every character outside
    ``allowed_chars`` with a space, optionally keep only tokens contained in
    ``filter_w``, optionally keep only tokens of at least ``min_len``
    characters, then collapse runs of spaces.

    Parameters
    ----------
    text : str or float
        Text to clean. Any float yields ``''`` (presumably NaN placeholders
        for missing text -- confirm against callers).
    filter_w : container of str, optional
        When given, only tokens for which ``token in filter_w`` are kept.
    min_len : int, optional
        When given, tokens shorter than this are dropped.

    Returns
    -------
    str
        The cleaned text. Leading/trailing single spaces are not stripped.
    """
    if isinstance(text, float):
        return ''

    cleaned = text.lower()
    for site_name in ('videolectures net', 'videolectures.net'):
        cleaned = cleaned.replace(site_name, ' ')
    cleaned = cleaned.replace(' nan ', ' ')
    if cleaned == 'nan':
        cleaned = ''

    cleaned = remove_text_inside_brackets(cleaned)
    cleaned = ''.join(ch if ch in allowed_chars else ' ' for ch in cleaned)

    if filter_w is not None:
        cleaned = ' '.join(tok for tok in cleaned.split() if tok in filter_w)
    if min_len is not None:
        cleaned = ' '.join(tok for tok in cleaned.split() if len(tok) >= min_len)

    for whitespace_char in ('\n', '\t'):
        cleaned = cleaned.replace(whitespace_char, ' ')
    # Collapse any run of spaces down to a single space.
    while '  ' in cleaned:
        cleaned = cleaned.replace('  ', ' ')
    return cleaned

In [None]:
def gen_weights(y):
    w = np.zeros(len(y))
    w_pos = w > 0
    w_neg = w < 1
    w[w_pos] = len(y)/2./len(w_pos)
    w[w_neg] = len(y)/2./len(w_neg)
    return w

In [None]:
def plot_nn_hist(hist, secondary_y=False):
    """Plot the training and validation loss curves stored in ``hist``.

    Parameters
    ----------
    hist : object
        Must expose ``hist.history`` with ``'loss'`` and ``'val_loss'``
        sequences (presumably a Keras ``History`` -- confirm against callers).
    secondary_y : bool
        When true, the validation curve is drawn against a secondary y-axis.
    """
    losses = hist.history
    paired = zip(losses['loss'], losses['val_loss'])
    hist_df = pd.DataFrame(data=paired, columns=['train', 'val'])
    right_axis = ['val'] if secondary_y else None
    hist_df.plot(y=['train', 'val'], secondary_y=right_axis)
    plt.show()

In [None]:
def plot_prediction_violins(y_true, y_pred):
    """Draw one violin of predicted values per distinct true value.

    ``y_pred`` and ``y_true`` are paired element-wise; the true value goes on
    the x-axis and the prediction on the y-axis.
    """
    paired = zip(y_pred, y_true)
    pred = pd.DataFrame(data=paired, columns=['prediction', 'true'])
    sns.violinplot(data=pred, x='true', y='prediction')
    plt.show()

In [None]:
class Digitizer:
    """Maps whitespace-separated tokens to integer ids, growing on demand.

    Ids are assigned in first-seen order starting at 1; ``0`` is the value
    ``series_digitizer`` pads with.
    """

    def __init__(self):
        # Looking up an unseen token stores the current dictionary size as its id.
        self.word_dict = defaultdict(lambda: len(self.word_dict))
        # Placeholder entry (key 0): occupies one slot so the first real token gets id 1.
        self.word_dict[0] = None

    def series_digitizer(self, text_series, max_len=-1):
        """Digitize every text in a pandas Series and right-pad with 0 to equal length.

        The padded length is that of the longest digitized entry, not ``max_len``.
        """
        encoded = text_series.apply(self.digitize, args=(max_len,)).astype('object')
        width = encoded.apply(len).max()

        def pad(ids):
            missing = width - len(ids)
            return ids if missing == 0 else ids + [0] * missing

        return encoded.apply(pad)

    def digitize(self, text, max_len=-1):
        """Return the id list for ``text``; a non-negative ``max_len`` truncates the tokens."""
        tokens = text.split()
        if max_len >= 0:
            tokens = tokens[:max_len]
        return [self.word_dict[tok] for tok in tokens]

    def num_words(self):
        """Number of dictionary entries, including the placeholder entry."""
        return len(self.word_dict)

In [None]:
def score(y_true, y_pred, classes, score_f):
    """Evaluate ``score_f`` one-vs-rest for each class.

    For every label in ``classes`` the true and predicted values are turned
    into "equals this label" indicators and passed to ``score_f(true, pred)``.

    Returns
    -------
    list
        One ``score_f`` result per entry of ``classes``, in the same order.
    """
    return [score_f(y_true == label, y_pred == label) for label in classes]

In [None]:
def evaluate(y_true, y_pred):
    """Per-class F1, precision and recall for rounded predictions.

    ``y_pred`` is rounded to the nearest integer before scoring. The classes
    are the sorted distinct values of ``y_true``.

    Returns
    -------
    list of (str, score)
        Entries named ``'f1(<class>)'`` for every class, then
        ``'precision(<class>)'``, then ``'recall(<class>)'``.
    """
    y_pred_int = np.round(y_pred).astype('int')
    classes = sorted(set(y_true))
    metrics = (
        ('f1', f1_score),
        ('precision', precision_score),
        ('recall', recall_score),
    )
    data = []
    for metric_name, metric_f in metrics:
        per_class = score(y_true, y_pred_int, classes, metric_f)
        for label, value in zip(classes, per_class):
            data.append((metric_name + '(' + str(label) + ')', value))
    return data

In [None]:
def viz_res(data, title=''):
    """Bar-plot a list of ``(type, value)`` pairs with vertical x tick labels.

    Parameters
    ----------
    data : iterable of 2-tuples
        Rows for a frame with the columns ``'type'`` and ``'value'``.
    title : str
        Title of the plot.
    """
    frame = pd.DataFrame(data=data, columns=['type', 'value'])
    sns.barplot(data=frame, x='type', y='value')
    plt.xticks(rotation=90)
    plt.title(title)
    plt.show()

In [None]:
def plot_tfidf_vals(vectorized_text):
    """Show three distribution plots for a matrix of tf-idf values.

    1. Log-scaled histogram of all strictly positive entries.
    2. Distribution of the maxima along axis 1 (titled "per doc").
    3. Distribution of the maxima along axis 0 (titled "per term").

    ``vectorized_text`` must support ``copy().ravel()`` and
    ``max(axis=...).ravel()`` (presumably a dense 2-D numpy array -- confirm).
    """
    all_vals = pd.DataFrame(data=vectorized_text.copy().ravel())
    positive_vals = all_vals[all_vals[0] > 0]
    positive_vals.plot(kind='hist', bins=200, logy=True)
    plt.show()

    panels = ((1, 'max tfidf vals per doc'), (0, 'max tfidf vals per term'))
    for axis, title in panels:
        maxima = pd.DataFrame(data=vectorized_text.max(axis=axis).ravel())
        sns.distplot(maxima)
        plt.title(title)
        plt.show()

In [None]:
def _as_flat_mask(keep):
    """Convert a per-column comparison result into a 1-D boolean ndarray."""
    # Objects exposing todense() (e.g. scipy sparse results) are densified first.
    if hasattr(keep, 'todense'):
        keep = keep.todense()
    # asarray + ravel also flattens 1 x n np.matrix results, which
    # matrix.flatten() would leave two-dimensional.
    return np.asarray(keep).ravel()


def filter_tfidf_max(tfidf, words, min_th=None, max_th=None):
    """Keep only the columns whose maximum value lies within the given bounds.

    Parameters
    ----------
    tfidf : 2-D matrix-like
        One column per entry of ``words``. Must support ``copy()``,
        ``max(axis=0)`` and boolean column indexing.
    words : sequence of str
        Column labels; converted to a numpy array.
    min_th : number, optional
        Drop columns whose maximum is below this value.
    max_th : number, optional
        Applied after ``min_th``: drop columns whose maximum is above this value.

    Returns
    -------
    (tfidf, words)
        The filtered copy of the matrix and the matching label array.

    Raises
    ------
    AssertionError
        If the number of labels and columns disagree after filtering.
    """
    tfidf = tfidf.copy()
    words = np.array(words)
    if min_th is not None:
        keep = _as_flat_mask(tfidf.max(axis=0) >= min_th)
        words = words[keep]
        tfidf = tfidf[:, keep]
    if max_th is not None:
        keep = _as_flat_mask(tfidf.max(axis=0) <= max_th)
        words = words[keep]
        tfidf = tfidf[:, keep]
    assert len(words) == tfidf.shape[1]
    return tfidf, words

In [None]:
def filter_word_rep(tfidf, words):
    """Drop the terms (and their columns) that repeat a token.

    A term is kept when it consists of a single token or when all of its
    whitespace-separated tokens are distinct.

    Returns
    -------
    (tfidf, words)
        The column-filtered matrix and the matching array of kept terms.
    """
    words = np.array(words)

    def has_no_repeat(term):
        tokens = term.split()
        return len(tokens) == 1 or len(set(tokens)) == len(tokens)

    bidx = np.array([has_no_repeat(term) for term in words], dtype=bool)
    return tfidf[:, bidx], words[bidx]

In [None]:
def wordcloud(text_freq, ax=None, **kwargs):
    """Render a word cloud generated from term frequencies.

    Parameters
    ----------
    text_freq
        Frequencies handed to ``WordCloud.generate_from_frequencies``.
    ax : optional
        Axes to draw on. When given, the image is only drawn there; otherwise
        the current pyplot figure is used, its axis is hidden and it is shown.
    **kwargs
        Forwarded to the ``WordCloud`` constructor.
    """
    cloud = WordCloud(**kwargs).generate_from_frequencies(text_freq)
    if ax is not None:
        ax.imshow(cloud)
        return
    plt.imshow(cloud)
    plt.axis("off")
    plt.show()

In [None]:
def get_prereq_graph(title_topic, topic_prereq, draw=True):
    """Build (and optionally draw) a directed prerequisite graph between titles.

    One vertex is created per title. For every title, an edge is added from
    each title belonging to a prerequisite topic of the title's topic to the
    title itself.

    Parameters
    ----------
    title_topic : iterable of (title, topic) pairs
        Iterated twice, so it must be re-iterable (not a one-shot generator).
        Topics are stored in an 'int' vertex property -- presumably integer
        topic ids; confirm against callers.
    topic_prereq : iterable of (topic, prereq) pairs
        Each pair states that topic ``prereq`` is a prerequisite of ``topic``.
    draw : bool
        When true, lays the graph out and renders it to 'prereq_graph.png'.

    Returns
    -------
    Graph
        The constructed graph. When ``draw`` is true the layout is stored in
        ``g.vp['pos']``.
    """
    g = Graph(directed=True)
    # Looking up an unseen title creates a new vertex for it.
    v_dict = defaultdict(g.add_vertex)
    
    # topic -> list of titles assigned to that topic
    topic_to_titles = defaultdict(list)
    for title, topic in title_topic:
        topic_to_titles[topic].append(title)

    # topic -> list of its prerequisite topics
    topic_prereq_dict = defaultdict(list)
    for topic, prereq in topic_prereq:
        topic_prereq_dict[topic].append(prereq)
    v_text = g.new_vertex_property('string')
    v_topic = g.new_vertex_property('int')
    e_c = g.new_edge_property('float')
    for title, topic in title_topic:
        v = v_dict[title]
        v_text[v] = title
        v_topic[v] = topic
        prereq = topic_prereq_dict[topic]
        for p in prereq:
            p_titles = topic_to_titles[p]
            for p_t in p_titles:
                p_v = v_dict[p_t]
                # Edge points from the prerequisite title to the dependent title.
                e = g.add_edge(p_v, v)
                # Edge value is the prerequisite topic; used as edge colour below.
                e_c[e] = p
    print(g)
    if draw:
        #pos = arf_layout(g)
        # Layout with vertices grouped by their topic.
        pos = sfdp_layout(g, groups=v_topic, C=4, p=3, mu_p=.9)
        g.vp['pos'] = pos
        # Rescaled out-degree; passed to graph_draw as the vertex drawing order.
        deg = g.degree_property_map('out')
        deg.a = 4 * (np.sqrt(deg.a) * 0.5 + 0.4)
        # NOTE(review): PageRank is computed on the reversed graph, but `pr` is never used.
        g.set_reversed(True)
        pr = pagerank(g)
        g.set_reversed(False)

        # Edge betweenness, normalised by its maximum and scaled to 10, sets the pen width.
        ebet = betweenness(g)[1]
        ebet.a /= ebet.a.max()
        ebet.a *= 10.
        # Normalise edge and vertex colour values by their respective maxima.
        e_c.a /= e_c.a.max()
        v_c = g.new_vertex_property('float')
        v_c.a = v_topic.a.astype('float')
        v_c.a /= v_c.a.max()
        graph_draw(g, pos, output_size=(15000, 15000), 
                   vertex_text=v_text, inline=True, 
                   edge_color=e_c,
                   edge_pen_width=ebet,
                   output='prereq_graph.png', vorder=deg,
                   vertex_fill_color=v_c, marker_size=50, bg_color=[1.,1., 1., 1.])
    return g

In [None]:
def plot_topic_coocurrence(transformed):
    """Heat-map of the column-by-column product ``transformed.T . transformed``.

    The diagonal of the product is zeroed before plotting. ``transformed``
    itself is not modified.
    """
    gram = transformed.T.dot(transformed)
    np.fill_diagonal(gram, 0)
    sns.heatmap(gram)
    plt.show()

In [None]:
class Digitizer:
    """Token-to-integer encoder whose vocabulary grows as new tokens are seen.

    NOTE(review): this cell re-defines the ``Digitizer`` class that an earlier
    cell of this notebook already defines with the same behaviour; the later
    definition shadows the earlier one.
    """

    def __init__(self):
        # An unseen key is assigned the dictionary size at lookup time.
        self.word_dict = defaultdict(lambda: len(self.word_dict))
        # Key 0 is a placeholder entry, so real tokens are numbered from 1.
        self.word_dict[0] = None

    def series_digitizer(self, text_series, max_len=-1):
        """Encode each text of a pandas Series; shorter results are right-padded with 0."""
        id_lists = text_series.apply(self.digitize, args=(max_len, )).astype('object')
        longest = id_lists.apply(len).max()
        padded = id_lists.apply(
            lambda ids: ids if len(ids) == longest else ids + [0] * (longest - len(ids))
        )
        return padded

    def digitize(self, text, max_len=-1):
        """Encode one text, keeping at most ``max_len`` tokens when ``max_len`` is not negative."""
        limit = None if max_len < 0 else max_len
        return [self.word_dict[token] for token in text.split()[:limit]]

    def num_words(self):
        """Size of the dictionary (the placeholder entry is counted)."""
        return len(self.word_dict)

In [None]:
def tsne_plot(X, df, fit=True):
    """Embed ``X`` in 2-D and show an interactive bokeh scatter plot of it.

    Parameters
    ----------
    X : array-like
        Feature matrix, one row per row of ``df``.
    df : pandas.DataFrame
        Extra columns to attach to the points; every column is also exposed
        as a hover tooltip. Its index is reused for the result.
    fit : bool
        When true, a new TSNE model is fitted and cached on
        ``tsne_plot.tsne``. When false, the cached model's ``transform`` is
        applied instead.

    Returns
    -------
    pandas.DataFrame
        The ``x``/``y`` coordinates together with the columns of ``df``.

    Raises
    ------
    RuntimeError
        If ``fit`` is false and no model has been fitted yet.
    """
    if fit:
        tsne_plot.tsne = TSNE(n_components=2, learning_rate=500, n_iter=1000000, random_state=2016)
        tsne_text = tsne_plot.tsne.fit_transform(X)  # X: [n_samples, n_features]
    else:
        if tsne_plot.tsne is None:
            raise RuntimeError('tsne_plot(fit=False) needs a prior call with fit=True')
        # The result must be bound to `tsne_text`, which the code below reads;
        # it used to be assigned to an unused name, leaving `tsne_text` undefined.
        # NOTE(review): scikit-learn's TSNE does not appear to provide a
        # transform() method -- confirm this branch is usable with that estimator.
        tsne_text = tsne_plot.tsne.transform(X)

    plot_df = pd.DataFrame(columns=['x', 'y'], data=tsne_text, index=df.index)
    for c in df.columns:
        plot_df[c] = df[c]

    print(len(plot_df))
    # Tooltip label is the upper-cased first character of the column name.
    hover_tips = [(i[0].upper(), '@' + i) for i in df.columns]
    hover = HoverTool(
        tooltips=hover_tips
    )

    source = ColumnDataSource(plot_df)
    p = figure(background_fill='#DFDFE5', plot_width=800,
               plot_height=600)
    # Add the hover tool
    p.add_tools(hover)

    # Populate glyphs
    p.circle(x='x', y='y', size=7, alpha=0.3, source=source)
    show(p)
    return plot_df
# Cache slot for the most recently fitted model.
tsne_plot.tsne = None

In [None]:
def viz_nmf_output(nmf):
    """Plot two summaries of a document-by-topic weight matrix.

    1. Distribution of the arg-max column index of each row, with one bin
       per column.
    2. Bar plot of the row values by rank position after sorting each row in
       descending order.

    ``nmf`` is assumed to be a 2-D numpy array (rows = documents, columns =
    topics, going by the plot titles) -- confirm against callers.
    """
    dominant_topic = nmf.argmax(axis=1).flatten()
    sns.distplot(dominant_topic, bins=nmf.shape[1])
    plt.title('documents over topics distribution')
    plt.show()

    ranked = [
        (rank, weight)
        for row in nmf
        for rank, weight in enumerate(sorted(row, reverse=True))
    ]
    dist_df = pd.DataFrame(data=ranked, columns=['pos', 'val'])
    sns.barplot(data=dist_df, x='pos', y='val')
    plt.title('certainty of topic assignment')
    plt.show()

In [None]:
def cross_entropy_prereq(mat, th=0.1, entropy_ge=True, cross_entropy_diff=True):
    """Derive prerequisite relations between the rows of ``mat`` from cross entropies.

    For every ordered pair of distinct rows ``(i, j)``, row ``i`` is reported
    as a prerequisite of row ``j`` when

    * ``entropy_ge`` is false or ``H(i) >= H(j)`` (``i`` is the more general row),
    * ``CE(j, i) >= 0`` and ``CE(i, j) >= CE(j, i)``, and
    * the resulting dependency value is strictly positive.

    ``CE(x, y)`` only sums over the positions where both ``x`` and ``y``
    exceed ``th``.

    Parameters
    ----------
    mat : 2-D array-like
        One distribution per row.
    th : float
        Entries at or below this value are ignored in the cross entropy.
    entropy_ge : bool
        Require the prerequisite row to have the larger (or equal) entropy.
    cross_entropy_diff : bool
        When true the dependency value is ``CE(i, j) - CE(j, i)``, otherwise
        ``CE(i, j)``.

    Returns
    -------
    pandas.DataFrame
        Columns ``t`` (dependent row index), ``prereq`` (prerequisite row
        index) and ``val`` (dependency value).
    """
    eps = np.finfo(float).eps  # keeps log() finite for zero entries

    def entropy(x):
        return -np.sum(x * np.log(x + eps))

    def cross_entropy(x, y):
        # A boolean mask replaces the Python-2-only tuple-parameter lambda filter.
        both = (x > th) & (y > th)
        return -np.sum(x[both] * np.log(y[both] + eps))

    rows = [np.asarray(row, dtype=float) for row in mat]
    entropies = [entropy(row) for row in rows]

    prereq = list()
    for idx, i in tqdm(enumerate(rows), total=len(rows)):
        i_H = entropies[idx]
        for jdx, j in enumerate(rows):
            if idx != jdx:
                j_H = entropies[jdx]
                if not entropy_ge or i_H >= j_H:  # i more general than j
                    c_H_i_j = cross_entropy(i, j)
                    c_H_j_i = cross_entropy(j, i)
                    if c_H_j_i >= 0. and c_H_i_j >= c_H_j_i:
                        if cross_entropy_diff:
                            dep = c_H_i_j - c_H_j_i
                        else:
                            dep = c_H_i_j
                        if dep > 0:
                            prereq.append((jdx, idx, dep))
    return pd.DataFrame(columns=['t', 'prereq', 'val'], data=prereq)

In [None]:
def get_classification_report_df(t, p, *args, **kwargs):
    """Parse the text produced by ``classification_report`` into a DataFrame.

    All arguments are forwarded to ``classification_report``. The first line
    of the report (the header) is skipped. A remaining line becomes a row only
    when it splits into exactly five whitespace-separated fields whose last
    four parse as float, float, float and int. Every other line is ignored --
    for example summary lines whose label contains spaces.

    Returns
    -------
    pandas.DataFrame
        Columns ``class_name``, ``precision``, ``recall``, ``f1-score`` and
        ``support``; ``class_name`` holds strings.
    """
    report = classification_report(t, p, *args, **kwargs)
    rows = list()
    for line in report.split('\n')[1:]:
        fields = line.split()
        if len(fields) != 5:
            # Blank lines and lines with a different layout are not per-class rows.
            continue
        class_name, precision, recall, f1, support = fields
        try:
            rows.append((class_name, float(precision), float(recall), float(f1), int(support)))
        except ValueError:
            # Only non-numeric fields are skipped; other errors now propagate
            # instead of being swallowed by a bare except.
            continue
    return pd.DataFrame(columns=['class_name', 'precision', 'recall', 'f1-score', 'support'], data=rows)