In [1]:
from __future__ import print_function, division
from graph_tool.all import *
%pylab inline
import numpy as np
import pandas as pd
import seaborn as sns
from bokeh.plotting import *
import random

from bokeh.io import output_notebook
output_notebook()
from bokeh.charts import Scatter
from bokeh.models import HoverTool, ColumnDataSource 
from collections import Counter, defaultdict
from operator import itemgetter
from textstat.textstat import textstat
from termcolor import colored
import os
import copy
import re
import joblib

from keras.models import Sequential
from keras.layers.core import Activation, Dropout, Dense, Flatten, Lambda
from keras.layers.advanced_activations import LeakyReLU
from keras.layers.normalization import BatchNormalization
from keras.layers.embeddings import Embedding
from keras.layers.core import Reshape
from keras.layers.pooling import MaxPooling2D, AveragePooling2D
from keras.engine import Input
from keras.layers import Merge
from keras.layers import merge
from keras.layers import LSTM, GRU
from keras.models import Model
from keras.utils.np_utils import to_categorical
from keras.layers.convolutional import Convolution2D
from keras.regularizers import WeightRegularizer
from keras.optimizers import RMSprop, SGD
from keras.regularizers import l2, activity_l2, activity_l1, l1l2, l1
from keras.callbacks import EarlyStopping
from keras import backend as K

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import make_scorer, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import StratifiedShuffleSplit, StratifiedKFold
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.metrics import accuracy_score
from sklearn.manifold import TSNE

from nltk.stem.lancaster import LancasterStemmer
from nltk.stem.porter import PorterStemmer

from tqdm import tqdm, tqdm_pandas
import string
allowed_chars = set(string.ascii_lowercase) | {' '}
from wordcloud import WordCloud

import warnings
warnings.filterwarnings('ignore')

Populating the interactive namespace from numpy and matplotlib


Using Theano backend.


In [None]:
def remove_text_inside_brackets(text):
    """Return `text` with every ``[...]`` segment (brackets included) deleted.

    An opening bracket without a matching closing bracket is left untouched.
    """
    bracketed_segment = re.compile(r'\[[^\]]*\]')
    return bracketed_segment.sub('', text)

In [2]:
def text_cleanup(text, filter_w=None, min_len=None):
    """Normalize a raw text field into lowercase, space-separated words.

    Steps, in order: lowercase; drop the 'videolectures net' and ' nan '
    markers; remove bracketed segments; replace every character outside
    `allowed_chars` with a space; optionally keep only words contained in
    `filter_w`; optionally keep only words of at least `min_len` characters;
    collapse runs of spaces into a single space.

    Parameters
    ----------
    text : str or float
        A float (e.g. a NaN placeholder for a missing value) yields ''.
    filter_w : container of str, optional
        When given, only words found in it are kept.
    min_len : int, optional
        When given, shorter words are dropped.

    Returns
    -------
    str
    """
    if isinstance(text, float):
        return ''
    cleaned = text.lower().replace('videolectures net', ' ').replace(' nan ', ' ')
    #cleaned = ' '.join((stemmer.stem(i.decode('utf-8')) for i in cleaned.split()))
    if cleaned == 'nan':
        cleaned = ''
    cleaned = remove_text_inside_brackets(cleaned)
    cleaned = ''.join(ch if ch in allowed_chars else ' ' for ch in cleaned)
    if filter_w is not None:
        cleaned = ' '.join(word for word in cleaned.split() if word in filter_w)
    if min_len is not None:
        cleaned = ' '.join(word for word in cleaned.split() if len(word) >= min_len)
    cleaned = cleaned.replace('\n', ' ').replace('\t', ' ')
    # Squeeze any run of two or more spaces down to exactly one.
    return re.sub(r' {2,}', ' ', cleaned)

In [3]:
def gen_weights(y):
    """Compute class-balancing sample weights for binary labels.

    Samples with a label > 0 form the positive class; all remaining samples
    form the negative class. Each class receives a total weight of
    ``len(y) / 2`` spread evenly over its members, so both classes contribute
    equally and the weights sum to ``len(y)`` when both classes are present.

    The previous implementation built its masks from the freshly zeroed
    weight vector rather than from `y`, and divided by the mask length
    instead of the class count, so every sample always got weight 0.5.

    Parameters
    ----------
    y : array-like
        Binary labels, 1-D (an ``(n, 1)`` column vector is flattened).

    Returns
    -------
    numpy.ndarray
        Float array of length ``len(y)`` with one weight per sample.
    """
    labels = np.asarray(y)
    if labels.ndim > 1 and labels.size == len(labels):
        # Column vector such as (n, 1): flatten so the masks match `w`.
        labels = labels.reshape(-1)
    w = np.zeros(len(labels))
    w_pos = labels > 0
    w_neg = ~w_pos
    for mask in (w_pos, w_neg):
        count = mask.sum()
        # Skip an absent class instead of dividing by zero.
        if count:
            w[mask] = len(labels) / 2. / count
    return w

In [4]:
def plot_nn_hist(hist, secondary_y=False):
    """Plot the training and validation loss curves of a fit history.

    Parameters
    ----------
    hist : object
        Must expose a ``history`` mapping with 'loss' and 'val_loss'
        sequences (presumably a Keras History object -- confirm with caller).
    secondary_y : bool
        When true, the validation curve is drawn on a secondary y axis.
    """
    history = hist.history
    loss_pairs = zip(history['loss'], history['val_loss'])
    hist_df = pd.DataFrame(data=loss_pairs, columns=['train', 'val'])
    if secondary_y:
        right_axis = ['val']
    else:
        right_axis = None
    hist_df.plot(y=['train', 'val'], secondary_y=right_axis)
    plt.show()

In [5]:
def plot_prediction_violins(y_true, y_pred):
    """Show one violin per true label with the distribution of predictions.

    `y_true` and `y_pred` are paired element-wise; the true value goes on the
    x axis and the predicted value on the y axis.
    """
    paired = zip(y_pred, y_true)
    pred = pd.DataFrame(data=paired, columns=['prediction', 'true'])
    sns.violinplot(data=pred, x='true', y='prediction')
    plt.show()

In [6]:
class Digitizer:
    """Map words to integer ids, assigning a new id on first sight.

    Key 0 is seeded in the vocabulary so the first real word receives id 1;
    the value 0 is used as the padding id by `series_digitizer`.
    """

    def __init__(self):
        # The default factory hands out the current vocabulary size as the new id.
        self.word_dict = defaultdict(lambda: len(self.word_dict))
        self.word_dict[0] = None

    def series_digitizer(self, text_series, max_len=-1):
        """Digitize every text in a Series and right-pad with 0 to equal length.

        The padded length is the longest digitized row actually observed,
        after truncation to `max_len` words when `max_len` is non-negative.
        """
        encoded = text_series.apply(self.digitize, args=(max_len,)).astype('object')
        target_len = encoded.apply(len).max()

        def pad(ids):
            if len(ids) == target_len:
                return ids
            return ids + [0] * (target_len - len(ids))

        return encoded.apply(pad)

    def digitize(self, text, max_len=-1):
        """Return the ids of the whitespace-separated words in `text`.

        A negative `max_len` keeps all words; otherwise only the first
        `max_len` words are looked up (and thus added to the vocabulary).
        """
        tokens = text.split()
        if max_len >= 0:
            tokens = tokens[:max_len]
        return [self.word_dict[token] for token in tokens]

    def num_words(self):
        """Return the vocabulary size, counting the reserved key 0."""
        return len(self.word_dict)

In [7]:
def score(y_true, y_pred, classes, score_f):
    """Evaluate `score_f` one-vs-rest for each class.

    For every class label, both inputs are compared against that label and
    `score_f(true_is_class, pred_is_class)` is called on the results.

    Returns
    -------
    list
        One score per entry of `classes`, in the same order.
    """
    return [score_f(y_true == label, y_pred == label) for label in classes]

In [8]:
def evaluate(y_true, y_pred):
    """Compute per-class F1, precision and recall for rounded predictions.

    Predictions are rounded to the nearest integer before scoring, and the
    classes are the sorted distinct values of `y_true`.

    Returns
    -------
    list of (str, score)
        Entries labelled 'f1(<class>)' for every class, then
        'precision(<class>)', then 'recall(<class>)'.
    """
    y_pred_int = np.round(y_pred).astype('int')
    classes = sorted(set(y_true))
    metrics = (
        ('f1(', f1_score),
        ('precision(', precision_score),
        ('recall(', recall_score),
    )
    data = []
    for prefix, metric in metrics:
        per_class = score(y_true, y_pred_int, classes, metric)
        for label, value in zip(classes, per_class):
            data.append((prefix + str(label) + ')', value))
    return data

In [None]:
def viz_res(data, title=''):
    """Draw a bar chart from (type, value) pairs.

    Parameters
    ----------
    data : sequence of (type, value) pairs
        One bar per type, with x tick labels rotated by 90 degrees.
    title : str
        Figure title.
    """
    results = pd.DataFrame(data=data, columns=['type', 'value'])
    sns.barplot(data=results, x='type', y='value')
    plt.xticks(rotation=90)
    plt.title(title)
    plt.show()

In [None]:
def plot_tfidf_vals(vectorized_text):
    """Show three diagnostic plots of a tf-idf matrix.

    1. A log-scaled histogram of all strictly positive entries.
    2. The distribution of the maximum along axis 1 (titled per doc).
    3. The distribution of the maximum along axis 0 (titled per term).

    `vectorized_text` must support ``copy().ravel()`` and ``max(axis=...)``.
    """
    flat_vals = pd.DataFrame(data=vectorized_text.copy().ravel())
    positive_vals = flat_vals[flat_vals[0] > 0]
    positive_vals.plot(kind='hist', bins=200, logy=True)
    plt.show()

    panels = (
        (1, 'max tfidf vals per doc'),
        (0, 'max tfidf vals per term'),
    )
    for axis, title in panels:
        max_vals = pd.DataFrame(data=vectorized_text.max(axis=axis).ravel())
        sns.distplot(max_vals)
        plt.title(title)
        plt.show()

In [None]:
def filter_tfidf_max(tfidf, words, min_th=None, max_th=None):
    """Keep only the columns whose maximum value lies within the thresholds.

    Parameters
    ----------
    tfidf : 2-D array, numpy matrix or scipy sparse matrix
        Score matrix with one column per entry of `words`. It is copied, so
        the caller's object is not modified.
    words : sequence of str
        Column labels, aligned with the columns of `tfidf`.
    min_th : float, optional
        Drop columns whose maximum is below this value.
    max_th : float, optional
        Drop columns whose maximum is above this value. Applied after
        `min_th`, on the already reduced matrix.

    Returns
    -------
    (tfidf, words)
        The filtered matrix and the matching labels as a numpy array.
    """
    def _flat_mask(mask):
        # Comparisons on sparse matrices return sparse results, and on
        # numpy matrices a 2-D matrix; normalise both to a 1-D bool array.
        # This replaces a bare `except:` that swallowed every error.
        if hasattr(mask, 'todense'):
            mask = mask.todense()
        return np.asarray(mask).ravel()

    tfidf = tfidf.copy()
    words = np.array(words)
    if min_th is not None:
        keep = _flat_mask(tfidf.max(axis=0) >= min_th)
        words = words[keep]
        tfidf = tfidf[:, keep]
    if max_th is not None:
        keep = _flat_mask(tfidf.max(axis=0) <= max_th)
        words = words[keep]
        tfidf = tfidf[:, keep]
    assert len(words) == tfidf.shape[1]
    return tfidf, words

In [9]:
def filter_word_rep(tfidf, words):
    """Drop terms (and their columns) that repeat a word within the term.

    A term is split on whitespace; it is kept when it is a single word or
    when all of its words are distinct.

    Returns
    -------
    (tfidf, words)
        The column-filtered matrix and the kept terms as a numpy array.
    """
    def has_no_repeated_word(term):
        tokens = term.split()
        return len(tokens) == 1 or len(set(tokens)) == len(tokens)

    words = np.array(words)
    bidx = np.array([has_no_repeated_word(term) for term in words], dtype=bool)
    return tfidf[:, bidx], words[bidx]

In [10]:
def wordcloud(text_freq, **kwargs):
    """Render and display a word cloud built from term frequencies.

    Parameters
    ----------
    text_freq :
        Frequencies passed unchanged to ``WordCloud.generate_from_frequencies``.
    **kwargs :
        Forwarded to the ``WordCloud`` constructor.
    """
    generator = WordCloud(**kwargs)
    cloud_image = generator.generate_from_frequencies(text_freq)
    plt.imshow(cloud_image)
    plt.axis("off")
    plt.show()

In [None]:
def get_prereq_graph(title_topic, topic_prereq, draw=True):
    """Build a directed graph of titles linked through topic prerequisites.

    Each title becomes a vertex. For every (title, topic) pair, an edge is
    added from each title that belongs to a prerequisite topic of `topic`
    to that title.

    Parameters
    ----------
    title_topic : iterable of (title, topic) pairs
        Iterated twice, so it must be re-iterable (not a one-shot generator).
        Topics are stored in an 'int' vertex property.
    topic_prereq : iterable of (topic, prereq) pairs
        `prereq` is looked up as a topic key and stored in a 'float' edge
        property.
    draw : bool
        When true, lay the graph out and render it to 'prereq_graph.png'.

    Returns
    -------
    The constructed graph. When `draw` is true the layout is also stored as
    the vertex property 'pos'.
    """
    g = Graph(directed=True)
    # Looking up an unseen title calls g.add_vertex(), so every distinct
    # title maps to exactly one vertex.
    v_dict = defaultdict(g.add_vertex)
    
    # topic -> all titles assigned to that topic
    topic_to_titles = defaultdict(list)
    for title, topic in title_topic:
        topic_to_titles[topic].append(title)

    # topic -> list of its prerequisite topics
    topic_prereq_dict = defaultdict(list)
    for topic, prereq in topic_prereq:
        topic_prereq_dict[topic].append(prereq)
    v_text = g.new_vertex_property('string')
    v_topic = g.new_vertex_property('int')
    e_c = g.new_edge_property('float')
    for title, topic in title_topic:
        v = v_dict[title]
        v_text[v] = title
        v_topic[v] = topic
        prereq = topic_prereq_dict[topic]
        for p in prereq:
            p_titles = topic_to_titles[p]
            for p_t in p_titles:
                p_v = v_dict[p_t]
                # Edge points from the prerequisite title to the dependent title.
                e = g.add_edge(p_v, v)
                # Edge value is the prerequisite topic; used for edge color below.
                e_c[e] = p
    print(g)
    if draw:
        #pos = arf_layout(g)
        # Layout grouped by topic.
        pos = sfdp_layout(g, groups=v_topic, C=4, p=3, mu_p=.9)
        g.vp['pos'] = pos
        # Rescaled out-degree; passed to graph_draw as the vertex drawing order.
        deg = g.degree_property_map('out')
        deg.a = 4 * (np.sqrt(deg.a) * 0.5 + 0.4)
        # PageRank is computed on the edge-reversed graph, then the direction
        # is restored. NOTE(review): `pr` is never used afterwards.
        g.set_reversed(True)
        pr = pagerank(g)
        g.set_reversed(False)

        # Second element of the betweenness() result, scaled to [0, 10] and
        # used as edge pen width (presumably edge betweenness -- confirm
        # against the graph_tool docs).
        # NOTE(review): the in-place normalizations below divide by the
        # maximum, which is 0 when all values are 0 -- confirm inputs.
        ebet = betweenness(g)[1]
        ebet.a /= ebet.a.max()
        ebet.a *= 10.
        e_c.a /= e_c.a.max()
        # Vertex fill color: topic value normalized by the largest topic value.
        v_c = g.new_vertex_property('float')
        v_c.a = v_topic.a.astype('float')
        v_c.a /= v_c.a.max()
        graph_draw(g, pos, output_size=(15000, 15000), 
                   vertex_text=v_text, inline=True, 
                   edge_color=e_c,
                   edge_pen_width=ebet,
                   output='prereq_graph.png', vorder=deg,
                   vertex_fill_color=v_c, marker_size=50, bg_color=[1.,1., 1., 1.])
    return g

In [None]:
def plot_topic_coocurrence(transformed):
    """Show a heatmap of ``transformed.T.dot(transformed)`` with a zeroed diagonal.

    The diagonal is cleared so that only the off-diagonal (pairwise) entries
    drive the color scale.
    """
    cooccurrence = transformed.T.dot(transformed)
    # Zero the diagonal in place before plotting.
    np.fill_diagonal(cooccurrence, 0)
    sns.heatmap(cooccurrence)
    plt.show()