In [None]:
from gensim.models import Word2Vec
from nltk.corpus import brown
# Train a gensim Word2Vec model on the sentences of the NLTK Brown corpus,
# leaving every hyperparameter at the library default.
# NOTE(review): `b` is not referenced by any other cell visible here; the later
# cells read a model named `word2vec` instead — confirm which one is intended.
b = Word2Vec(brown.sents())

In [None]:
def get_average_word2vec(tokens_list, vector, generate_missing=False, k=300):
    """Return the element-wise mean of the embedding vectors of some tokens.

    Parameters
    ----------
    tokens_list : sequence of str
        Tokens to embed.
    vector : mapping-like
        Anything supporting ``word in vector`` and ``vector[word]``.
    generate_missing : bool, optional
        When truthy, each token missing from ``vector`` is replaced by a fresh
        ``np.random.rand(k)`` draw; otherwise it is replaced by ``np.zeros(k)``.
    k : int, optional
        Length of the filler vectors (and of the result for empty input).
        It has to match the length of the vectors stored in ``vector`` for
        the summation to line up.

    Returns
    -------
    numpy.ndarray
        ``np.zeros(k)`` when ``tokens_list`` is empty, otherwise the sum of
        the per-token vectors divided by the number of tokens.
    """
    if len(tokens_list) < 1:
        return np.zeros(k)

    # Choose the out-of-vocabulary filler once instead of branching per token.
    make_filler = np.random.rand if generate_missing else np.zeros

    word_vectors = []
    for token in tokens_list:
        if token in vector:
            word_vectors.append(vector[token])
        else:
            word_vectors.append(make_filler(k))

    total = np.sum(word_vectors, axis=0)
    return np.divide(total, len(word_vectors))

def get_word2vec_embeddings(vectors, clean_questions, generate_missing=False):
    """Compute one averaged embedding per row of ``clean_questions``.

    Each entry of ``clean_questions['tokens']`` is passed to
    ``get_average_word2vec`` together with ``vectors`` and
    ``generate_missing``; the filler length ``k`` is left at that function's
    default.

    Returns
    -------
    list
        The per-row results, in row order.
    """
    def embed_tokens(tokens):
        return get_average_word2vec(tokens, vectors, generate_missing=generate_missing)

    per_row = clean_questions['tokens'].apply(embed_tokens)
    return list(per_row)

In [None]:
# Average-embed every row of `df` with the `word2vec` model; out-of-vocabulary
# handling is left at the helper's default (generate_missing=False).
embeddings = get_word2vec_embeddings(word2vec, df)
# Hold out 20% of the (embedding, label) pairs; random_state is fixed so the
# split is repeatable.
X_train_word2vec, X_test_word2vec, y_train_word2vec, y_test_word2vec = train_test_split(embeddings, list_labels, 
                                                                                        test_size=0.2, random_state=40)

In [None]:
# Open a 16x16 figure and let `plot_LSA` (defined outside this excerpt) draw
# the embeddings against their labels.
# NOTE(review): presumably a 2-D LSA projection coloured by label — confirm
# against the helper's definition.
fig = plt.figure(figsize=(16, 16))          
plot_LSA(embeddings, list_labels)
plt.show()

In [None]:
# Multinomial logistic regression on the averaged embeddings: C=30.0,
# balanced class weights, newton-cg solver, fixed random_state.
clf_w2v = LogisticRegression(C=30.0, class_weight='balanced', solver='newton-cg', 
                         multi_class='multinomial', random_state=40)
# Fit on the training split, then predict labels for the held-out split.
clf_w2v.fit(X_train_word2vec, y_train_word2vec)
y_predicted_word2vec = clf_w2v.predict(X_test_word2vec)

In [None]:
# `get_metrics` is defined outside this excerpt; its four return values are
# unpacked here as accuracy, precision, recall and f1 for the held-out split.
accuracy_word2vec, precision_word2vec, recall_word2vec, f1_word2vec = get_metrics(y_test_word2vec, y_predicted_word2vec)
# Report all four scores to three decimal places.
print("accuracy = %.3f, precision = %.3f, recall = %.3f, f1 = %.3f" % (accuracy_word2vec, precision_word2vec, 
                                                                       recall_word2vec, f1_word2vec))

In [None]:
# Confusion matrix of the Word2Vec classifier on the held-out split.
cm_w2v = confusion_matrix(y_test_word2vec, y_predicted_word2vec)
fig = plt.figure(figsize=(10, 10))
# Tick labels passed to the plot, one per grade band.
grade_categories = ('Preschool/Pre-K','K-2','3-5', '6-8', '9-12')
# `plot_confusion_matrix` is defined outside this excerpt; it is asked for a
# normalised rendering of the matrix.
plot = plot_confusion_matrix(cm_w2v, classes=grade_categories, normalize=True, title='Confusion matrix')
plt.show()
# Print the raw matrices for comparison. `cm2` and `cm` are not defined in the
# visible cells; going by the labels printed here they are the TF-IDF and
# bag-of-words matrices — confirm against the cells that create them.
print("Word2Vec confusion matrix")
print(cm_w2v)
print("TFIDF confusion matrix")
print(cm2)
print("BoW confusion matrix")
print(cm)

In [None]:
# Flatten the confusion matrix `cm2` into one record per cell: `x` is the
# column index, `y` the row index, and `color` a CSS colour whose blue channel
# scales with the cell's count.
# The colour function is spelled `rgb(...)`; the previous `rbg(...)` spelling
# is not a valid colour function, so every record carried an unusable colour.
# NOTE(review): 321 is the scaling denominator — presumably the largest count
# in `cm2`; confirm, or derive it from the matrix.
res = []
for idy, row in enumerate(cm2):
    for idx, col in enumerate(row):
        res.append({'x': idx, 'y': idy, 'color': 'rgb(0, 0, {})'.format(col * 255 / 321)})
res

In [None]:
from lime import lime_text
from sklearn.pipeline import make_pipeline
from lime.lime_text import LimeTextExplainer

# Split the raw corpus with the same test_size and random_state as the
# embeddings split earlier in the notebook.
# NOTE(review): presumably so that indices into X_test_data line up with
# X_test_word2vec — confirm that list_corpus and the embeddings share row order.
X_train_data, X_test_data, y_train_data, y_test_data = train_test_split(list_corpus, list_labels, test_size=0.2, 
                                                                                random_state=40)
# Module-level handle on the embedding model that word2vec_pipeline reads.
vector_store = word2vec
def word2vec_pipeline(examples):
    """Map raw text examples to class probabilities from ``clf_w2v``.

    Each example is split into ``\\w+`` tokens, averaged into a single vector
    with ``get_average_word2vec`` (reading the module-level ``vector_store``,
    ``generate_missing=False``, ``k=300``), and the resulting vectors are
    passed to ``clf_w2v.predict_proba``.

    Parameters
    ----------
    examples : iterable of str
        Raw texts to score.

    Returns
    -------
    Whatever ``clf_w2v.predict_proba`` returns for the averaged vectors.
    """
    word_tokenizer = RegexpTokenizer(r'\w+')
    averaged_vectors = [
        get_average_word2vec(word_tokenizer.tokenize(text), vector_store, generate_missing=False, k=300)
        for text in examples
    ]
    return clf_w2v.predict_proba(averaged_vectors)

# Chain `count_vectorizer` into `clf` as a single scikit-learn pipeline.
# NOTE(review): `c` is not referenced by any other cell visible here, and both
# inputs are defined outside this excerpt — confirm this is still needed.
c = make_pipeline(count_vectorizer, clf)

In [None]:
def explain_one_instance(instance, class_names):
    """Build a LIME text explanation for a single example.

    Parameters
    ----------
    instance : str
        Raw text to explain.
    class_names :
        Passed straight to ``LimeTextExplainer(class_names=...)``.

    Returns
    -------
    The object returned by ``explain_instance``, computed with
    ``word2vec_pipeline`` as the classifier function, ``num_features=6`` and
    the five labels 0 through 4.
    """
    lime_explainer = LimeTextExplainer(class_names=class_names)
    return lime_explainer.explain_instance(
        instance,
        word2vec_pipeline,
        num_features=6,
        labels=list(range(5)),
    )

def visualize_one_exp(features, labels, index, class_names = ('Preschool/Pre-K','K-2','3-5', '6-8', '9-12',)):
    """Print and render the LIME explanation for one example.

    Parameters
    ----------
    features : sequence
        Examples; ``features[index]`` is the one explained.
    labels : sequence
        True labels; ``labels[index]`` is used to index ``class_names``.
    index : int
        Position of the example to explain.
    class_names : sequence of str, optional
        Display names, looked up by label value.

    The explanation is computed first; the index and true class name are then
    printed and the explanation is rendered with ``show_in_notebook``.
    """
    explanation = explain_one_instance(features[index], class_names=class_names)

    for line in ('Index: %d' % index,):
        print(line)
    # The class lookup stays after the index line has been printed, so a
    # failing lookup still leaves the index visible in the output.
    print('True class: %s' % class_names[labels[index]])

    explanation.show_in_notebook(text=True)

In [None]:
# Explain held-out example 63 with the default grade-band class names.
visualize_one_exp(X_test_data, y_test_data, 63)

In [None]:
# Explain held-out example 60 with the default grade-band class names.
visualize_one_exp(X_test_data, y_test_data, 60)

In [None]:
import random
from collections import defaultdict

# Seed Python's built-in `random` module, which `random.sample` in
# get_statistical_explanation draws from. This does not seed NumPy.
random.seed(40)

def get_statistical_explanation(test_set, sample_size, word2vec_pipeline, label_dict):
    """Aggregate LIME word contributions over a random sample of sentences.

    Parameters
    ----------
    test_set : sequence of str
        Pool of sentences to sample from (``random.sample`` raises
        ``ValueError`` if ``sample_size`` exceeds its length).
    sample_size : int
        Number of sentences to draw; also the denominator of the averages.
    word2vec_pipeline : callable
        Takes a list of sentences and returns one row of class probabilities
        per sentence; each row must support ``.argmax()``.
    label_dict : mapping
        Maps a predicted label to the key used in the result.

    Returns
    -------
    dict
        ``{label_dict[label]: {'detractors': Series, 'supporters': Series}}``
        for every label predicted at least once with a non-empty explanation.
        Both Series are float-typed, indexed by word, and hold the same
        values; ``detractors`` is sorted ascending and ``supporters``
        descending.
    """
    sample_sentences = random.sample(test_set, sample_size)
    explainer = LimeTextExplainer()

    # label -> word -> list of LIME weights observed for that word
    contributors = defaultdict(lambda: defaultdict(list))

    # First, find contributing words to each class
    for sentence in sample_sentences:
        probabilities = word2vec_pipeline([sentence])
        curr_label = probabilities[0].argmax()
        exp = explainer.explain_instance(sentence, word2vec_pipeline, num_features=6, labels=[curr_label])
        for word, contributing_weight in exp.as_list(label=curr_label):
            contributors[curr_label][word].append(contributing_weight)

    # Average each word's contribution to a class, and sort them by impact.
    sorted_contributions = {}
    for label, lexica in contributors.items():
        # Build the Series from a dict with an explicit float dtype: a Series
        # created from an index alone defaults to object dtype on current
        # pandas, which would store and sort the scores as generic objects.
        # The denominator is the total sample size, not the number of times
        # the word appeared, so rarely seen words are weighted down.
        averaged = pd.Series(
            {word: np.sum(scores) / sample_size for word, scores in lexica.items()},
            dtype=float,
        )
        sorted_contributions[label_dict[label]] = {
            'detractors': averaged.sort_values(),
            'supporters': averaged.sort_values(ascending=False),
        }
    return sorted_contributions

# Display name for each integer class label 0-4.
label_to_text = dict(enumerate(('Preschool/Pre-K', 'K-2', '3-5', '6-8', '9-12')))

# Aggregate LIME contributions over 5 sampled held-out sentences.
sorted_contributions = get_statistical_explanation(
    test_set=X_test_data,
    sample_size=5,
    word2vec_pipeline=word2vec_pipeline,
    label_dict=label_to_text,
)

In [None]:
def plot_important_words_w2v(top_scores, top_words, bottom_scores, bottom_words, name, grade_category):
    """Draw two side-by-side horizontal bar charts of word scores.

    Parameters
    ----------
    top_scores, top_words : sequences of equal length
        Bar widths and tick labels for the right-hand "Relevant to" panel.
    bottom_scores, bottom_words : sequences of equal length
        Bar widths and tick labels for the left-hand "Irrelevant to" panel.
    name : str
        Figure-level title.
    grade_category : str
        Inserted into both panel titles.

    The figure is shown with ``plt.show()``; nothing is returned.
    """
    fig, (ax_bottom, ax_top) = plt.subplots(1, 2, figsize=(10, 10))
    # A single figure title: an earlier 'Key words' suptitle was always
    # overwritten by this one, so only `name` is set.
    fig.suptitle(name, fontsize=16)

    # Each panel gets its own y positions so the two word lists may differ
    # in length without the bar positions and widths mismatching.
    bottom_pos = np.arange(len(bottom_words))
    ax_bottom.barh(bottom_pos, bottom_scores, align='center', alpha=0.5)
    ax_bottom.set_title('Irrelevant to\n{}'.format(grade_category))
    ax_bottom.set_yticks(bottom_pos)
    ax_bottom.set_yticklabels(bottom_words)
    ax_bottom.set_xlabel('Importance')

    top_pos = np.arange(len(top_words))
    ax_top.barh(top_pos, top_scores, align='center', alpha=0.5)
    ax_top.set_title('Relevant to\n{}'.format(grade_category))
    ax_top.set_yticks(top_pos)
    ax_top.set_yticklabels(top_words)
    ax_top.set_xlabel('Importance')

    # Wide gap so the right panel's word labels do not overlap the left bars.
    fig.subplots_adjust(wspace=0.8)
    plt.show()

# Grade band to plot; must be a key of `sorted_contributions`, i.e. a band
# that was predicted for at least one sampled sentence.
grade_category = '9-12'
# First ten entries of each ranking: 'supporters' is sorted descending and
# 'detractors' ascending, so these are the ten highest and ten lowest scores.
top_words = sorted_contributions[grade_category]['supporters'][:10].index.tolist()
top_scores = sorted_contributions[grade_category]['supporters'][:10].tolist()
bottom_words = sorted_contributions[grade_category]['detractors'][:10].index.tolist()
bottom_scores = sorted_contributions[grade_category]['detractors'][:10].tolist()

# Render the two-panel bar chart for the chosen grade band.
plot_important_words_w2v(top_scores,
                         top_words,
                         bottom_scores,
                         bottom_words,
                         "Most important words for relevance",
                         grade_category=grade_category)