In [1]:
# imports
import numpy as np
from scipy.linalg import eigh
import string

from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.decomposition import PCA

In [2]:
# stolen from notebook9
filename = "glove.6B.50d.txt"

# Map each word to its float32 vector. Every line of the file is split on
# whitespace: the first token is the word, the remaining tokens are the
# vector components (presumably 50 per line, going by the file name).
embeddings = {}
with open(filename, mode='r', encoding='utf-8') as file:
    for line in file:
        elements = line.split()
        word, vector = elements[0], np.asarray(elements[1:], dtype="float32")
        embeddings[word] = vector

In [4]:
# functions

def preprocess(text, vocab=None):
    """
    Strip punctuation, lowercase every word and keep only known words.

    Parameters:
    text (string): raw text from data
    vocab (container, optional): collection of known words used for the
            membership test; defaults to the module-level embeddings

    Returns:
    string: space-separated lowercase words with no punctuation,
            all of which are contained in vocab
    """
    if vocab is None:
        vocab = embeddings

    # Remove every punctuation character in a single pass.
    no_punctuation = text.translate(str.maketrans("", "", string.punctuation))

    # Lowercase BEFORE the vocabulary lookup: testing the original casing
    # against lowercase keys would silently drop capitalised words.
    lowered = (word.lower() for word in no_punctuation.split())
    return " ".join(word for word in lowered if word in vocab)

def mean_emb(text):
    """
    Average the embedding vectors of the known words in a text.

    Parameters:
    text (string): raw text from data

    Returns:
    numpy.ndarray: float64 mean of the embeddings of the words kept by
            preprocess (shape (50,) for 50-dimensional vectors);
            an all-zero vector when no word is kept
    """
    words = preprocess(text).split()
    # Guard the lookup so a word that reaches this point without an
    # embedding under its current casing cannot raise KeyError.
    vectors = [embeddings[word] for word in words if word in embeddings]

    if not vectors:
        # Nothing to average: return zeros rather than divide by zero.
        # The length is taken from any stored vector, falling back to 50.
        dim = len(next(iter(embeddings.values()), np.zeros(50)))
        return np.zeros(dim)

    # A mean divides by the number of words, not by the embedding dimension.
    return np.mean(vectors, axis=0, dtype=np.float64)

def gnb_acc(train, test, embeddings=embeddings):
    """
    Train a Gaussian naive Bayes classifier on mean text embeddings and
    score it on held-out data.

    Parameters:
    train (dict): 'texts' and 'labels' of the training data
    test (dict): 'texts' and 'labels' of the test data
    embeddings (dict): glove embeddings; accepted but not read by this
            body, since mean_emb is called with the text only

    Returns:
    numpy.float64: accuracy score on the test data
    """
    # One mean-embedding row per text, for each split.
    features_train = np.array(list(map(mean_emb, train['texts'])))
    features_test = np.array(list(map(mean_emb, test['texts'])))

    model = GaussianNB()
    model.fit(features_train, train['labels'])

    return accuracy_score(test['labels'], model.predict(features_test))

def pca_from_data(train_data, n_components=2):
    """
    Fit a PCA on the mean embeddings of the training texts.

    Parameters:
    train_data (dict): training data; only its 'texts' entry is read
    n_components (int): number of components passed to PCA

    Returns:
    the value returned by PCA.fit on the mean-embedding matrix
    """
    features = np.array(list(map(mean_emb, train_data['texts'])))
    return PCA(n_components=n_components).fit(features)

In [5]:
## Spam data

In [6]:
# Load the spam archive and regroup its arrays into one dict per split.
spam_data = np.load('../spam_data.npz')
spam_train = {
    'texts': spam_data['train_texts'],
    'labels': spam_data['train_labels'],
}
spam_test = {
    'texts': spam_data['test_texts'],
    'labels': spam_data['test_labels'],
}

In [7]:
# GaussianNB accuracy on the spam test split, using mean-embedding features.
spam_gnb_acc = gnb_acc(spam_train, spam_test)

In [8]:
# PCA with 2 components fitted on the mean embeddings of the spam training texts.
spam_emb_pca = pca_from_data(spam_train, n_components=2)

## News data

In [10]:
# Load the news archive and regroup its arrays into one dict per split.
news_data = np.load('../news_data.npz')
news_train = {
    'texts': news_data['train_texts'],
    'labels': news_data['train_labels'],
}
news_test = {
    'texts': news_data['test_texts'],
    'labels': news_data['test_labels'],
}

In [11]:
# GaussianNB accuracy on the news test split, using mean-embedding features.
news_gnb_acc = gnb_acc(news_train, news_test)

In [12]:
# PCA with 20 components fitted on the mean embeddings of the news training texts.
news_emb_pca = pca_from_data(news_train, n_components=20)

In [None]:
# Display the GaussianNB accuracy computed for the news data.
news_gnb_acc