In [0]:
import numpy as np
import re
import scipy
import sklearn.datasets
from scipy.sparse import spmatrix, coo_matrix

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC

In [0]:
def clean_Up(string):
    """
    Tokenization/string cleaning for all datasets except for SST.
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
    """
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"n\'t", " n\'t", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", " \( ", string)
    string = re.sub(r"\)", " \) ", string)
    string = re.sub(r"\?", " \? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower()


def load_data():
    # Load data from files
    positive_examples = list(open("/rt-polarity.pos", encoding='latin-1').readlines())
    positive_examples = [s.strip() for s in positive_examples]
    negative_examples = list(open("/rt-polarity.neg", encoding='latin-1').readlines())
    negative_examples = [s.strip() for s in negative_examples]
    # Split by words
    x_text = positive_examples + negative_examples
    x_text = [clean_Up(sent) for sent in x_text]
#     x_text = [s.split(" ") for s in x_text]
    # Generate labels
    positive_labels = [1 for _ in positive_examples]
    negative_labels = [0 for _ in negative_examples]
    y = np.concatenate([positive_labels, negative_labels], 0)
    return [x_text, y]


In [6]:
data,labels=load_data()
x_train,x_valid,y_train,y_valid=sklearn.model_selection.train_test_split(data,labels,train_size=0.9,shuffle=True)



In [7]:
len(x_train)

9595

In [0]:
vectorizer=CountVectorizer(ngram_range=(1,2),binary=True)
X=vectorizer.fit_transform(x_train)
X_valid=vectorizer.transform(x_valid)

In [9]:
X.shape

(9595, 114907)

In [0]:
def NBSVM(X,X_valid,y,y_valid):
    p = np.asarray(1 + X[y == 1].sum(axis=0)).flatten()
    q = np.asarray(1 + X[y == 0].sum(axis=0)).flatten()
    r = np.log(p/np.abs(p).sum()) - np.log(q/np.abs(q).sum())
    b = np.log((y == 1).sum()) - np.log((y == 0).sum())
    
    indices = np.arange(len(r))
    r_sparse = coo_matrix(
        (r, (indices, indices)),
        shape=(len(r), len(r))
    )
    X_scaled = X*r_sparse
    lsvc = LinearSVC(penalty='l2',dual=True, tol=1,C=1,max_iter=10000)
    lsvc.fit(X_scaled,y)
    pred=lsvc.predict(X_scaled)
    print("SVM train: ", sklearn.metrics.accuracy_score(y, pred))
    y_pred=lsvc.predict(X_valid*r_sparse)
    print ("SVM Accuracy: ", sklearn.metrics.accuracy_score(y_valid, y_pred))

In [12]:
import time
start=time.time();
NBSVM(X,X_valid,y_train,y_valid)
print("NBSVM runtime: ", time.time()-start)

SVM train:  1.0
SVM Accuracy:  0.8078725398313027
NBSVM runtime:  0.09062004089355469
