In [1]:
# load various models from scikit-learn's library
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier

# also get some metrics to try
from sklearn.metrics import r2_score, accuracy_score

from sklearn.datasets import fetch_20newsgroups_vectorized, fetch_20newsgroups

from sklearn.feature_extraction.text import TfidfVectorizer, HashingVectorizer

import re

import numpy as np

In [2]:
# Load the train and test splits of the 20 Newsgroups corpus.  Headers,
# footers and quoted replies are removed from every post in both splits.
NG_STRIP_PARTS = ("headers", "footers", "quotes")

ng_train_raw = fetch_20newsgroups(subset="train", remove=NG_STRIP_PARTS)
ng_test_raw = fetch_20newsgroups(subset="test", remove=NG_STRIP_PARTS)

In [3]:
def vectorize(vectorizer, train_raw=None, test_raw=None):
    """Turn the raw train/test documents into feature matrices.

    The vectorizer is fitted on the training documents only and then applied
    to the test documents, so no information from the test split leaks into
    the learned vocabulary / statistics.

    Parameters
    ----------
    vectorizer : object
        Any object exposing ``fit_transform(docs)`` and ``transform(docs)``
        (e.g. ``TfidfVectorizer`` or ``HashingVectorizer``).
    train_raw, test_raw : object, optional
        Datasets exposing ``.data`` (documents) and ``.target`` (labels).
        When omitted, the notebook-level ``ng_train_raw`` / ``ng_test_raw``
        are used, which keeps the original one-argument call working.

    Returns
    -------
    tuple of dict
        ``(train, test)`` where each dict has the keys ``"X"`` (features) and
        ``"y"`` (labels).
    """
    if train_raw is None:
        train_raw = ng_train_raw
    if test_raw is None:
        test_raw = ng_test_raw

    # fit_transform learns and applies the transformation in a single pass
    # over the training documents instead of two separate passes.
    train_x = vectorizer.fit_transform(train_raw.data)
    test_x = vectorizer.transform(test_raw.data)

    train = {"X": train_x, "y": train_raw.target}
    test = {"X": test_x, "y": test_raw.target}
    return train, test
    

In [11]:
# Candidate classifiers, keyed by a human-readable label.
# The estimators that use randomness during fitting get a fixed seed so that
# re-running the notebook reproduces the same scores.
RANDOM_STATE = 0

list_of_models = {
    "LogisticRegression": LogisticRegression(solver="lbfgs", n_jobs=-1),
    "RandomForestClassifier": RandomForestClassifier(n_jobs=-1, random_state=RANDOM_STATE),
    "KNeighborsClassifier": KNeighborsClassifier(),
    "GradientBoostingClassifier": GradientBoostingClassifier(random_state=RANDOM_STATE),
    "MLPClassifier": MLPClassifier(random_state=RANDOM_STATE),
    # Same MLP but with two hidden layers of 100 units each.
    "MLPClassifier x2": MLPClassifier(hidden_layer_sizes=(100, 100), random_state=RANDOM_STATE),
}

In [None]:
def model_eval(model):
    """Fit ``model`` on the training split and print train/test accuracy.

    Reads the notebook-level ``ng_train`` and ``ng_test`` dicts (keys ``"X"``
    and ``"y"``), so the result of ``vectorize`` must have been assigned to
    those names beforehand.  Nothing is returned; the scores are only printed.
    """
    model.fit(ng_train["X"], ng_train["y"])

    # Accuracy on the data the model was just fitted on.
    fitted_predictions = model.predict(X=ng_train["X"])
    train_score = accuracy_score(y_true=ng_train["y"], y_pred=fitted_predictions)
    print("Train score {0}:".format(train_score))

    # Accuracy on the held-out split.
    held_out_predictions = model.predict(X=ng_test["X"])
    test_score = accuracy_score(y_true=ng_test["y"], y_pred=held_out_predictions)
    print("Test score {0}:".format(test_score))

    # Blank line separating consecutive reports.
    print()

In [None]:
# Any maximal run of characters that are not word characters
# (letters, digits or underscore).
_NON_WORD_RUN = re.compile(r"\W+")


def preprocessor(x):
    """Replace every run of non-word characters in ``x`` with a single space.

    Because spaces are themselves non-word characters, a single substitution
    already guarantees that the result never contains two consecutive spaces;
    a second "collapse repeated spaces" pass would be a no-op.

    Leading/trailing separators become a single leading/trailing space, and
    letter case is left untouched.

    NOTE(review): scikit-learn's text vectorizers document that a custom
    ``preprocessor`` replaces their built-in lowercasing / accent stripping
    stage, so tokens stay case-sensitive when this function is used --
    confirm that this is intended.
    """
    return _NON_WORD_RUN.sub(" ", x)

# Feature extractors to compare, keyed by the label printed above their results.
# * ``preprocessor`` is passed directly (wrapping it in a lambda adds nothing).
# * ``ngram_range`` is a tuple, which is the type scikit-learn documents for it.
vectorizers = {
    "TF-IDF, unigrams": TfidfVectorizer(max_features=50000, preprocessor=preprocessor),
    "TF-IDF, bigrams": TfidfVectorizer(max_features=50000, preprocessor=preprocessor, ngram_range=(2, 2)),
    "Hashing, unigrams": HashingVectorizer(n_features=50000, preprocessor=preprocessor),
}

for vectorizer_name, vectorizer in vectorizers.items():
    # model_eval reads the notebook-level ng_train / ng_test, so they are
    # rebound for every feature extractor before the models are evaluated.
    ng_train, ng_test = vectorize(vectorizer)
    print(vectorizer_name)

    # Iterate over .items(): looping over the dict itself yields only the
    # string keys, and model_eval needs the estimator objects.
    for model_name, model in list_of_models.items():
        print(model_name)
        model_eval(model)

In [None]:
# Bigrams with hashing (no custom preprocessor is passed here).
# ``ngram_range`` is given as a tuple, the type scikit-learn documents for it.
ng_train, ng_test = vectorize(HashingVectorizer(n_features=50000, ngram_range=(2, 2)))

# Iterate over .items(): looping over the dict itself yields only the string
# keys, and model_eval needs the estimator objects.
for model_name, model in list_of_models.items():
    print(model_name)
    model_eval(model)

In [5]:
# Train a one-hidden-layer MLP with identity activation on hashed unigram
# features (English stop words removed, 2**18 hash buckets).  The fitted
# first-layer weights are what the following cells reuse as embeddings.
ng_train, ng_test = vectorize(HashingVectorizer(stop_words='english', n_features=2**18))

# A fixed seed makes the learned weights -- and therefore everything derived
# from them further down -- reproducible across runs.
model = MLPClassifier(hidden_layer_sizes=(100,), activation="identity", random_state=0)

# model_eval fits the model on ng_train and prints the train/test accuracy,
# replacing the hand-copied version of the same steps.
model_eval(model)

Train score 0.973660951034117:
Test score 0.6825544344131704:



In [6]:
# First entry of the fitted MLP's weight list: the matrix connecting the
# input features to the first hidden layer.
embeddings = model.coefs_[0]

# Show the shape first, then the (abbreviated) array itself.
print(embeddings.shape, embeddings, sep="\n")

(262144, 100)
[[ 3.69646000e-02 -1.09253902e-01  9.03193336e-02 ... -2.23141568e-02
  -9.29570301e-02 -1.48118969e-01]
 [ 4.96444005e-69  9.68094341e-69 -5.11246418e-67 ... -6.45638112e-67
   5.67443303e-68 -3.77074294e-67]
 [ 3.32964781e-68 -1.16161984e-68  5.18097787e-67 ...  6.52557179e-67
   5.76763022e-67  1.35773091e-68]
 ...
 [ 1.13088042e-01  1.16625357e-01 -1.65920495e-01 ... -1.18982418e-01
   2.33227120e-02  5.99042944e-02]
 [-2.07373212e-67  3.49972255e-67 -5.68474464e-67 ... -3.63965325e-67
  -3.60479684e-67 -9.59490359e-68]
 [-8.92083087e-69 -3.42089809e-68  1.82125711e-68 ... -6.15112787e-67
  -5.52895534e-67 -1.45803300e-67]]


In [7]:
# The inner dimensions must agree for (documents x features) @ embeddings.
# A sparse matrix exposes ``.shape`` directly, so there is no need to build a
# dense copy of the whole test matrix just to print its dimensions.
print(embeddings.shape)
print(ng_test["X"].shape)

# Row-wise chunks (1000 per split) of the densified feature matrices.
# ``toarray`` returns a plain ndarray, whereas ``todense`` returns the legacy
# ``np.matrix`` type, which would propagate through ``np.dot`` / ``np.vstack``
# into the arrays later handed to the estimators.
# NOTE(review): densifying still allocates rows * features floats per split.
train_split = np.array_split(ng_train["X"].toarray(), 1000)

test_split = np.array_split(ng_test["X"].toarray(), 1000)

(262144, 100)
(7532, 262144)


In [8]:
# Project every document onto the first-layer weights of the fitted MLP.
# The sparse matrix's own ``dot`` performs a sparse @ dense product, so no
# dense copy of the feature matrices is needed, and the result is a regular
# 2-D array with one row per document and one column per hidden unit.
# (``np.dot`` applied to a scipy sparse matrix does not perform a matrix
# product, which is why the sparse method is used instead.)
x_train_embed = np.asarray(ng_train["X"].dot(embeddings))
x_test_embed = np.asarray(ng_test["X"].dot(embeddings))

In [9]:
# Fit every candidate classifier on the embedded documents and report its
# accuracy on the embedded train and test sets.
for name, model in list_of_models.items():
    model.fit(x_train_embed, ng_train["y"])

    train_predictions = model.predict(X=x_train_embed)
    train_score = accuracy_score(y_true=ng_train["y"], y_pred=train_predictions)
    print("{0} train score {1}:".format(name, train_score))

    test_predictions = model.predict(X=x_test_embed)
    test_score = accuracy_score(y_true=ng_test["y"], y_pred=test_predictions)
    print("{0} test score {1}:".format(name, test_score))

    # Blank line between models.
    print()

NameError: name 'list_of_models' is not defined

In [None]:
import Numpy_w_Classes as nwc

In [None]:
# Hashed bigram features with 50000 buckets.
# ``ngram_range`` is given as a tuple (the type scikit-learn documents for it)
# and ``preprocessor`` is passed directly instead of through a redundant lambda.
ng_train, ng_test = vectorize(HashingVectorizer(n_features=50000, ngram_range=(2, 2), preprocessor=preprocessor))

In [None]:
# Distinct class labels present in the training targets.
{label for label in ng_train["y"]}

In [None]:
# Hyper-parameters of the hand-written network's training run.
lr = 0.001
epochs = 20
batch_size = 10

# Feature matrices and integer class labels of the current vectorization.
train_x, test_x = ng_train["X"], ng_test["X"]
train_labels, test_labels = ng_train["y"], ng_test["y"]

# One-hot targets: indexing the 20x20 identity matrix with a label vector
# picks, for each sample, the row whose single 1 sits at that label's index.
one_hot_rows = np.eye(20)
train_y = one_hot_rows[train_labels]
test_y = one_hot_rows[test_labels]

# Start offsets of the mini-batches.  With the ``- 1`` in the stop value, a
# trailing batch that would start at the very last row (i.e. contain exactly
# one sample) is not generated.
# NOTE(review): presumably intentional -- confirm.
batch_pos = list(range(0, train_x.shape[0] - 1, batch_size))
batch_amount = len(batch_pos)

In [None]:
# Assemble the hand-written network layer by layer.
NNet = nwc.NeuralNetwork()

# input_size equals the n_features=50000 used by the hashing vectorizer above.
NNet.layers.append(nwc.InputLayer(input_size=50000))

# (layer class, keyword arguments) in stacking order.  Every layer is wired
# to whichever layer was appended immediately before it.
layer_stack = [
    (nwc.DenseLayer, dict(hidden=100, use_bias=False, positive_params=False)),
    # (nwc.BatchNormLayer, dict()),  # currently disabled
    (nwc.DenseLayer, dict(hidden=50, use_bias=False, positive_params=False)),
    (nwc.BatchNormLayer, dict()),
    (nwc.ReLUActivation, dict()),
    # NOTE(review): whether 0.75 is the keep or the drop probability depends
    # on nwc.DropoutLayer -- confirm.
    (nwc.DropoutLayer, dict(probability=0.75)),
    # Output head with 20 units, matching the 20x20 one-hot targets.
    (nwc.SoftmaxCrossEntropy, dict(hidden=20, use_bias=True)),
]
for layer_cls, layer_kwargs in layer_stack:
    NNet.layers.append(layer_cls(predecessor=NNet.layers[-1], **layer_kwargs))

# The optimizer is handed the complete list of layers.
NNet.optimizer = nwc.AdamOptimizer(NNet.layers)

In [None]:
# Mini-batch training of the hand-written network.  After every epoch the
# network is evaluated on the full train and test sets, the statistics are
# printed, and the training data is reshuffled for the next epoch.
for ep in range(1, epochs + 1):
    for b in batch_pos:
        # Slicing past the end is safe: the final batch is simply shorter.
        batch_x = train_x[b:b + batch_size]
        batch_y = train_y[b:b + batch_size]

        NNet.feed_forward(batch_x)
        NNet.back_propagation(batch_y)
        # The step size is scaled by the nominal batch size.
        NNet.optimizer.step(lr / batch_size)
        NNet.zero_gradients()

    train_predicted = NNet.evaluate(train_x)
    test_predicted = NNet.evaluate(test_x)
    print("Epoch {:3d} stats: lr {:7.5f}, batch size {:3d}, validation loss {:6.2f}".format(
        ep, lr, batch_size, nwc.CrossEntropy(test_predicted, test_y)))
    print("  training accuracy {:6.2f}%, validation accuracy {:6.2f}%".format(
        100 * nwc.accuracy(train_predicted, train_labels), 100 * nwc.accuracy(test_predicted, test_labels)))
    #lr *= 0.8

    # Apply one shared permutation to features, labels and one-hot targets so
    # that the three stay aligned row by row.
    p = np.random.permutation(train_x.shape[0])
    train_x = train_x[p, :]
    train_labels = train_labels[p]
    train_y = train_y[p, :]