In [1]:
# Package imports
import matplotlib.pyplot as plt
import numpy as np
import sklearn
import sklearn.datasets
import sklearn.linear_model
import matplotlib
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from pprint import pprint

categories = ['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space']

newsgroups_train = fetch_20newsgroups(subset='train',  categories=categories)
newsgroups_test = fetch_20newsgroups(subset='test',  categories=categories)

In [17]:
num_train = len(newsgroups_train.data)
num_test  = len(newsgroups_test.data)

# 提取tfidf特征 TODO
vectorizer = TfidfVectorizer(max_features=20)

# 对训练和测试数据一起提取特征
X = vectorizer.fit_transform( newsgroups_train.data + newsgroups_test.data )

# 分离出训练数据和测试数据
X_train = X[0:num_train, :]
X_test = X[num_train:num_train+num_test,:]

Y_train = newsgroups_train.target
Y_test = newsgroups_test.target


In [18]:
print(X_train.shape, Y_train.shape)
print(X_test.shape, Y_test.shape)
print(X_train)
print(len(X_train.toarray()))

(2034, 20) (2034,)
(1353, 20) (1353,)
  (0, 6)	0.15061975673760045
  (0, 16)	0.6389485216306471
  (0, 15)	0.19418765044565436
  (0, 19)	0.3371340528582322
  (0, 18)	0.3314213652868753
  (0, 10)	0.09947619820914738
  (0, 1)	0.23949029269098682
  (0, 0)	0.08820051463501803
  (0, 17)	0.11347691666509696
  (0, 9)	0.27805128182293154
  (0, 11)	0.11644981347701422
  (0, 8)	0.2562889435929243
  (0, 3)	0.11133077635077257
  (0, 7)	0.11599060547880218
  (0, 5)	0.09956817039037541
  (0, 12)	0.16620290450874828
  (1, 6)	0.07095837334797045
  (1, 16)	0.5267755723256666
  (1, 15)	0.3659344590203577
  (1, 19)	0.10588466616945869
  (1, 18)	0.5464749458513298
  (1, 10)	0.09372833105896626
  (1, 1)	0.11282611239100115
  (1, 0)	0.24931251447410008
  (1, 9)	0.08732836954382514
  :	:
  (2032, 0)	0.3291686056275266
  (2032, 17)	0.09411141466598104
  (2032, 9)	0.461200396540103
  (2032, 11)	0.09657696918444968
  (2032, 8)	0.1771264423463037
  (2032, 7)	0.04809806386346458
  (2032, 5)	0.20644069398705822
  (

In [19]:
# Helper function to plot a decision boundary.
# If you don't fully understand this function don't worry, it just generates the contour plot below.
def plot_decision_boundary(pred_func):
    # Set min and max values and give it some padding
    x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
    y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
    h = 0.01
    # Generate a grid of points with distance h between them
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
    # Predict the function value for the whole gid
    Z = pred_func(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    # Plot the contour and training examples
    plt.contourf(xx, yy, Z, cmap=plt.cm.Spectral)
    plt.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.Spectral)

In [20]:
num_examples, input_dim = X_train.shape # 样本数量，输入层维度（target种类数）
epsilon = 0.001 # 梯度下降学习率
reg_lambda = 0.01 # 正则化强度
epochs = 5000 # 梯度下降的次数:1个epoch表示过了1遍训练集中的所有样本

In [21]:
def calculate_loss(model, X, y):
    W1, b1, W2, b2 = model['W1'], model['b1'], model['W2'], model['b2']
    #正向传播，计算预测值
    z1 = X.dot(W1) + b1
    a1 = np.tanh(z1)
    z2 = a1.dot(W2) + b2
    exp_scores = np.exp(z2)
    probs = exp_scores / np.sum(exp_scores, axis=1, keepdims=True)
    # 计算损失
    corect_logprobs = -np.log(probs[range(num_examples), y])
    data_loss = np.sum(corect_logprobs)
    #在损失上加上正则项（可选）
    data_loss += reg_lambda/2 * (np.sum(np.square(W1)) + np.sum(np.square(W2)))
    return 1./num_examples * data_loss

In [22]:
# Helper function to predict an output (0 or 1)
def predict(model, x):
    W1, b1, W2, b2 = model['W1'], model['b1'], model['W2'], model['b2']
    # 正向传播
    z1 = x.dot(W1) + b1
    a1 = np.tanh(z1)
    z2 = a1.dot(W2) + b2
    exp_scores = np.exp(z2)
    probs = exp_scores / np.sum(exp_scores, axis=1, keepdims=True)
    return np.argmax(probs, axis=1)

In [23]:
# 这个函数为神经网络学习参数并且返回模型
# - nn_hdim: 隐藏层的节点数
# - num_passes: 通过训练集进行梯度下降的次数
# - print_loss: 如果是True, 那么每1000次迭代就打印一次损失值
def build_model(X, y, nn_hdims, epsilon, reg_lambda, num_passes=20000,  print_loss=False):
    
    # TODO: 先只用一种nn_hdim
    nn_hdim = nn_hdims[1]
    
    # 用随机数初始化参数
    np.random.seed(0)
    nn_input_dim = int(X.shape[1])
    nn_output_dim = int(y.shape[0])
    W1 = np.random.randn(nn_input_dim, nn_hdim) / np.sqrt(nn_input_dim) # X(2034, 20) W1(20, nn_hdim)
    b1 = np.zeros((1, nn_hdim))
    W2 = np.random.randn(nn_hdim, nn_output_dim) / np.sqrt(nn_hdim)
    b2 = np.zeros((1, nn_output_dim))

    model = {}
 
    # 梯度下降
    for i in range(0, num_passes):
        # 正向传播，计算判断出的结果
        z1 = X.dot(W1) + b1
        a1 = np.tanh(z1)
        z2 = a1.dot(W2) + b2
        exp_scores = np.exp(z2)
        probs = exp_scores / np.sum(exp_scores, axis=1, keepdims=True)
        
        # 反向传播，对参数进行优化
        delta3 = probs
        # TODO num_examples ?= nn_input_dim
        delta3[range(num_examples), y] -= 1
        dW2 = (a1.T).dot(delta3)
        db2 = np.sum(delta3, axis=0, keepdims=True)
        delta2 = delta3.dot(W2.T) * (1 - np.power(a1, 2))
        dW1 = (X.T).dot(delta2)
        db1 = np.sum(delta2)
        
        # 添加正则项
        dW2 += reg_lambda * W2
        dW1 += reg_lambda * W1
        
        # 梯度下降更新参数
        W1 += -epsilon * dW1
        b1 += -epsilon * db1
        W2 += -epsilon * dW2
        b2 += -epsilon * db2
        
        # 为模型分配新参数
        model = {'W1': W1, 'b1': b1, 'W2': W2, 'b2': b2}
        
        # 选择性打印loss
        if print_loss and i % 1000 == 0:
            print("Loss after iteration", i, calculate_loss(model, X, y))
    return model

In [24]:
nn_input_dim = int(X_train.shape[0])
nn_output_dim = int(Y_train.shape[0])
print(X_train.shape)
print(type(X_train))


(2034, 20)
<class 'scipy.sparse.csr.csr_matrix'>


In [27]:
model = build_model(X_train, Y_train, [input_dim,16,8,4], epsilon, reg_lambda, epochs, print_loss=True)

Loss after iteration 0 6.645996899251569
Loss after iteration 1000 14.62705215205021
Loss after iteration 2000 14.201031228102151
Loss after iteration 3000 14.887274985940781
Loss after iteration 4000 12.183991265682822


In [2]:
n_correct = 0
n_test = X_test.shape[0]
for n in range(n_test):
    x = X_test[n,:]
    yp = predict(model, x)
    print(yp)
    # if yp == Y_test[n] :
    #     n_correct += 1.0
print('Accuracy %f = %d / %d'%(n_correct/n_test, int(n_correct), n_test) )

NameError: name 'X_test' is not defined