In [1]:
# These are all the modules we'll be using later. Make sure you can import them
# before proceeding further.
%matplotlib inline
from __future__ import print_function
import collections
import math
import numpy as np
import os
import random
import tensorflow as tf
import zipfile
import pandas as pd
from matplotlib import pylab
from six.moves import range
from six.moves.urllib.request import urlretrieve
from sklearn.manifold import TSNE
import time

In [2]:
# Download data: base URL of the directory holding the data files.
# maybe_download() appends a file name to this string to form the full URL.
url = 'https://raw.githubusercontent.com/chiphuyen/tf-stanford-tutorials/master/data/'

def maybe_download(filename):
    """Download `filename` from the module-level `url` unless it already exists.

    Args:
        filename: Name of the data file. It is appended to `url` to build the
            download address and is also used as the local path to save to.

    Returns:
        The local path of the file, i.e. the `filename` that was passed in.
    """
    if not os.path.exists(filename):
        # urlretrieve writes the response to the path given as second argument.
        urlretrieve(url + filename, filename)
    return filename

# Fetch heart.csv into the working directory (skipped if it is already there).
filename = maybe_download('heart.csv')

In [3]:
# Pre-processing: load the CSV, make every feature numeric, then scale.
raw_data = pd.read_csv('heart.csv')

# `chd` is the target column; it is kept out of the scaling below.
label = raw_data['chd']

# The first nine columns are the features. The 'Present'/'Absent' strings are
# encoded as 0/1, and everything is cast to float so the arithmetic below does
# not depend on replace() inferring a numeric dtype.
features = (
    raw_data.loc[:, raw_data.columns[:9]]
    .replace(['Present', 'Absent'], [0, 1])
    .astype(float)
)

# Centre each column on its mean and divide by its range (max - min).
data = (features - features.mean()) / (features.max() - features.min())

# assign() returns a new frame, so `data` keeps only the nine feature columns
# instead of silently gaining a `chd` column through an alias.
all_data = data.assign(chd=label)
all_data.describe()

Unnamed: 0,sbp,tobacco,ldl,adiposity,famhist,typea,obesity,alcohol,age,chd
count,462.0,462.0,462.0,462.0,462.0,462.0,462.0,462.0,462.0,462.0
mean,-2.6914500000000003e-17,-4.20539e-18,4.998407e-17,4.277483e-17,4.998407e-17,-1.8263410000000002e-17,8.026288000000001e-17,3.1720660000000005e-17,9.612321e-19,0.34632
std,0.1751822,0.1472123,0.1443142,0.2176419,0.4933567,0.151039,0.1321732,0.1663228,0.298142,0.476313
min,-0.3190328,-0.1165272,-0.2620435,-0.5221463,-0.5844156,-0.616983,-0.3558379,-0.1157986,-0.5676738,0.0
25%,-0.1224516,-0.1148445,-0.1015906,-0.157531,-0.5844156,-0.09390609,-0.09595711,-0.1123337,-0.2411432,0.0
50%,-0.03698154,-0.05242466,-0.02789719,0.0198117,0.4155844,-0.001598402,-0.007500394,-0.0647761,0.04457108,0.0
75%,0.08267658,0.05975483,0.07314811,0.1628187,0.4155844,0.1060939,0.07695695,0.04652562,0.2486527,1.0
max,0.6809672,0.8834728,0.7379565,0.4778537,0.4155844,0.383017,0.6441621,0.8842014,0.4323262,1.0


In [4]:
# Rebuild the dataset so that both labels contribute the same number of rows.
one_label_result = all_data[all_data.chd == 1]
zero_label_result = all_data[all_data.chd == 0]

one_label_length = len(one_label_result)
zero_label_length = len(zero_label_result)

# Size of the smaller class; the larger class is down-sampled to this size.
small_len = min(one_label_length, zero_label_length)

# NOTE(review): `random` is not seeded in the visible cells, so the sampled
# rows (and every number derived from them) change from run to run.
one_index = random.sample(list(one_label_result.index.values), small_len)
zero_index = random.sample(list(zero_label_result.index.values), small_len)

# Pick the sampled rows by index label and stack the two classes.
new_data = pd.concat([one_label_result.loc[one_index], zero_label_result.loc[zero_index]])
new_data.describe()

Unnamed: 0,sbp,tobacco,ldl,adiposity,famhist,typea,obesity,alcohol,age,chd
count,320.0,320.0,320.0,320.0,320.0,320.0,320.0,320.0,320.0,320.0
mean,0.011576,0.011307,0.012924,0.020443,-0.043791,0.000469,0.004143,-0.000917,0.038321,0.5
std,0.182235,0.157997,0.149705,0.208793,0.499127,0.154374,0.128913,0.167212,0.284418,0.500783
min,-0.319033,-0.116527,-0.255772,-0.511517,-0.584416,-0.616983,-0.355838,-0.115799,-0.567674,0.0
25%,-0.122452,-0.107553,-0.084692,-0.120748,-0.584416,-0.093906,-0.085292,-0.113285,-0.179919,0.0
50%,-0.019888,-0.042489,-0.008385,0.030721,0.415584,0.006094,-0.008285,-0.068173,0.095591,0.5
75%,0.116865,0.066165,0.082382,0.180581,0.415584,0.106094,0.080643,0.041991,0.269061,1.0
max,0.680967,0.883473,0.737956,0.477854,0.415584,0.383017,0.617186,0.884201,0.432326,1.0


In [5]:
# Split the balanced data into ten parts: nine for training, one for testing.
train_data_size = int(small_len * 2 * 0.9)
test_data_size = int(small_len * 2 * 0.1)

# Draw the training rows at random (by index label) ...
train_data_index = random.sample(list(new_data.index.values), train_data_size)
train_data = new_data.loc[train_data_index]

# ... and use every remaining row as the test set. Sorting makes the row order
# independent of set iteration order.
test_data_index = sorted(set(new_data.index.values).difference(train_data_index))
test_data = new_data.loc[test_data_index]

# Separate the `chd` target from the nine feature columns.
train_label = train_data['chd']
train_data = train_data.loc[:, raw_data.columns[:9]]

test_label = test_data['chd']
test_data = test_data.loc[:, raw_data.columns[:9]]
print(train_data.shape, train_label.shape)
print(test_data.shape, test_label.shape)

(288, 9) (288,)
(32, 9) (32,)


In [53]:
# Hyperparameters for the model
n_epochs = 100        # passes over the training batches
batch_size = 16       # rows fed per optimisation step
learning_rate = 0.01  # step size given to the gradient-descent optimizer

In [54]:
# Softmax-regression graph: logits = X . W + b, trained with cross-entropy.
# The batch dimension is None so the graph accepts batches of any size.
X = tf.placeholder(dtype=tf.float32, shape=[None, 9], name='X')
Y = tf.placeholder(dtype=tf.float32, shape=[None, 2], name='Y')

# 9 input features -> 2 classes.
W = tf.Variable(tf.random_normal([9, 2]), name='W')
# One bias per class, broadcast across the batch. A [batch_size, 2] bias would
# give each row position inside a batch its own bias term.
b = tf.Variable(tf.random_normal([2]), name='b')

logits = tf.matmul(X, W) + b

# Per-example cross-entropy between the one-hot labels Y and softmax(logits).
entropy = tf.nn.softmax_cross_entropy_with_logits(labels=Y, logits=logits)

# Scalar training objective: mean cross-entropy over the batch.
loss = tf.reduce_mean(entropy)

# Training op: one gradient-descent step on `loss` each time it is run.
optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate).minimize(loss)

def to_one_hotting(labels, num_classes=2):
    """Convert integer class labels into a one-hot float32 matrix.

    Args:
        labels: 1-D array-like (list, NumPy array, pandas Series, ...) of
            class ids.
        num_classes: Number of columns in the result. Defaults to 2.

    Returns:
        A float32 array of shape ``(len(labels), num_classes)`` with 1.0 in the
        column equal to each label. A label outside ``range(num_classes)``
        yields an all-zero row.
    """
    # asarray lets plain sequences take the [:, None] column-vector indexing.
    labels = np.asarray(labels)
    # Broadcasting (n, 1) against (num_classes,) gives an (n, num_classes) mask.
    return (np.arange(num_classes) == labels[:, None]).astype(np.float32)

with tf.Session() as sess:
    start_time = time.time()
    sess.run(tf.global_variables_initializer())

    # ---- training: mini-batch gradient descent over full batches ----
    n_batches = len(train_data) // batch_size
    for epoch in range(n_epochs):
        total_loss = 0

        for index in range(n_batches):
            rows = slice(index * batch_size, (index + 1) * batch_size)
            X_batch = train_data.iloc[rows].values
            Y_batch = to_one_hotting(train_label.iloc[rows].values)
            # Running `optimizer` applies one update; `loss` is fetched for logging.
            _, loss_batch = sess.run([optimizer, loss], feed_dict={X: X_batch, Y: Y_batch})
            total_loss += loss_batch
        print('Average loss epoch :{0}'.format(total_loss/n_batches))

    print('Total time: {0} seconds'.format(time.time() - start_time))
    print('loss_batch为啥都是0')

    print('Optimization Finished!')

    # ---- evaluation: forward pass only, the weights must not change here ----
    # Only full batches are scored, which also works when the placeholders are
    # declared with a fixed batch dimension.
    n_batches = len(test_data) // batch_size
    total_correct_preds = 0
    for index in range(n_batches):
        rows = slice(index * batch_size, (index + 1) * batch_size)
        X_batch = test_data.iloc[rows].values
        Y_batch = to_one_hotting(test_label.iloc[rows].values)
        # Fetch only `logits`: fetching `optimizer` here would train on test data.
        logits_batch = sess.run(logits, feed_dict={X: X_batch})
        # Softmax is monotonic, so argmax over logits is the predicted class.
        # Counting in NumPy avoids adding new ops to the graph on every batch.
        correct_preds = np.argmax(logits_batch, axis=1) == np.argmax(Y_batch, axis=1)
        total_correct_preds += np.count_nonzero(correct_preds)

    # Divide by the number of rows actually scored; max() guards the case
    # where the test set is smaller than one batch.
    n_evaluated = n_batches * batch_size
    print('Accuracy:', total_correct_preds / float(max(n_evaluated, 1)))
    # Author's notes, printed verbatim.
    print('写完想哭，我去看答案了')
    print('我觉得造成loss为0的原因是softmax_cross_entropy_with_logits这个函数是针对one-hotting变量的，可以去了解下这个函数具体是怎么算的，或者使用其它的损失函数试试看')

Average loss epoch :0.8611748549673293
Average loss epoch :0.8585713737540774
Average loss epoch :0.8560004731019338
Average loss epoch :0.8534615205393897
Average loss epoch :0.8509540657202402
Average loss epoch :0.8484774927298228
Average loss epoch :0.8460312452581193
Average loss epoch :0.8436148166656494
Average loss epoch :0.8412276771333482
Average loss epoch :0.8388693001535203
Average loss epoch :0.8365391592184702
Average loss epoch :0.8342368106047312
Average loss epoch :0.8319617013136545
Average loss epoch :0.8297134074899886
Average loss epoch :0.8274914258056216
Average loss epoch :0.8252953555848863
Average loss epoch :0.8231247034337785
Average loss epoch :0.820979012383355
Average loss epoch :0.818857858578364
Average loss epoch :0.816760841343138
Average loss epoch :0.814687493774626
Average loss epoch :0.812637468179067
Average loss epoch :0.8106103473239474
Average loss epoch :0.8086057305335999
Average loss epoch :0.8066232336892022
Average loss epoch :0.80466249