# TensorFlow로 파일에서 데이터 읽어오기

## Loading data from file

In [1]:
import numpy as np

xy = np.loadtxt("data-01-test-score.csv", delimiter = ",", dtype = np.float32)
x_data = xy[:, 0:-1]
y_data = xy[:, [-1]]


# Make sure the shape and data are OK
print(x_data.shape, x_data, len(x_data))
print(y_data.shape, y_data, len(y_data))

(25, 3) [[ 73.  80.  75.]
 [ 93.  88.  93.]
 [ 89.  91.  90.]
 [ 96.  98. 100.]
 [ 73.  66.  70.]
 [ 53.  46.  55.]
 [ 69.  74.  77.]
 [ 47.  56.  60.]
 [ 87.  79.  90.]
 [ 79.  70.  88.]
 [ 69.  70.  73.]
 [ 70.  65.  74.]
 [ 93.  95.  91.]
 [ 79.  80.  73.]
 [ 70.  73.  78.]
 [ 93.  89.  96.]
 [ 78.  75.  68.]
 [ 81.  90.  93.]
 [ 88.  92.  86.]
 [ 78.  83.  77.]
 [ 82.  86.  90.]
 [ 86.  82.  89.]
 [ 78.  83.  85.]
 [ 76.  83.  71.]
 [ 96.  93.  95.]] 25
(25, 1) [[152.]
 [185.]
 [180.]
 [196.]
 [142.]
 [101.]
 [149.]
 [115.]
 [175.]
 [164.]
 [141.]
 [141.]
 [184.]
 [152.]
 [148.]
 [192.]
 [147.]
 [183.]
 [177.]
 [159.]
 [177.]
 [175.]
 [175.]
 [149.]
 [192.]] 25


__cf. Slicing__

In [7]:
nums = range(5)     # range is a built-in function that creats a list of integers
print(nums)
print(nums[2:4])
print(nums[2:])
print(nums[:2])
print(nums[:])
print(nums[:-1])

range(0, 5)
range(2, 4)
range(2, 5)
range(0, 2)
range(0, 5)
range(0, 4)


In [8]:
nums[2:4] = [8, 9]
print(nums)

TypeError: 'range' object does not support item assignment

__cf. Indexing, Slicing, Iterating__

* Arrays can be indexed, sliced, iterated much like lists and other squence types in Python
* As with Python lists, slicing in Numpy can be accomplished with the colon(:) syntax
* Colon instances(:) can be replaced with dots(...)

>Indexing, Slicing, Iterating in __list__

In [13]:
a = np.array([1, 2, 3, 4, 5])

a[1:3]

array([2, 3])

In [14]:
a[-1]

5

In [15]:
a[0:2] = 9
a

array([9, 9, 3, 4, 5])

>Indexing, Slicing, Iterating in __matrix__

In [17]:
b = np.array([[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]])
b

array([[ 1,  2,  3,  4],
       [ 5,  6,  7,  8],
       [ 9, 10, 11, 12]])

In [18]:
b[:, 1]

array([ 2,  6, 10])

In [19]:
b[-1]

array([ 9, 10, 11, 12])

In [20]:
b[-1, :]

array([ 9, 10, 11, 12])

In [21]:
b[-1, ...]

array([ 9, 10, 11, 12])

In [22]:
b[0:2, :]

array([[1, 2, 3, 4],
       [5, 6, 7, 8]])

### Full Code

In [23]:
import tensorflow as tf
import numpy as np
tf.set_random_seed(777)     # for reproducibility

In [25]:
xy = np.loadtxt("data-01-test-score.csv", delimiter = ",", dtype = np.float32)
x_data = xy[:, 0:-1]
y_data = xy[:, [-1]]


# Make sure the shape and data are OK
print(x_data.shape, x_data, len(x_data))
print(y_data.shape, y_data, len(y_data))


# Placeholders for a tensor that will be always fed
X = tf.placeholder(tf.float32, shape = [None, 3])
Y = tf.placeholder(tf.float32, shape = [None, 1])

W = tf.Variable(tf.random_normal([3, 1]), name = "weight")
b = tf.Variable(tf.random_normal([1]), name = "bias")


# Hypothesis
hypothesis = tf.matmul(X, W) + b


# Simplified cost/loss function
cost = tf.reduce_mean(tf.square(hypothesis - Y))


# Minimize
optimizer = tf.train.GradientDescentOptimizer(learning_rate = 1e-5)
train = optimizer.minimize(cost)

(25, 3) [[ 73.  80.  75.]
 [ 93.  88.  93.]
 [ 89.  91.  90.]
 [ 96.  98. 100.]
 [ 73.  66.  70.]
 [ 53.  46.  55.]
 [ 69.  74.  77.]
 [ 47.  56.  60.]
 [ 87.  79.  90.]
 [ 79.  70.  88.]
 [ 69.  70.  73.]
 [ 70.  65.  74.]
 [ 93.  95.  91.]
 [ 79.  80.  73.]
 [ 70.  73.  78.]
 [ 93.  89.  96.]
 [ 78.  75.  68.]
 [ 81.  90.  93.]
 [ 88.  92.  86.]
 [ 78.  83.  77.]
 [ 82.  86.  90.]
 [ 86.  82.  89.]
 [ 78.  83.  85.]
 [ 76.  83.  71.]
 [ 96.  93.  95.]] 25
(25, 1) [[152.]
 [185.]
 [180.]
 [196.]
 [142.]
 [101.]
 [149.]
 [115.]
 [175.]
 [164.]
 [141.]
 [141.]
 [184.]
 [152.]
 [148.]
 [192.]
 [147.]
 [183.]
 [177.]
 [159.]
 [177.]
 [175.]
 [175.]
 [149.]
 [192.]] 25
Instructions for updating:
Colocations handled automatically by placer.


### Output

In [27]:
# Launch the graph in a session
sess = tf.Session()


# Initialize global variables in the graph
sess.run(tf.global_variables_initializer())


# Set up feed_dict variables inside the loop
for step in range(2001):
    cost_val, hy_val, _ = sess.run([cost, hypothesis, train], feed_dict = {X: x_data, Y: y_data})
    
    if step % 10 == 0:
        print(step, "Cost:", cost_val, "\nPrediction:\n", hy_val)

0 Cost: 21027.0 
Prediction:
 [[22.048063 ]
 [21.619787 ]
 [24.096693 ]
 [22.293005 ]
 [18.633902 ]
 [ 7.2669735]
 [12.33103  ]
 [ 3.150511 ]
 [14.347944 ]
 [ 4.2534237]
 [14.485708 ]
 [10.678068 ]
 [28.80464  ]
 [29.298803 ]
 [11.237837 ]
 [18.646544 ]
 [31.189451 ]
 [13.344664 ]
 [28.841742 ]
 [25.66281  ]
 [15.084761 ]
 [16.798368 ]
 [15.924551 ]
 [31.36112  ]
 [24.986364 ]]
10 Cost: 95.97634 
Prediction:
 [[157.11063]
 [183.99281]
 [184.06302]
 [196.52917]
 [142.46274]
 [ 98.59757]
 [142.77235]
 [ 99.8778 ]
 [166.13176]
 [144.8915 ]
 [140.14995]
 [134.61139]
 [194.05147]
 [166.64041]
 [142.28706]
 [183.43298]
 [161.97667]
 [169.89073]
 [186.37146]
 [166.61377]
 [168.04634]
 [169.14368]
 [161.75204]
 [167.48862]
 [193.25117]]
20 Cost: 94.25726 
Prediction:
 [[158.01503 ]
 [185.11975 ]
 [185.15112 ]
 [197.73497 ]
 [143.315   ]
 [ 99.26257 ]
 [143.69464 ]
 [100.587845]
 [167.21953 ]
 [145.9485  ]
 [141.02908 ]
 [135.50377 ]
 [195.15509 ]
 [167.5308  ]
 [143.2218  ]
 [184.59265 ]
 [162

550 Cost: 63.59115 
Prediction:
 [[156.68053]
 [185.34528]
 [184.34969]
 [197.84714]
 [143.12907]
 [100.85953]
 [144.7305 ]
 [102.63244]
 [169.05846]
 [149.99304]
 [141.56075]
 [137.21312]
 [193.34966]
 [164.74326]
 [144.64479]
 [185.59166]
 [159.52829]
 [172.44289]
 [185.25453]
 [165.59477]
 [170.25288]
 [171.27512]
 [163.4105 ]
 [164.72742]
 [193.88492]]
560 Cost: 63.13341 
Prediction:
 [[156.65852]
 [185.34845]
 [184.33618]
 [197.849  ]
 [143.12518]
 [100.88571]
 [144.74828]
 [102.66758]
 [169.08855]
 [150.06021]
 [141.56967]
 [137.24133]
 [193.31923]
 [164.69623]
 [144.66887]
 [185.60794]
 [159.47253]
 [172.4678 ]
 [185.2183 ]
 [165.56218]
 [170.27203]
 [171.29239]
 [163.42155]
 [164.66681]
 [193.87573]]
570 Cost: 62.67941 
Prediction:
 [[156.63663 ]
 [185.35161 ]
 [184.32275 ]
 [197.85088 ]
 [143.12128 ]
 [100.911766]
 [144.76598 ]
 [102.70262 ]
 [169.11852 ]
 [150.1271  ]
 [141.57854 ]
 [137.26944 ]
 [193.28894 ]
 [164.6494  ]
 [144.6929  ]
 [185.62418 ]
 [159.41704 ]
 [172.49268

1120 Cost: 42.71786 
Prediction:
 [[155.58963]
 [185.48203]
 [183.669  ]
 [197.94965]
 [142.89699]
 [102.15995]
 [145.65623]
 [104.46275]
 [170.55334]
 [153.37271]
 [142.01752]
 [138.62608]
 [191.80687]
 [162.35023]
 [145.88446]
 [186.40128]
 [156.67348]
 [173.75151]
 [183.42647]
 [163.9557 ]
 [171.24593]
 [172.13495]
 [163.99388]
 [161.67035]
 [193.40123]]
1130 Cost: 42.43235 
Prediction:
 [[155.57318]
 [185.4837 ]
 [183.65854]
 [197.95134]
 [142.89278]
 [102.17958]
 [145.671  ]
 [104.49193]
 [170.57588]
 [153.42448]
 [142.02464]
 [138.64761]
 [191.78299]
 [162.31306]
 [145.90395]
 [186.41347]
 [156.62878]
 [173.77257]
 [183.39833]
 [163.93059]
 [171.26163]
 [172.14793]
 [164.00331]
 [161.62335]
 [193.3934 ]]
1140 Cost: 42.14917 
Prediction:
 [[155.55682]
 [185.48535]
 [183.64812]
 [197.95303]
 [142.88857]
 [102.1991 ]
 [145.68573]
 [104.521  ]
 [170.59831]
 [153.47601]
 [142.03178]
 [138.66904]
 [191.75922]
 [162.27605]
 [145.92337]
 [186.42561]
 [156.58429]
 [173.79353]
 [183.3703 ]

1730 Cost: 28.95144 
Prediction:
 [[154.7295 ]
 [185.54855]
 [183.10936]
 [198.04893]
 [142.63635]
 [103.19021]
 [146.4762 ]
 [106.08044]
 [171.7369 ]
 [156.13432]
 [142.40742]
 [139.76791]
 [190.52136]
 [160.34087]
 [146.95099]
 [187.04298]
 [154.24014]
 [174.92848]
 [181.91873]
 [162.6163 ]
 [172.1131 ]
 [172.81924]
 [164.52522]
 [159.15509]
 [192.96297]]
1740 Cost: 28.777807 
Prediction:
 [[154.71756]
 [185.5491 ]
 [183.1014 ]
 [198.05048]
 [142.63203]
 [103.20453]
 [146.48836]
 [106.10441]
 [171.75334]
 [156.17346]
 [142.41309]
 [139.78398]
 [190.50293]
 [160.31194]
 [146.96655]
 [187.05188]
 [154.20479]
 [174.94608]
 [181.8972 ]
 [162.5973 ]
 [172.12587]
 [172.82878]
 [164.53322]
 [159.11926]
 [192.95638]]
1750 Cost: 28.605618 
Prediction:
 [[154.7057  ]
 [185.54965 ]
 [183.09348 ]
 [198.052   ]
 [142.62773 ]
 [103.21878 ]
 [146.50047 ]
 [106.128296]
 [171.76971 ]
 [156.21243 ]
 [142.41873 ]
 [139.79999 ]
 [190.48459 ]
 [160.28311 ]
 [146.98209 ]
 [187.06079 ]
 [154.1696  ]
 [174.

In [28]:
# Ask my score
print("Your score will be ", sess.run(hypothesis, feed_dict = {X: [[100, 70, 101]]}))
print("Other scores will be ", sess.run(hypothesis, feed_dict = {X: [[60, 70, 110], [90, 100, 80]]}))

Your score will be  [[181.73277]]
Other scores will be  [[145.86266]
 [187.2313 ]]


## Queue Runners

큰 데이터 활용 시, 메모리 많이 차지하지 않고 사용가능함

### tf.train.batch

In [30]:
# Collect batches sof csv in
train_x_batch, train_y_batch = \
    tf.train.batch([xy[0:-1], xy[-1:]],batch_size = 10)

In [32]:
sess = tf.Session()

...

# Start populating the filename queue
coord = tf.train.Coordinator()
threads = tf.train.start_queue_runners(sess = sess, coord = coord)

for step in range(2001):
    x_batch, y_batch = sess.run([train_x_batch, train_y_batch])

...

coord.request_stop()
coord.join(threads)

### Full Code

In [34]:
import tensorflow as tf
filename_queue = tf.train.string_input_producer(["data-01-test-score.csv"], shuffle = False, name = "filename_queue")

In [36]:
# Define the Reader
reader = tf.TextLineReader()
key, value = reader.read(filename_queue)

In [37]:
# Defalut values, in case of empty columns
# Also specifies the type of the decoded result
record_defaults = [[0.], [0.], [0.], [0.]]
xy = tf.decode_csv(value, record_defaults = record_defaults)

In [38]:
# Collect batches of csv in
train_x_batch, train_y_batch = tf.train.batch([xy[0:-1], xy[-1:]], batch_size = 10)

In [39]:
# Placeholders for a tensor that will be always fed
X = tf.placeholder(tf.float32, shape = [None, 3])
Y = tf.placeholder(tf.float32, shape = [None, 1])

W = tf.Variable(tf.random_normal([3, 1]), name = "weight")
b = tf.Variable(tf.random_normal([1]), name = "bias")


# Hypothesis
hypothesis = tf.matmul(X, W) + b


# Simplified cost/loss function
cost = tf.reduce_mean(tf.square(hypothesis - Y))


# Minimize
optimizer = tf.train.GradientDescentOptimizer(learning_rate = 1e-5)
train = optimizer.minimize(cost)

In [40]:
# Launch the graph in a session
sess = tf.Session()


# Initialize global variables in the graph
sess.run(tf.global_variables_initializer())


# Start populating the filename queue
coord = tf.train.Coordinator()
threads = tf.train.start_queue_runners(sess = sess, coord = coord)

for step in range(2001):
    x_batch, y_batch = sess.run([train_x_batch, train_y_batch])
    cost_val, hy_val, _ = sess.run([cost, hypothesis, train], feed_dict = {X: x_batch, Y: y_batch})
    
    if step % 10 == 0:
        print(step, "Cost:", cost_val, "\nPrediction:\n", hy_val)
        
coord.request_stop()
coord.join(threads)

0 Cost: 54948.03 
Prediction:
 [[-74.66413 ]
 [-92.95295 ]
 [-90.074326]
 [-94.96863 ]
 [-74.526306]
 [-50.992474]
 [-65.9166  ]
 [-41.425884]
 [-84.05379 ]
 [-71.56434 ]]
10 Cost: 16.982597 
Prediction:
 [[152.9562 ]
 [180.67465]
 [179.50992]
 [198.6322 ]
 [134.16888]
 [102.88058]
 [153.84485]
 [121.47781]
 [171.68588]
 [165.32416]]
20 Cost: 19.294968 
Prediction:
 [[154.49228]
 [182.5261 ]
 [181.33197]
 [200.61078]
 [135.58698]
 [103.91776]
 [155.31584]
 [122.5541 ]
 [173.41037]
 [166.90915]]
30 Cost: 19.252924 
Prediction:
 [[154.50267]
 [182.54352]
 [181.34706]
 [200.62143]
 [135.60626]
 [103.92359]
 [155.31377]
 [122.53841]
 [173.42068]
 [166.90631]]
40 Cost: 19.17847 
Prediction:
 [[154.50273 ]
 [182.54855 ]
 [181.34996 ]
 [200.61879 ]
 [135.6161  ]
 [103.922485]
 [155.30182 ]
 [122.5154  ]
 [173.4195  ]
 [166.89285 ]]
50 Cost: 19.104141 
Prediction:
 [[154.50272]
 [182.55348]
 [181.35274]
 [200.61606]
 [135.62584]
 [103.92135]
 [155.28978]
 [122.49241]
 [173.4182 ]
 [166.87935]]

490 Cost: 16.20536 
Prediction:
 [[154.49045 ]
 [182.76593 ]
 [181.46428 ]
 [200.50398 ]
 [136.03682 ]
 [103.88983 ]
 [154.79416 ]
 [121.542786]
 [173.3837  ]
 [166.35092 ]]
500 Cost: 16.14716 
Prediction:
 [[154.48993 ]
 [182.77065 ]
 [181.46657 ]
 [200.50162 ]
 [136.04575 ]
 [103.889496]
 [154.7836  ]
 [121.52255 ]
 [173.38336 ]
 [166.34027 ]]
510 Cost: 16.089207 
Prediction:
 [[154.4894  ]
 [182.77533 ]
 [181.46884 ]
 [200.4992  ]
 [136.05466 ]
 [103.88916 ]
 [154.77307 ]
 [121.502365]
 [173.38303 ]
 [166.32968 ]]
520 Cost: 16.031578 
Prediction:
 [[154.48886 ]
 [182.78001 ]
 [181.4711  ]
 [200.49683 ]
 [136.06355 ]
 [103.888855]
 [154.76257 ]
 [121.48223 ]
 [173.3827  ]
 [166.31912 ]]
530 Cost: 15.974286 
Prediction:
 [[154.48831]
 [182.7847 ]
 [181.47337]
 [200.49448]
 [136.07243]
 [103.88855]
 [154.7521 ]
 [121.46216]
 [173.38242]
 [166.30865]]
540 Cost: 15.917269 
Prediction:
 [[154.48775 ]
 [182.78937 ]
 [181.47562 ]
 [200.4921  ]
 [136.0813  ]
 [103.888275]
 [154.74167 ]
 [121

980 Cost: 13.684291 
Prediction:
 [[154.45503]
 [182.98915]
 [181.5654 ]
 [200.39452]
 [136.45404]
 [103.88897]
 [154.31042]
 [120.61475]
 [173.38527]
 [165.89017]]
990 Cost: 13.639247 
Prediction:
 [[154.45413]
 [182.99355]
 [181.56723]
 [200.39243]
 [136.46213]
 [103.88925]
 [154.30122]
 [120.5971 ]
 [173.38564]
 [165.88196]]
1000 Cost: 13.594464 
Prediction:
 [[154.45322 ]
 [182.99794 ]
 [181.56908 ]
 [200.39037 ]
 [136.4702  ]
 [103.889534]
 [154.29207 ]
 [120.57949 ]
 [173.38603 ]
 [165.8738  ]]
1010 Cost: 13.5499115 
Prediction:
 [[154.4523 ]
 [183.00233]
 [181.5709 ]
 [200.3883 ]
 [136.47827]
 [103.88985]
 [154.28293]
 [120.56195]
 [173.38644]
 [165.86569]]
1020 Cost: 13.505603 
Prediction:
 [[154.45137 ]
 [183.00673 ]
 [181.57274 ]
 [200.38625 ]
 [136.48631 ]
 [103.89017 ]
 [154.27382 ]
 [120.544464]
 [173.38686 ]
 [165.85762 ]]
1030 Cost: 13.461494 
Prediction:
 [[154.45045 ]
 [183.0111  ]
 [181.57455 ]
 [200.38419 ]
 [136.49434 ]
 [103.890495]
 [154.26472 ]
 [120.52701 ]
 [17

1600 Cost: 11.28656 
Prediction:
 [[154.38866]
 [183.24988]
 [181.66566]
 [200.27591]
 [136.9252 ]
 [103.92381]
 [153.78653]
 [119.60819]
 [173.42912]
 [165.45703]]
1610 Cost: 11.253726 
Prediction:
 [[154.38744 ]
 [183.25389 ]
 [181.66707 ]
 [200.27417 ]
 [136.93231 ]
 [103.924614]
 [153.77881 ]
 [119.593315]
 [173.43011 ]
 [165.45119 ]]
1620 Cost: 11.221072 
Prediction:
 [[154.38623]
 [183.25789]
 [181.66847]
 [200.27243]
 [136.93942]
 [103.92543]
 [153.7711 ]
 [119.57851]
 [173.43112]
 [165.44537]]
1630 Cost: 11.188554 
Prediction:
 [[154.385  ]
 [183.26186]
 [181.66985]
 [200.27066]
 [136.94649]
 [103.92625]
 [153.76341]
 [119.56373]
 [173.43211]
 [165.43958]]
1640 Cost: 11.156229 
Prediction:
 [[154.38379]
 [183.26587]
 [181.67123]
 [200.26894]
 [136.95355]
 [103.92708]
 [153.75575]
 [119.54898]
 [173.43314]
 [165.43384]]
1650 Cost: 11.124071 
Prediction:
 [[154.38257]
 [183.26984]
 [181.67262]
 [200.26721]
 [136.9606 ]
 [103.92791]
 [153.74811]
 [119.53429]
 [173.43416]
 [165.428

## shuffle batch

* `min_after_queue` defines how big a buffer we will randomly sample from -- bigger means better shuffling but slower start up and more memory used.
* capacity must be larger than `min_after_queue` and the amount larger determines the maximum we will prefetch.
* Recommendation: `min_after_queue` + (`num_threads` + a small safety margin) * `batch_size`

In [42]:
min_after_queue = 10000
capacity = min_after_queue + 3 * batch_size
example_batch, lable_batch = tf.train.shuffle_batch([example, label], batch_size = batch_size, capacity = capacity, min_after_dequeue = min_after_dequeue)

NameError: name 'batch_size' is not defined