#!/usr/bin/env python
# Theano tutorial
# Solution to Exercise in section 'Using the GPU'
# 1. Raw results
import numpy
import theano
import theano.tensor as tt
from theano import sandbox, Out
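# The old CUDA backend (theano.sandbox.cuda) only computes on float32, so
# force all floating-point data to that dtype.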
theano.config.floatX = 'float32'
rng = numpy.random
N = 400
feats = 784
D = (rng.randn(N, feats).astype(theano.config.floatX),
     rng.randint(size=N, low=0, high=2).astype(theano.config.floatX))
training_steps = 10000
# Declare Theano shared variables. Keeping x and y in shared variables lets
# the training data live on the GPU, so it is transferred to the device once
# instead of at every function call.
x = theano.shared(D[0], name="x")
y = theano.shared(D[1], name="y")
w = theano.shared(rng.randn(feats).astype(theano.config.floatX), name="w")
b = theano.shared(numpy.asarray(0., dtype=theano.config.floatX), name="b")
x.tag.test_value = D[0]
y.tag.test_value = D[1]
#print "Initial model:"
#print w.get_value(), b.get_value()
# Construct Theano expression graph
p_1 = 1 / (1 + tt.exp(-tt.dot(x, w) - b))  # Probability that target = 1
prediction = p_1 > 0.5  # The prediction thresholded: 0 or 1
xent = -y * tt.log(p_1) - (1 - y) * tt.log(1 - p_1)  # Cross-entropy loss
cost = tt.cast(xent.mean(), 'float32') + \
       0.01 * (w ** 2).sum()  # The cost to optimize, with an L2 penalty on w
gw, gb = tt.grad(cost, [w, b])  # Gradients of the cost w.r.t. w and b
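# Optional sanity check (not part of the original exercise): the symbolic
# gradients from tt.grad should match the closed-form gradients of
# L2-regularized logistic regression, d(cost)/dw = x^T (p_1 - y) / N + 0.02*w
# and d(cost)/db = mean(p_1 - y). A minimal sketch, left disabled so the
# profile below still reports exactly three compiled functions; check_grad is
# a hypothetical helper whose outputs should be numerically close to zero.
# check_grad = theano.function(
#     inputs=[],
#     outputs=[gw - (tt.dot(x.T, p_1 - y) / N + 0.02 * w),
#              gb - tt.mean(p_1 - y)],
#     name="check_grad")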
"""
# Compile expressions to functions
train = theano.function(
inputs=[x, y],
outputs=[Out(theano.sandbox.cuda.basic_ops.gpu_from_host(tt.cast(prediction, 'float32')),borrow=True), Out(theano.sandbox.cuda.basic_ops.gpu_from_host(tt.cast(xent, 'float32')), borrow=True)],
updates={w: w - 0.01 * gw, b: b - 0.01 * gb},
name="train")
predict = theano.function(inputs=[x], outputs=Out(theano.sandbox.cuda.basic_ops.gpu_from_host(tt.cast(prediction, 'float32')), borrow=True),
name="predict")
"""
# Compile expressions to functions
train = theano.function(
    inputs=[],
    outputs=[prediction, xent],
    updates={w: w - 0.01 * gw, b: b - 0.01 * gb},
    name="train")
predict = theano.function(inputs=[], outputs=prediction,
                          name="predict")
if any([node.op.__class__.__name__ in ['Gemv', 'CGemv', 'Gemm', 'CGemm']
        for node in train.maker.fgraph.toposort()]):
    print('Used the cpu')
elif any([node.op.__class__.__name__ in ['GpuGemm', 'GpuGemv']
          for node in train.maker.fgraph.toposort()]):
    print('Used the gpu')
else:
    print('ERROR, not able to tell if Theano used the cpu or the gpu')
    print(train.maker.fgraph.toposort())
for i in range(training_steps):
    pred, err = train()
#print "Final model:"
#print w.get_value(), b.get_value()
print "target values for D"
print D[1]
print "prediction on D"
print predict()
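# Optional (not part of the original exercise): a quick training-set accuracy
# check; predict() returns an array of 0/1 predictions that can be compared
# elementwise against the targets. Left disabled so the script output matches
# the transcript quoted below.
# print("training accuracy: %f" % numpy.mean(predict() == D[1]))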
"""
# 2. Profiling
# 2.1 Profiling for CPU computations
# In your terminal, type:
$ THEANO_FLAGS=profile=True,device=cpu python logistic_regression_gpu.py
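# Alternatively (assuming this Theano version supports it), you can profile a
# single compiled function by passing profile=True to theano.function and
# printing train.profile.summary() after the training loop.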
# You'll see first the output of the script:
Used the cpu
target values for D
prediction on D
# Followed by the output of profiling. You'll see profiling results for each
# function in the script, followed by a summary over all functions.
# We'll show here only the summary:
Results were produced using an Intel(R) Core(TM) i7-4820K CPU @ 3.70GHz
Function profiling
==================
Message: Sum of all(3) printed profiles at exit excluding Scan op profile.
Time in 10002 calls to Function.__call__: 1.590916e+00s
Time in Function.fn.__call__: 1.492365e+00s (93.805%)
Time in thunks: 1.408159e+00s (88.512%)
Total compile time: 6.309664e+00s
Number of Apply nodes: 25
Theano Optimizer time: 4.848340e-01s
Theano validate time: 5.454302e-03s
Theano Linker time (includes C, CUDA code generation/compiling): 5.691789e+00s
Class
---
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Class name>
59.6% 59.6% 0.839s 4.19e-05s C 20001 3 theano.tensor.blas_c.CGemv
30.1% 89.7% 0.424s 4.71e-06s C 90001 10 theano.tensor.elemwise.Elemwise
5.5% 95.2% 0.078s 7.79e-02s Py 1 1 theano.tensor.blas.Gemv
1.9% 97.1% 0.026s 1.30e-06s C 20001 3 theano.tensor.basic.Alloc
1.3% 98.4% 0.018s 1.85e-06s C 10000 1 theano.tensor.elemwise.Sum
1.0% 99.4% 0.014s 4.78e-07s C 30001 4 theano.tensor.elemwise.DimShuffle
0.6% 100.0% 0.008s 4.23e-07s C 20001 3 theano.compile.ops.Shape_i
... (remaining 0 Classes account for 0.00%(0.00s) of the runtime)
Ops
---
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Op name>
59.6% 59.6% 0.839s 4.19e-05s C 20001 3 CGemv{inplace}
15.8% 75.4% 0.223s 2.23e-05s C 10000 1 Elemwise{Composite{[sub(mul(i0, scalar_softplus(i1)), mul(i2, i3, scalar_softplus(i4)))]}}[(0, 4)]
7.7% 83.1% 0.109s 1.09e-05s C 10000 1 Elemwise{Composite{[add(mul(scalar_sigmoid(i0), i1, i2, i3), true_div(mul(scalar_sigmoid(neg(i0)), i4), i5))]}}[(0, 0)]
5.5% 88.7% 0.078s 7.79e-02s Py 1 1 Gemv{no_inplace}
4.3% 92.9% 0.060s 6.00e-06s C 10000 1 Elemwise{Composite{[GT(scalar_sigmoid(i0), i1)]}}
1.9% 94.8% 0.026s 1.30e-06s C 20001 3 Alloc
1.3% 96.1% 0.018s 1.85e-06s C 10000 1 Sum{acc_dtype=float64}
0.7% 96.8% 0.009s 4.73e-07s C 20001 3 InplaceDimShuffle{x}
0.6% 97.4% 0.009s 8.52e-07s C 10000 1 Elemwise{sub,no_inplace}
0.6% 98.0% 0.008s 4.23e-07s C 20001 3 Shape_i{0}
0.5% 98.5% 0.007s 7.06e-07s C 10000 1 Elemwise{Composite{[sub(neg(i0), i1)]}}[(0, 0)]
0.5% 98.9% 0.007s 6.57e-07s C 10000 1 Elemwise{neg,no_inplace}
0.3% 99.3% 0.005s 4.88e-07s C 10000 1 InplaceDimShuffle{1,0}
0.3% 99.5% 0.004s 3.78e-07s C 10000 1 Elemwise{inv,no_inplace}
0.2% 99.8% 0.003s 3.44e-07s C 10000 1 Elemwise{Cast{float32}}
0.2% 100.0% 0.003s 3.01e-07s C 10000 1 Elemwise{Composite{[sub(i0, mul(i1, i2))]}}[(0, 0)]
... (the remaining CPU 'Apply'-level rows are garbled in this copy and are
omitted; the largest contributors match the Ops table above.)

# 2.2 Profiling for GPU computations
# In your terminal, type:
$ THEANO_FLAGS=profile=True,device=gpu python logistic_regression_gpu.py
# You'll see first the output of the script:
Used the gpu
target values for D
prediction on D
# Followed by the output of profiling. Only fragments of the GPU summary
# survive in this copy; the intact header lines are:
Total compile time: 4.728150e-01s
Number of Apply nodes: 36
Theano Optimizer time: 4.283385e-01s
Theano validate time: 7.687330e-03s
Theano Linker time (includes C, CUDA code generation/compiling): 2.801418e-02s
# The partially recoverable Class/Ops/Apply tables show GpuGemv{inplace},
# GpuElemwise (sub, inv, neg, ScalarSigmoid and fused Composite ops),
# GpuCAReduce{add}, GpuAlloc, GpuDimShuffle and HostFromGpu accounting for
# most of the runtime.

# 3. Conclusions
# Examine and compare the 'Ops' summaries for the CPU and GPU runs. The GPU
# transfer ops 'GpuFromHost' and 'HostFromGpu' by themselves consume a large
# amount of extra time, but by making as few host<->device data transfers as
# possible you can minimize their overhead.
"""