/
Minimal.xs
180 lines (157 loc) · 5.08 KB
/
Minimal.xs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
#include "EXTERN.h"
#include "perl.h"
#include "XSUB.h"
#include "ppport.h"
MODULE = CUDA::Minimal PACKAGE = CUDA::Minimal
void
_free(SV * dev_ptr_SV)
    CODE:
        // Release device memory held in dev_ptr_SV, treating a zero value
        // as "already freed" so repeated calls are harmless no-ops.
        {
            IV addr = SvIV(dev_ptr_SV);
            if (addr != 0) {
                // Recover the device pointer from the scalar's integer slot:
                void * to_free = INT2PTR(void*, addr);
                cudaError_t status = cudaFree(to_free);
                // Report any failure back to Perl-land:
                if (status != cudaSuccess)
                    Perl_croak(aTHX_ "Unable to free memory on the device: %s"
                        , cudaGetErrorString(status));
                // Null out the caller's scalar so a later Free() cannot
                // double-free this pointer:
                sv_setiv(dev_ptr_SV, 0);
            }
        }
SV *
_malloc(SV * data_SV)
CODE:
// First thing's first: guard against calls in void context:
if (GIMME_V == G_VOID)
Perl_croak(aTHX_ "Cannot call Malloc in void context");
void * dev_ptr = 0;
size_t data_len = 0;
// Check the input arguments:
if (SvTYPE(data_SV) == SVt_PV) {
// If the host scalar is a PV, use its length:
data_len = (size_t)SvCUR(data_SV);
}
else {
// Otherwise interpret the scalar as an integer
// and use it as the length:
data_len = (size_t)SvIV(data_SV);
}
// Allocate the memory:
cudaError_t err = cudaMalloc(&dev_ptr, data_len);
// Check for errors:
if (err != cudaSuccess)
Perl_croak(aTHX_ "Unable to allocate %lu bytes on the device: %s"
, (long unsigned)data_len, cudaGetErrorString(err));
// Set the return:
RETVAL = newSViv(PTR2IV(dev_ptr));
OUTPUT:
RETVAL
void
_transfer(SV * src_SV, SV * dst_SV, ...)
PROTOTYPE: $$;$$
CODE:
void * dst_ptr = 0;
void * src_ptr = 0;
size_t length = 0;
size_t host_offset = 0;
size_t host_length = 0;
enum cudaMemcpyKind kind;
// Get the specified length and host offset if they passed it in:
if (items > 2) length = (size_t)SvIV(ST(2));
if (items > 3) host_offset = (size_t)SvIV(ST(2));
// Determine if either of the two SVs are the host memory:
if (SvTYPE(dst_SV) == SVt_PV && SvTYPE(src_SV) == SVt_PV) {
// We can't have both of them looking like host memory:
Perl_croak(aTHX_ "Transfer requires one or more of %s\n%s"
, "the arguments to be a device pointer"
, "but it looks like both are host arrays");
}
else if (SvTYPE(dst_SV) == SVt_PV) {
// Looks like the destination is host memory.
kind = cudaMemcpyDeviceToHost;
host_length = (size_t)SvCUR(dst_SV) - host_offset;
src_ptr = INT2PTR(void*, SvIV(src_SV));
dst_ptr = SvPVX(dst_SV) + host_offset;
// Make sure the offset is shorter than the host length:
if (host_length <= 0)
Perl_croak(aTHX_ "Host offset must be less than the host's length");
}
else if (SvTYPE(src_SV) == SVt_PV) {
// Looks like the source is host memory.
kind = cudaMemcpyHostToDevice;
host_length = (size_t)SvCUR(src_SV) - host_offset;
src_ptr = SvPVX(src_SV) + host_offset;
dst_ptr = INT2PTR(void*, SvIV(dst_SV));
// Make sure the offset is shorter than the host length:
if (host_length <= 0)
Perl_croak(aTHX_ "Host offset must be less than the host's length");
}
else {
// Looks like both the source and destination are device pointers.
kind = cudaMemcpyDeviceToDevice;
src_ptr = INT2PTR(void*, SvIV(src_SV));
dst_ptr = INT2PTR(void*, SvIV(dst_SV));
if (host_offset > 0) {
Perl_croak(aTHX_ "Host offsets are not allowed for %s"
, "device-to-device transfers");
}
}
// Make sure that they provided a length of some sort
if (length == 0 && host_length == 0)
Perl_croak(aTHX_ "You must provide the number of bytes %s"
, "for device-to-device transfers");
// Make sure the requested length does not exceed the host's length
if (host_length > 0 && length > host_length)
Perl_croak(aTHX_ "Attempting to transfer more data %s"
, "than the host can accomodate");
// Use the host length if no length was explicitly given:
if (length == 0) length = host_length;
// Perform the copy and check for errors:
cudaError_t err = cudaMemcpy(dst_ptr, src_ptr, length, kind);
if (err != cudaSuccess)
Perl_croak(aTHX_ "Unable to copy memory: %s"
, cudaGetErrorString(err));
void
ThreadSynchronize()
CODE:
// Block the calling host thread until all previously issued device work
// has completed.
// NOTE(review): cudaThreadSynchronize() is deprecated in later toolkits in
// favor of cudaDeviceSynchronize(); kept as-is here since this module
// deliberately avoids version-conditional bindings (see the pod note at
// the bottom of this file).
cudaThreadSynchronize();
SV *
GetLastError()
    CODE:
        // Return the message for the most recent CUDA runtime error as a
        // Perl string. cudaGetLastError() also RESETS the sticky error
        // state, unlike PeekAtLastError below.
        RETVAL = newSVpv(cudaGetErrorString(cudaGetLastError()), 0);
    OUTPUT:
        RETVAL
SV *
PeekAtLastError()
    CODE:
        // Return the message for the most recent CUDA runtime error as a
        // Perl string WITHOUT clearing the error state (contrast with
        // GetLastError above).
        RETVAL = newSVpv(cudaGetErrorString(cudaPeekAtLastError()), 0);
    OUTPUT:
        RETVAL
=pod
// Thanks to Kartik for the compiler-directive work-around code. I am removing
// the DeviceReset bindings for now because they are only in the latest toolkit
// (as of July 2011), and not appropriate for this module. However, conditional
// bindings like these should show up in the driver wrapper, whenever that
// appears.
/*
#include <cuda.h>
#ifndef CUDA_VERSION
#define CUDA_VERSION 0
#endif
SV *
DeviceReset()
CODE:
// CUDA greater than version 4.1 needed
#if (CUDA_VERSION > 4010 )
cudaError_t err = cudaDeviceReset();
RETVAL = newSVpv(cudaGetErrorString(err), 0);
#else
RETVAL = newSVpv("Version too low for cudaDeviceReset", 0 );
#endif
OUTPUT:
RETVAL
*/
=cut