In [1]:
!nvcc --help | grep sm_

        target. The options -dlto -arch=sm_NN will add a lto_NN target; if you want
        to only add a lto_NN target and not the compute_NN that -arch=sm_NN usually
        for '--gpu-architecture' may be a 'real' architecture (such as a sm_50),
        --gpu-architecture=sm_50' is equivalent to 'nvcc --gpu-architecture=compute_50
        --gpu-code=sm_50,compute_50'.
        -arch=all         build for all supported architectures (sm_*), and add PTX
        -arch=all-major   build for just supported major versions (sm_*0), plus the
        -arch=native      build for all architectures (sm_*) on the current system
        'native','sm_50','sm_52','sm_53','sm_60','sm_61','sm_62','sm_70','sm_72',
        'sm_75','sm_80','sm_86','sm_87','sm_89','sm_90','sm_90a'.
        (such as sm_50), and PTX code for the 'virtual' architecture (such as compute_50).
        For instance, '--gpu-architecture=compute_60' is not compatible with '--gpu-code=sm_52',
        features that are not present o

In [2]:
!nvidia-smi

Wed Jul  9 16:13:48 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   38C    P8              9W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [16]:
%%writefile radix_sort.cu
#include <thrust/device_vector.h>
#include <thrust/scan.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <cerrno>
#include <chrono>
#include <climits>
#include <cstddef>
#include <cstdlib>
#include <ctime>
#include <exception>
#include <iostream>
#include <stdexcept>
#include <string>
#include <utility>
#include <vector>

using namespace std;
typedef unsigned int usi;
typedef unsigned char uch;

const int BITS=8;
const int RADIO=1<<BITS;

// Writes the first `size` elements of `arr` to standard output on a single
// line. Every element is followed by one space, and the line is terminated
// with a newline character. `arr` is read on the host.
void printArray(usi* arr, int size){
  int idx=0;
  while(idx<size){
    cout<<arr[idx]<<" ";
    ++idx;
  }
  cout<<"\n";
}

// Extracts one 8-bit digit from every key.
//
// Each thread handles the element whose index equals its flat 1-D global
// thread index; threads whose index is >= n do nothing.
//
//   input   - keys to read (n elements)
//   digitos - receives, per element, bits [shift, shift+8) of the key
//   shift   - bit offset of the digit inside the key
//   n       - number of elements
__global__ void computarDigitos(usi* input, uch* digitos, int shift, int n){
  const int gid=threadIdx.x+blockDim.x*blockIdx.x;
  if(gid>=n) return;
  const usi desplazado=input[gid]>>shift;
  digitos[gid]=desplazado&0xFF;
}

// Accumulates a RADIO-bin histogram of `digitos` into `histograma`.
//
// Each block first counts its own elements in a shared-memory histogram and
// then adds the non-empty bins to the global histogram with atomicAdd, so
// `histograma` must be zeroed by the caller before the launch.
//
//   digitos    - digit of every element (n entries)
//   histograma - RADIO global counters, accumulated into (not overwritten)
//   n          - number of elements
//
// Launch layout: 1-D grid of 1-D blocks, one thread per element. Any block
// size works: the shared bins are zeroed and flushed with a block-stride loop
// instead of assuming blockDim.x >= RADIO.
__global__ void histogramaKernel(uch* digitos, int* histograma, int n){
  __shared__ int histLocal[RADIO];
  const int tid=threadIdx.x;

  // Shared memory starts uninitialised; clear every bin regardless of how
  // many threads the block has.
  for(int bin=tid;bin<RADIO;bin+=blockDim.x) histLocal[bin]=0;
  __syncthreads();

  const int i=blockIdx.x*blockDim.x+tid;
  if(i<n) atomicAdd(&histLocal[digitos[i]],1);
  __syncthreads();

  // Flush the block-local counts; empty bins are skipped to avoid useless
  // global atomics.
  for(int bin=tid;bin<RADIO;bin+=blockDim.x){
    const int cuenta=histLocal[bin];
    if(cuenta!=0) atomicAdd(&histograma[bin],cuenta);
  }
}

// Scatters every element into the region of `output` reserved for its digit.
//
// Each thread handles the element at its flat 1-D global thread index
// (threads with index >= n do nothing). The destination is
// digit_offsets[digit] plus the value counters[digit] held just before this
// thread's atomicAdd incremented it.
//
//   input         - keys to move (n elements)
//   digitos       - digit of every element (n entries)
//   digit_offsets - start position of each digit's region in `output`
//   counters      - per-digit running counters, incremented atomically
//   output        - destination buffer
//   n             - number of elements
//
// NOTE(review): the slot inside a digit's region is decided by the order in
// which the atomicAdd calls happen to execute, not by element index, so
// elements that share a digit are not guaranteed to keep their relative
// order. Confirm whether callers need a stable scatter (an LSD radix pass
// normally does).
__global__ void reordenarKernel(usi* input, uch* digitos, int* digit_offsets,
 int* counters, usi* output, int n){
  const int gid=threadIdx.x+blockDim.x*blockIdx.x;
  if(gid>=n) return;
  const uch cubeta=digitos[gid];
  const int base=digit_offsets[cubeta];
  const int turno=atomicAdd(&counters[cubeta],1);
  output[base+turno]=input[gid];
}

// Threads per block for every launch issued by radixSort.
// reordenarEstableKernel sizes its shared tile with this value, so it must
// never be launched with more threads than this.
static const int HILOS_POR_BLOQUE=256;

// Throws std::runtime_error if a CUDA runtime call or kernel launch failed.
// `etapa` names the step that was being executed.
static void comprobarCuda(cudaError_t err, const char* etapa){
  if(err!=cudaSuccess){
    throw std::runtime_error(std::string("radixSort: ")+etapa+": "+
                             cudaGetErrorString(err));
  }
}

// Writes one RADIO-bin histogram per block.
//
// The table is laid out digit-major: the count of digit d in block b is
// stored at tabla[d*gridDim.x+b]. Every cell is overwritten, so the table
// does not need to be cleared between launches.
// Launch layout: 1-D grid of 1-D blocks, one thread per element.
static __global__ void histogramaPorBloqueKernel(const uch* digitos, int* tabla, int n){
  __shared__ int histLocal[RADIO];
  for(int bin=threadIdx.x;bin<RADIO;bin+=blockDim.x) histLocal[bin]=0;
  __syncthreads();

  const int i=blockIdx.x*blockDim.x+threadIdx.x;
  if(i<n) atomicAdd(&histLocal[digitos[i]],1);
  __syncthreads();

  for(int bin=threadIdx.x;bin<RADIO;bin+=blockDim.x)
    tabla[static_cast<size_t>(bin)*gridDim.x+blockIdx.x]=histLocal[bin];
}

// Stable scatter for one radix pass.
//
// `tabla` must hold the exclusive prefix sum of the digit-major per-block
// histogram, so tabla[d*gridDim.x+b] is the first output position for the
// elements of block b whose digit is d. Inside the block, each element is
// placed after all lower-indexed elements of the same block with the same
// digit, which preserves the input order among equal digits.
// Launch layout: 1-D grid of 1-D blocks, one thread per element,
// blockDim.x <= HILOS_POR_BLOQUE, same grid as the histogram launch.
static __global__ void reordenarEstableKernel(const usi* input, const uch* digitos,
                                              const int* tabla, usi* output, int n){
  __shared__ uch tile[HILOS_POR_BLOQUE];
  const int tid=threadIdx.x;
  const int i=blockIdx.x*blockDim.x+tid;

  if(i<n) tile[tid]=digitos[i];
  __syncthreads();  // reached by every thread: kept outside the bounds check

  if(i<n){
    const uch d=tile[tid];
    // Only entries below tid are read; they belong to indices < i < n and
    // were therefore written above.
    int rango=0;
    for(int t=0;t<tid;t++) rango+=(tile[t]==d);
    output[tabla[static_cast<size_t>(d)*gridDim.x+blockIdx.x]+rango]=input[i];
  }
}

// Sorts n unsigned 32-bit keys in ascending order with an LSD radix sort that
// processes BITS bits per pass.
//
//   d_input  - device buffer with the n keys; holds the sorted keys on return
//   d_output - device scratch buffer of n keys; contents unspecified on return
//   n        - number of keys; n <= 0 is a no-op
//
// Each pass must be stable for an LSD radix sort to be correct. Handing out
// slots with a global atomic counter does not guarantee that, so every pass
// builds per-block histograms, turns them into output offsets with a single
// exclusive scan, and ranks elements inside each block by index.
//
// Temporary device memory: n bytes for the digits plus
// RADIO * ceil(n / HILOS_POR_BLOQUE) ints for the offset table; both are
// released automatically on return or on exception.
//
// Throws std::runtime_error on CUDA launch/runtime errors; Thrust may throw
// its own exceptions on allocation or scan failure. The function synchronises
// the device before returning.
void radixSort(usi* d_input, usi* d_output, int n){
  if(n<=0) return;  // nothing to sort, and a zero-block launch is invalid

  // Ceil-division written so that it cannot overflow for large n.
  const int numBloques=n/HILOS_POR_BLOQUE+(n%HILOS_POR_BLOQUE!=0);
  const dim3 bloque(HILOS_POR_BLOQUE);
  const dim3 grid(numBloques);

  thrust::device_vector<uch> digitos(n);
  thrust::device_vector<int> tabla(static_cast<size_t>(RADIO)*numBloques);
  uch* d_digitos=thrust::raw_pointer_cast(digitos.data());
  int* d_tabla=thrust::raw_pointer_cast(tabla.data());

  usi* origen=d_input;
  usi* destino=d_output;
  for(int shift=0;shift<32;shift+=BITS){
    computarDigitos<<<grid,bloque>>>(origen,d_digitos,shift,n);
    comprobarCuda(cudaGetLastError(),"computarDigitos");

    histogramaPorBloqueKernel<<<grid,bloque>>>(d_digitos,d_tabla,n);
    comprobarCuda(cudaGetLastError(),"histogramaPorBloqueKernel");

    // In-place scan: counts become starting positions.
    thrust::exclusive_scan(tabla.begin(),tabla.end(),tabla.begin());

    reordenarEstableKernel<<<grid,bloque>>>(origen,d_digitos,d_tabla,destino,n);
    comprobarCuda(cudaGetLastError(),"reordenarEstableKernel");

    std::swap(origen,destino);
  }

  // With an even number of passes the result is already in d_input; copy it
  // back otherwise so the contract does not depend on the value of BITS.
  if(origen!=d_input){
    comprobarCuda(cudaMemcpy(d_input,origen,static_cast<size_t>(n)*sizeof(usi),
                             cudaMemcpyDeviceToDevice),"cudaMemcpy");
  }
  // Surface asynchronous kernel failures before the temporaries are freed.
  comprobarCuda(cudaDeviceSynchronize(),"cudaDeviceSynchronize");
}

// Entry point: radix_sort <count>
//
// Fills an array with <count> pseudo-random values in [1, 200], prints it,
// sorts it on the GPU with radixSort, and prints the sorted result.
// Returns 0 on success and EXIT_FAILURE on bad arguments or CUDA errors.
int main(int argc, char** argv){
  if(argc<2){
    cerr<<"Uso: "<<(argc>0?argv[0]:"radix_sort")<<" <cantidad de elementos>\n";
    return EXIT_FAILURE;
  }

  // strtol instead of atoi so that non-numeric and out-of-range input can be
  // told apart from a legitimate value.
  errno=0;
  char* fin=nullptr;
  const long pedido=strtol(argv[1],&fin,10);
  if(fin==argv[1]||errno==ERANGE||pedido<=0||pedido>INT_MAX){
    cerr<<"La cantidad de elementos debe ser un entero positivo: "<<argv[1]<<"\n";
    return EXIT_FAILURE;
  }
  const int size=static_cast<int>(pedido);
  const size_t bytes=static_cast<size_t>(size)*sizeof(usi);

  srand(time(NULL));
  std::vector<usi> arr(size);
  for(int i=0;i<size;i++) arr[i]=1+rand()%200;
  cout<<"Arreglo original: ";
  printArray(arr.data(),size);

  // Reports a failed CUDA runtime call and terminates the program.
  auto comprobar=[](cudaError_t err,const char* paso){
    if(err!=cudaSuccess){
      cerr<<"Error CUDA en "<<paso<<": "<<cudaGetErrorString(err)<<"\n";
      std::exit(EXIT_FAILURE);
    }
  };

  usi *d_in=nullptr,*d_out=nullptr;
  comprobar(cudaMalloc(&d_in,bytes),"cudaMalloc(d_in)");
  comprobar(cudaMalloc(&d_out,bytes),"cudaMalloc(d_out)");
  comprobar(cudaMemcpy(d_in,arr.data(),bytes,cudaMemcpyHostToDevice),
            "cudaMemcpy(host->device)");

  try{
    radixSort(d_in,d_out,size);
  }catch(const std::exception& e){
    cerr<<"Error en radixSort: "<<e.what()<<"\n";
    cudaFree(d_in);
    cudaFree(d_out);
    return EXIT_FAILURE;
  }

  // The sorted keys are read back from d_in; this blocking copy also reports
  // any asynchronous error left over from the kernels.
  comprobar(cudaMemcpy(arr.data(),d_in,bytes,cudaMemcpyDeviceToHost),
            "cudaMemcpy(device->host)");
  cout<<"Arreglo ordenado: ";
  printArray(arr.data(),size);

  comprobar(cudaFree(d_in),"cudaFree(d_in)");
  comprobar(cudaFree(d_out),"cudaFree(d_out)");
  return 0;
}

Overwriting radix_sort.cu


In [17]:
!nvcc -arch=sm_75 radix_sort.cu -o radix_sort

In [18]:
!./radix_sort 10

Arreglo original: 22 21 181 97 195 157 76 69 65 168 
Arreglo ordenado: 21 22 65 69 76 97 157 168 181 195 
