In [None]:
# Filter nvcc's help text down to the lines that mention an sm_ architecture name,
# to see which -arch values this nvcc installation documents.
!nvcc --help | grep sm_

        target. The options -dlto -arch=sm_NN will add a lto_NN target; if you want
        to only add a lto_NN target and not the compute_NN that -arch=sm_NN usually
        for '--gpu-architecture' may be a 'real' architecture (such as a sm_50),
        --gpu-architecture=sm_50' is equivalent to 'nvcc --gpu-architecture=compute_50
        --gpu-code=sm_50,compute_50'.
        -arch=all         build for all supported architectures (sm_*), and add PTX
        -arch=all-major   build for just supported major versions (sm_*0), plus the
        -arch=native      build for all architectures (sm_*) on the current system
        'native','sm_50','sm_52','sm_53','sm_60','sm_61','sm_62','sm_70','sm_72',
        'sm_75','sm_80','sm_86','sm_87','sm_89','sm_90','sm_90a'.
        (such as sm_50), and PTX code for the 'virtual' architecture (such as compute_50).
        For instance, '--gpu-architecture=compute_60' is not compatible with '--gpu-code=sm_52',
        features that are not present o

In [None]:
# Report the GPU model, driver version and CUDA version visible to this runtime.
!nvidia-smi

Wed Jul  9 20:42:26 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   41C    P8              9W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [None]:
%%writefile quicksort.cu
#include <cerrno>
#include <chrono>
#include <climits>
#include <cstdlib>
#include <ctime>
#include <iostream>
#include <stack>
#include <vector>

#include <thrust/copy.h>
#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>
#include <thrust/functional.h>
#include <thrust/host_vector.h>
#include <thrust/reduce.h>
#include <thrust/scan.h>
#include <thrust/transform.h>

using namespace std;

// Inclusive index range [inicio, fin] of a sub-array that still has to be partitioned.
// Kept as a plain aggregate so it can be brace-initialised as {inicio, fin}.
struct Rango{
  int inicio;  // index of the first element of the range
  int fin;     // index of the last element of the range (inclusive)
};

// Unary predicate used to classify elements against a pivot:
// returns true for values strictly smaller than the stored pivot.
// Marked __host__ __device__ so it can be invoked from host or device code.
struct menorQue{
  int pivote;  // pivot every argument is compared against

  __host__ __device__ menorQue(int valorPivote):pivote{valorPivote}{}

  __host__ __device__ bool operator()(int valor) const{
    const bool esMenor=pivote>valor;
    return esMenor;
  }
};
// Unary predicate: returns true only for values equal to the stored pivot.
// Marked __host__ __device__ so it can be invoked from host or device code.
struct igualA{
  int pivote;  // pivot every argument is compared against

  __host__ __device__ igualA(int valorPivote):pivote{valorPivote}{}

  __host__ __device__ bool operator()(int valor) const{
    const bool coincide=pivote==valor;
    return coincide;
  }
};
// Binary functor over two integer flags: yields 1 only when both flags are zero,
// i.e. the element was flagged neither "less than" nor "equal to" the pivot,
// and 0 otherwise.
struct mayorQue{
  __host__ __device__
  int operator()(int esMenor, int esIgual) const{
    // De Morgan form of !(esMenor||esIgual).
    const bool ningunoMarcado=(!esMenor)&&(!esIgual);
    return ningunoMarcado ? 1 : 0;
  }
};

// Writes every element of `vec` to standard output, each followed by a single
// space, and finishes the line with a newline (an empty vector prints just "\n").
//
// The vector is taken by const reference so that calling this helper does not
// copy the whole array, and a range-for avoids the signed/unsigned comparison
// between an int index and vec.size().
void printArray(const std::vector<int> &vec){
  for(const int valor:vec) std::cout<<valor<<" ";
  std::cout<<"\n";
}

// Sorts `d_arr` in ascending order, in place, with an iterative three-way quicksort.
//
// An explicit stack of inclusive index ranges replaces recursion. For every range
// the last element is taken as pivot and the range is rewritten as
//   [ values < pivot | values == pivot | values > pivot ]
// after which only the "<" and ">" parts are pushed back for further partitioning.
//
// Each partition is done with three thrust::copy_if passes from d_arr into a
// scratch buffer followed by one thrust::copy back, all through device_vector
// iterators. Element-by-element indexing of device vectors from a host loop is
// deliberately avoided: every such access is an individual host<->device transfer.
//
// Parameters:
//   d_arr  device vector to sort; modified in place. Ranges are tracked with
//          int indices, so the size is assumed to fit in an int.
void quicksort(thrust::device_vector<int> &d_arr){
  using namespace thrust::placeholders;  // provides _1 for the "greater than" predicate

  const int arr_size=static_cast<int>(d_arr.size());
  if(arr_size<2) return;  // nothing to sort; also avoids allocating the scratch buffer

  // Scratch storage reused by every partition step; only the slice that
  // corresponds to the current range is written and read back.
  thrust::device_vector<int> d_buffer(arr_size);

  std::stack<Rango> rg_stack;
  rg_stack.push({0,arr_size-1});

  while(!rg_stack.empty()){
    const Rango rg=rg_stack.top();
    rg_stack.pop();

    const int rg_size=rg.fin-rg.inicio+1;
    if(rg_size<=1) continue;

    // Single-element read of the pivot (last element of the range).
    const int pivote=d_arr[rg.fin];

    auto begin=d_arr.begin()+rg.inicio;
    auto finish=d_arr.begin()+rg.fin+1;
    auto salida=d_buffer.begin()+rg.inicio;

    // copy_if keeps the relative order of the selected elements and returns the
    // end of what it wrote, so the three groups land back to back in the buffer
    // and their sizes fall out of the iterator differences.
    auto finMenores=thrust::copy_if(begin,finish,salida,menorQue(pivote));
    auto finIguales=thrust::copy_if(begin,finish,finMenores,igualA(pivote));
    auto finMayores=thrust::copy_if(begin,finish,finIguales,_1>pivote);

    const int numMenor=static_cast<int>(finMenores-salida);
    const int numIgual=static_cast<int>(finIguales-finMenores);

    // The three predicates are mutually exclusive and exhaustive, so
    // [salida, finMayores) holds exactly rg_size elements.
    thrust::copy(salida,finMayores,begin);

    // The "== pivot" block is already in its final position; recurse only into
    // sides that still hold more than one element.
    if(numMenor>1) rg_stack.push({rg.inicio,rg.inicio+numMenor-1});
    if(rg_size-numMenor-numIgual>1) rg_stack.push({rg.inicio+numMenor+numIgual,rg.fin});
  }
}

// Benchmark driver: fills an array with random values in [1, 200], sorts it on
// the GPU with quicksort() and prints the elapsed wall-clock time in seconds.
//
// Usage: ./quicksort <number_of_elements>
//
// The measured interval covers the host->device copy, the sort itself and the
// device->host copy of the result.
//
// Returns EXIT_FAILURE (with a message on stderr) when the element count is
// missing, is not a plain base-10 integer, is negative or does not fit in an int;
// returns 0 otherwise.
int main(int argc, char** argv){
  // argv[1] must exist before it is read; without this check a missing
  // argument dereferences a null pointer.
  if(argc<2){
    std::cerr<<"Uso: "<<(argc>0?argv[0]:"quicksort")<<" <cantidad_de_elementos>\n";
    return EXIT_FAILURE;
  }

  // strtol (unlike atoi) reports trailing garbage and out-of-range input.
  char* finNumero=nullptr;
  errno=0;
  const long pedido=std::strtol(argv[1],&finNumero,10);
  const bool sinDigitos=(finNumero==argv[1]);
  const bool conBasura=(*finNumero!='\0');
  if(sinDigitos||conBasura||errno==ERANGE||pedido<0||pedido>INT_MAX){
    std::cerr<<"Cantidad de elementos invalida: "<<argv[1]<<"\n";
    return EXIT_FAILURE;
  }
  const int size=static_cast<int>(pedido);

  // Time-based seed: every run sorts a different random array.
  std::srand(static_cast<unsigned>(std::time(nullptr)));

  // The size is known up front, so allocate once instead of growing by push_back.
  std::vector<int> arr(size);
  for(int &valor:arr) valor=1+std::rand()%200;

  auto start=std::chrono::high_resolution_clock::now();
  thrust::device_vector<int> d_arr(arr.begin(),arr.end());
  quicksort(d_arr);
  thrust::copy(d_arr.begin(),d_arr.end(),arr.begin());
  auto finish=std::chrono::high_resolution_clock::now();

  auto duration=std::chrono::duration_cast<std::chrono::nanoseconds>(finish - start).count();
  std::cout<<"Tiempo demorado en ordenar arreglo: "<<duration/1000000000.0<<" s\n";
  return 0;
}

Writing quicksort.cu


In [None]:
# Compile quicksort.cu (written by the %%writefile cell) into the ./quicksort executable.
# NOTE(review): -arch=sm_75 is presumably chosen to match the Tesla T4 reported by
# nvidia-smi — confirm and adjust when running on a different GPU.
!nvcc -arch=sm_75 quicksort.cu -o quicksort

In [None]:
# Run the benchmark; the single argument is the number of random elements to sort.
!./quicksort 2048

Tiempo demorado en ordenar arreglo: 0.782673 s
