In [None]:
!nvcc --help | grep sm_

        target. The options -dlto -arch=sm_NN will add a lto_NN target; if you want
        to only add a lto_NN target and not the compute_NN that -arch=sm_NN usually
        for '--gpu-architecture' may be a 'real' architecture (such as a sm_50),
        --gpu-architecture=sm_50' is equivalent to 'nvcc --gpu-architecture=compute_50
        --gpu-code=sm_50,compute_50'.
        -arch=all         build for all supported architectures (sm_*), and add PTX
        -arch=all-major   build for just supported major versions (sm_*0), plus the
        -arch=native      build for all architectures (sm_*) on the current system
        'native','sm_50','sm_52','sm_53','sm_60','sm_61','sm_62','sm_70','sm_72',
        'sm_75','sm_80','sm_86','sm_87','sm_89','sm_90','sm_90a'.
        (such as sm_50), and PTX code for the 'virtual' architecture (such as compute_50).
        For instance, '--gpu-architecture=compute_60' is not compatible with '--gpu-code=sm_52',
        features that are not present o

In [None]:
!nvidia-smi

Tue Jul  8 22:59:47 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   63C    P8             10W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [None]:
%%writefile bitonic_sort.cu
#include <cerrno>
#include <chrono>
#include <climits>
#include <cstdlib>
#include <ctime>
#include <exception>
#include <iostream>
#include <stdexcept>
#include <string>
#include <vector>

#include <cuda.h>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>

using namespace std;

// Writes the first `size` elements of `arr` to standard output, each one
// followed by a single space, and then ends the line with a newline.
// A non-positive `size` prints only the newline.
void printArray(int* arr, int size){
  const int* cursor = arr;
  for (int remaining = size; remaining > 0; --remaining) {
    std::cout << *cursor++ << " ";
  }
  std::cout << "\n";
}

// Bitonic sort kernel: performs one compare-exchange step over `data`.
//
// Each thread derives a global index from blockDim/blockIdx/threadIdx and pairs
// it with the partner index (index XOR j). Only the lower-indexed thread of a
// pair does the work, so every pair is handled by exactly one thread.
// The direction is chosen by bit `k` of the index: ascending when
// (index & k) == 0, descending otherwise.
// Threads whose index or partner falls at or beyond `n` do nothing.
__global__ void bitonic_sort_kernel(int* data, int j, int k, int n) {
    const unsigned int idx = blockDim.x * blockIdx.x + threadIdx.x;
    if (idx < n) {
        const unsigned int partner = idx ^ j;
        if (partner > idx && partner < n) {
            const int lhs = data[idx];
            const int rhs = data[partner];
            const bool ascending = (idx & k) == 0;
            // Swap when the pair is out of order for the selected direction.
            if ((lhs > rhs) == ascending) {
                data[idx] = rhs;
                data[partner] = lhs;
            }
        }
    }
}

// Rounds n up to the next power of two.
//
// Returns the smallest power of two that is >= n. Any n <= 1 (including zero
// and negative values) yields 1.
//
// Throws std::overflow_error when that power of two does not fit in an int,
// i.e. for n > 2^30. Without this guard the left shift overflows the signed
// int and the loop can no longer reach a value >= n.
int sigtePotenciaDos(int n) {
    int ret = 1;
    while (ret < n) {
        // Doubling is only safe while ret <= INT_MAX / 2.
        if (ret > INT_MAX / 2) {
            throw std::overflow_error(
                "sigtePotenciaDos: la siguiente potencia de 2 no cabe en int");
        }
        ret <<= 1;
    }
    return ret;
}

// Turns a failed CUDA runtime status into a std::runtime_error that names the
// step that failed; does nothing on cudaSuccess.
static void checkCuda(cudaError_t status, const char* what) {
    if (status != cudaSuccess) {
        throw std::runtime_error(std::string(what) + ": " + cudaGetErrorString(status));
    }
}

// Sorts the first n elements of the host array h_data in ascending order
// using bitonic_sort_kernel on the GPU.
//
// The input is copied into a buffer whose length is rounded up to a power of
// two (sigtePotenciaDos) and padded with INT_MAX, so the padding ends up at
// the tail and the first n sorted values are the real data.
//
// Parameters:
//   h_data - host pointer to at least n ints; overwritten with the sorted data.
//   n      - number of elements to sort; n <= 1 returns immediately.
//
// Throws std::runtime_error if any CUDA runtime call or kernel launch fails.
// The device buffer is released on both the success and the failure path.
void bitonic_sort(int* h_data, int n) {
    if (n <= 1) return;  // zero or one element is already sorted

    const int padded_n = sigtePotenciaDos(n);
    const size_t bytes = static_cast<size_t>(padded_n) * sizeof(int);

    // Host staging buffer: real data followed by INT_MAX padding.
    std::vector<int> h_padded(h_data, h_data + n);
    h_padded.resize(static_cast<size_t>(padded_n), INT_MAX);

    int* d_data = nullptr;
    checkCuda(cudaMalloc(&d_data, bytes), "cudaMalloc");

    try {
        checkCuda(cudaMemcpy(d_data, h_padded.data(), bytes, cudaMemcpyHostToDevice),
                  "cudaMemcpy (host -> device)");

        const int hebras = 512;
        const int numBloques = (padded_n + hebras - 1) / hebras;

        // Launches issued to the same stream execute in issue order, so no
        // host-side synchronization is needed between steps; the blocking
        // device-to-host copy below waits for all of them.
        for (int k = 2; k <= padded_n; k <<= 1) {
            for (int j = k >> 1; j > 0; j >>= 1) {
                bitonic_sort_kernel<<<numBloques, hebras>>>(d_data, j, k, padded_n);
                // Launch-configuration errors are only visible through
                // cudaGetLastError().
                checkCuda(cudaGetLastError(), "bitonic_sort_kernel launch");
            }
        }

        checkCuda(cudaMemcpy(h_padded.data(), d_data, bytes, cudaMemcpyDeviceToHost),
                  "cudaMemcpy (device -> host)");
    } catch (...) {
        // Best-effort cleanup; the original failure is the one worth reporting.
        cudaFree(d_data);
        throw;
    }

    checkCuda(cudaFree(d_data), "cudaFree");

    // Copy back only the n real, sorted elements (drop the padding).
    for (int i = 0; i < n; i++) {
        h_data[i] = h_padded[i];
    }
}

// Program entry point.
//
// Usage: bitonic_sort <element_count>
//
// Fills an array of <element_count> pseudo-random ints in [1, 200], sorts it
// with bitonic_sort, and prints the elapsed wall-clock time of that call in
// seconds. Returns 0 on success and 1 when the argument is missing or invalid
// or when sorting throws.
int main(int argc, char** argv){
  if (argc < 2) {
    std::cerr << "Uso: " << (argc > 0 ? argv[0] : "bitonic_sort")
              << " <cantidad_de_elementos>\n";
    return 1;
  }

  // Largest count whose next power of two still fits in an int; the sort pads
  // its input up to a power of two.
  const long maxSize = 1L << 30;

  // Parse strictly: the whole argument must be a base-10 integer in range.
  errno = 0;
  char* end = nullptr;
  const long parsed = std::strtol(argv[1], &end, 10);
  if (end == argv[1] || *end != '\0' || errno == ERANGE ||
      parsed < 0 || parsed > maxSize) {
    std::cerr << "Cantidad de elementos invalida: " << argv[1] << "\n";
    return 1;
  }
  const int size = static_cast<int>(parsed);

  try {
    srand(time(NULL));
    vector<int> arr(size);  // owns the data; released automatically
    for (int& value : arr) value = 1 + rand() % 200;

    auto start=chrono::high_resolution_clock::now();
    bitonic_sort(arr.data(), size);
    auto finish=chrono::high_resolution_clock::now();
    auto duration=chrono::duration_cast<chrono::nanoseconds>(finish - start).count();
    cout<<"Tiempo demorado en ordenar arreglo: "<<duration/1000000000.0<<" s\n";
  } catch (const std::exception& e) {
    // Covers allocation failures and any error reported by the sort.
    std::cerr << "Error: " << e.what() << "\n";
    return 1;
  }
  return 0;
}

Overwriting bitonic_sort.cu


In [None]:
!nvcc -arch=sm_75 bitonic_sort.cu -o bitonic_sort

In [None]:
!./bitonic_sort 16

Arreglo original: 186 119 44 30 11 140 138 44 77 43 17 63 117 167 155 16 
Arreglo ordenado: 11 16 17 30 43 44 44 63 77 117 119 138 140 155 167 186 
