In [None]:
!nvcc --help | grep sm_

        target. The options -dlto -arch=sm_NN will add a lto_NN target; if you want
        to only add a lto_NN target and not the compute_NN that -arch=sm_NN usually
        for '--gpu-architecture' may be a 'real' architecture (such as a sm_50),
        --gpu-architecture=sm_50' is equivalent to 'nvcc --gpu-architecture=compute_50
        --gpu-code=sm_50,compute_50'.
        -arch=all         build for all supported architectures (sm_*), and add PTX
        -arch=all-major   build for just supported major versions (sm_*0), plus the
        -arch=native      build for all architectures (sm_*) on the current system
        'native','sm_50','sm_52','sm_53','sm_60','sm_61','sm_62','sm_70','sm_72',
        'sm_75','sm_80','sm_86','sm_87','sm_89','sm_90','sm_90a'.
        (such as sm_50), and PTX code for the 'virtual' architecture (such as compute_50).
        For instance, '--gpu-architecture=compute_60' is not compatible with '--gpu-code=sm_52',
        features that are not present o

In [None]:
!nvidia-smi

Tue Jul  8 16:24:56 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   51C    P8             10W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [None]:
%%writefile merge_sort.cu
#include <cerrno>
#include <chrono>
#include <climits>
#include <cstdlib>
#include <ctime>
#include <iostream>
#include <utility>
#include <vector>

#include <cuda.h>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>

using namespace std;

// Prints the first `size` entries of `arr` to standard output on a single
// line. Every value is followed by one space, and the line is closed with a
// newline. Nothing but the newline is printed when `size` is not positive.
void printArray(int* arr, int size){
  int pos=0;
  while(pos<size){
    cout<<arr[pos]<<" ";
    ++pos;
  }
  cout<<"\n";
}

/**
 * One pass of a bottom-up merge sort.
 *
 * Thread number `t` (flat 1D index over a 1D grid of 1D blocks) merges the two
 * adjacent sorted runs in[2*t*width, 2*t*width+width) and
 * in[2*t*width+width, 2*t*width+2*width) into the same range of `out`. Both
 * runs are clipped to `size`, so a short or missing right-hand run is handled.
 * Threads whose first run would start at or beyond `size` exit immediately,
 * so the grid may be rounded up freely. No shared memory is used.
 *
 * The merge is stable: on equal keys the element of the left run is taken
 * first.
 *
 * @param in     input array whose consecutive runs of length `width` are
 *               already sorted
 * @param out    output array; it is written while `in` is still being read,
 *               so it must not be the same buffer as `in`
 * @param width  length of each sorted input run; expected to be positive
 * @param size   number of elements in `in` and `out`
 */
__global__ void merge(int* in, int* out, int width, int size){
  // The index arithmetic is done in 64 bits on purpose. The launch rounds the
  // grid up to whole blocks, so in late passes (large `width`) the surplus
  // threads compute 2*indice*width far beyond INT_MAX. In 32-bit arithmetic
  // that product wraps and can slip past the bounds check below.
  const long long indice=(long long)blockIdx.x*blockDim.x+threadIdx.x;
  const long long inicio=2LL*indice*width;
  if(inicio>=size) return;

  // inicio < size <= INT_MAX from here on, so the narrowing casts are safe
  // once both ends have been clipped to `size`.
  const long long finIzq=inicio+width;
  const long long finDer=inicio+2LL*width;
  const int mid=finIzq<size ? (int)finIzq : size;
  const int fin=finDer<size ? (int)finDer : size;

  int i=(int)inicio,j=mid,k=(int)inicio;
  // Standard two-way merge; `<=` keeps equal elements in their input order.
  while(i<mid && j<fin){
    if(in[i]<=in[j]){
      out[k]=in[i];
      i++;
    }else{
      out[k]=in[j];
      j++;
    }
    k++;
  }
  // Copy whatever is left of the left run...
  while(i<mid){
    out[k]=in[i];
    i++;
    k++;
  }
  // ...and of the right run (at most one of these two loops does any work).
  while(j<fin){
    out[k]=in[j];
    j++;
    k++;
  }
}

// Terminates the program with a diagnostic when a CUDA runtime call failed.
static void checkCuda(cudaError_t estado, const char* operacion){
  if(estado!=cudaSuccess){
    cerr<<"Error CUDA en "<<operacion<<": "<<cudaGetErrorString(estado)<<"\n";
    exit(EXIT_FAILURE);
  }
}

/**
 * Sorts `arr` in ascending order on the GPU with a bottom-up merge sort.
 *
 * The data is copied to the device, merged in passes of doubling run width
 * (1, 2, 4, ...) while ping-ponging between two device buffers, and copied
 * back into `arr`. Any CUDA failure is reported on stderr and ends the
 * program, instead of silently leaving `arr` unsorted or partially sorted.
 *
 * @param arr   host array to sort in place
 * @param size  number of elements in `arr`; sizes of 0 or 1 (and a null
 *              `arr`) are returned untouched since there is nothing to sort
 */
void mergeSort(int* arr, int size){
  if(arr==nullptr || size<=1) return;

  const size_t bytes=static_cast<size_t>(size)*sizeof(int);
  int *d_in=nullptr,*d_out=nullptr;
  checkCuda(cudaMalloc(&d_in,bytes),"cudaMalloc(d_in)");
  checkCuda(cudaMalloc(&d_out,bytes),"cudaMalloc(d_out)");
  checkCuda(cudaMemcpy(d_in,arr,bytes,cudaMemcpyHostToDevice),"cudaMemcpy host->device");

  const int hebras=256;
  // `width` and the per-block element count are 64-bit: 2*width*hebras
  // exceeds INT_MAX once width reaches 2^22, and width*=2 itself would
  // overflow an int for size > 2^30.
  for(long long width=1;width<size;width*=2){
    // One thread per pair of runs, i.e. each block covers 2*width*hebras elements.
    const long long porBloque=2*width*hebras;
    const int bloques=static_cast<int>((size+porBloque-1)/porBloque);
    merge<<<bloques,hebras>>>(d_in,d_out,static_cast<int>(width),size);
    // Launch-configuration errors are only visible through cudaGetLastError.
    checkCuda(cudaGetLastError(),"lanzamiento de merge");
    // No per-pass device synchronisation: every launch goes to the same
    // (default) stream, so passes execute in issue order, and the blocking
    // cudaMemcpy below waits for the last one and reports execution errors.
    swap(d_in,d_out);
  }

  // After the final swap d_in holds the fully merged result.
  checkCuda(cudaMemcpy(arr,d_in,bytes,cudaMemcpyDeviceToHost),"cudaMemcpy device->host");
  checkCuda(cudaFree(d_in),"cudaFree(d_in)");
  checkCuda(cudaFree(d_out),"cudaFree(d_out)");
}

/**
 * Entry point: sorts an array of random integers on the GPU and prints the
 * elapsed wall-clock time in seconds.
 *
 * Usage: <program> <number_of_elements>
 *
 * The element count must be a positive decimal integer that fits in an int.
 * On a missing or invalid argument a usage message is written to stderr and
 * the program exits with a failure status.
 */
int main(int argc, char** argv){
  if(argc<2){
    cerr<<"Uso: "<<(argc>0 ? argv[0] : "merge_sort")<<" <cantidad_de_elementos>\n";
    return EXIT_FAILURE;
  }

  // strtol instead of atoi so that non-numeric text, trailing garbage and
  // out-of-range values are detected rather than silently misparsed.
  errno=0;
  char* finNumero=nullptr;
  const long pedido=strtol(argv[1],&finNumero,10);
  if(finNumero==argv[1] || *finNumero!='\0' || errno==ERANGE || pedido<=0 || pedido>INT_MAX){
    cerr<<"Cantidad de elementos invalida: "<<argv[1]<<"\n";
    return EXIT_FAILURE;
  }
  const int size=static_cast<int>(pedido);

  srand(static_cast<unsigned>(time(nullptr)));
  // Values are drawn from 1..200.
  vector<int> arr(size);
  for(int i=0;i<size;i++) arr[i]=1+rand()%200;

  // The timed region is the whole mergeSort call, so it includes the device
  // allocations and both host<->device copies performed there.
  // NOTE(review): these are the first CUDA calls of the process, so the
  // measurement presumably also contains the one-time CUDA runtime start-up.
  auto start=chrono::high_resolution_clock::now();
  mergeSort(arr.data(),size);
  auto finish=chrono::high_resolution_clock::now();
  auto duration=chrono::duration_cast<chrono::nanoseconds>(finish - start).count();
  cout<<"Tiempo demorado en ordenar arreglo: "<<duration/1000000000.0<<" s\n";
  return 0;
}

Overwriting merge_sort.cu


In [None]:
!nvcc -arch=sm_75 merge_sort.cu -o merge_sort

In [None]:
!./merge_sort 64

Tiempo demorado en ordenar arreglo: 0.106912 s
