                                                                        Question 1

!
Runs a single shell command.

Example
!nvidia-smi
!ls
!pip install numpy

%
Runs a line magic. It applies only to the rest of that one line.

Common ones
%time x = sum(a)
%pwd
%cd /content

%%
Runs a cell magic. It must be the first line of the cell and applies to the whole cell.

Common ones
%%time
%%bash
%%writefile test.cu

Example
%%bash
nvcc hello.cu -o hello
./hello

                                                                            Question 2

In [3]:
!nvidia-smi

Mon Feb  2 10:58:33 2026       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.161.08             Driver Version: 535.161.08   CUDA Version: 12.6     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA H100 80GB HBM3          On  | 00000000:D1:00.0 Off |                   On |
| N/A   25C    P0              69W / 700W |                  N/A |     N/A      Default |
|                                         |                      |              Enabled |
+-----------------------------------------+----------------------+----------------------+

+------------------------------------------------------------------

In [4]:
!nvidia-smi --query-gpu=utilization.gpu --format=csv

utilization.gpu [%]
[N/A]


                                                                            Question 3

Zero output from kernel
Causes
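Kernel was never launched, or the launch failed silently.
Launch configuration inside <<<grid, block>>> is wrong.
Device-side printf buffer was not flushed before the program exited.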

Fix
Call cudaDeviceSynchronize() after the kernel launch.
Check the grid and block sizes.

Incorrect indexing
Cause
The global thread ID is computed incorrectly (for example, using only threadIdx.x when more than one block is launched).

Fix
Always compute
int id = blockIdx.x * blockDim.x + threadIdx.x;
Guard with
if (id < N)

PTX or invalid device function errors
Cause
GPU architecture mismatch.
Wrong compute capability.

Fix
Compile for the architecture of the GPU you are actually running on. Example (sm_75 targets a T4; an H100 would be sm_90)
nvcc -arch=sm_75 file.cu

                                                                            Question 4

In [33]:
%%writefile hello_threads.cu

#include <stdio.h>
#include <cuda_runtime.h>

// Kernel: every launched thread prints its own global (grid-wide) thread index.
// Takes no arguments; the only effect is one device-side printf line per thread.
__global__ void mykernel(void)
{
    // Index of the first thread of this block, counted across the whole grid.
    const unsigned int block_start = blockIdx.x * blockDim.x;
    // Add this thread's position inside its block to get the global index.
    const int tid = block_start + threadIdx.x;
    printf("Hello from GPU thread %d\n", tid);
}

// Host entry point: launches mykernel with 1 block of 8 threads and waits for it.
// Returns 0 on success, 1 if the launch or the kernel execution reported a CUDA error.
int main()
{
    mykernel<<<1, 8>>>();

    // A kernel launch returns nothing, so a bad launch (wrong configuration,
    // architecture mismatch, ...) is only visible through cudaGetLastError().
    cudaError_t status = cudaGetLastError();

    // Synchronizing waits for the kernel to finish and flushes the device-side
    // printf buffer; it also surfaces errors raised while the kernel was running.
    if (status == cudaSuccess)
    {
        status = cudaDeviceSynchronize();
    }

    if (status != cudaSuccess)
    {
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(status));
        return 1;
    }
    return 0;
}

Overwriting hello_threads.cu


In [34]:
!nvcc hello_threads.cu -o hello_threads

In [35]:
!./hello_threads

Hello from GPU thread 0
Hello from GPU thread 1
Hello from GPU thread 2
Hello from GPU thread 3
Hello from GPU thread 4
Hello from GPU thread 5
Hello from GPU thread 6
Hello from GPU thread 7


                                                                            Question 5

In [41]:
%%writefile memory.cu
#include <stdio.h>
#include <cuda_runtime.h>

// Device kernel: each thread prints the element of d_arr found at its own
// index within the block (threadIdx.x).
// d_arr must point to device memory holding at least blockDim.x ints.
__global__ void printDeviceData(int *d_arr)
{
    const int tid = threadIdx.x;
    const int value = d_arr[tid];
    printf("GPU thread %d sees value %d\n", tid, value);
}

// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t status, const char *what)
{
    if (status == cudaSuccess)
    {
        return true;
    }
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
    return false;
}

// Host entry point: copies a small host array to the device, lets the kernel
// print it from the GPU, copies it back and prints it from the CPU.
// Returns 0 on success, 1 if any CUDA call failed.
int main()
{
    // Single source of truth for the element count and the byte size.
    const int N = 5;
    const size_t bytes = N * sizeof(int);

    int h_arr[N] = {10, 20, 30, 40, 50};

    int *d_arr = NULL;
    if (!cudaOk(cudaMalloc((void**)&d_arr, bytes), "cudaMalloc"))
    {
        return 1;
    }

    bool ok = cudaOk(cudaMemcpy(d_arr, h_arr, bytes, cudaMemcpyHostToDevice),
                     "cudaMemcpy (host to device)");

    if (ok)
    {
        // One block of N threads: one thread per array element.
        printDeviceData<<<1, N>>>(d_arr);

        // The launch itself returns nothing; query the launch error first, then
        // synchronize to wait for the kernel and flush its printf output.
        ok = cudaOk(cudaGetLastError(), "kernel launch")
             && cudaOk(cudaDeviceSynchronize(), "cudaDeviceSynchronize");
    }

    if (ok)
    {
        ok = cudaOk(cudaMemcpy(h_arr, d_arr, bytes, cudaMemcpyDeviceToHost),
                    "cudaMemcpy (device to host)");
    }

    if (ok)
    {
        printf("CPU sees:\n");
        for (int i = 0; i < N; i++)
        {
            printf("%d ", h_arr[i]);
        }
        printf("\n");
    }

    // Always release the device buffer, including on the error paths above.
    cudaFree(d_arr);
    return ok ? 0 : 1;
}


Overwriting memory.cu


In [42]:
!nvcc memory.cu -o memory

In [43]:
!./memory

GPU thread 0 sees value 10
GPU thread 1 sees value 20
GPU thread 2 sees value 30
GPU thread 3 sees value 40
GPU thread 4 sees value 50
CPU sees:
10 20 30 40 50 


                                                                                Question 6

In [44]:
import time
import numpy as np

N = 10_000_000


def timed_loop_sum(values):
    """Sum ``values`` with an explicit Python ``for`` loop and time it.

    Args:
        values: Any iterable of numbers (list, tuple, ...).

    Returns:
        ``(total, seconds)`` where ``seconds`` is the duration of the loop
        measured with ``time.perf_counter``.
    """
    start = time.perf_counter()
    total = 0
    for x in values:
        total += x
    return total, time.perf_counter() - start


def timed_numpy_sum(array):
    """Sum a NumPy array with ``array.sum()`` and time it.

    Returns:
        ``(total, seconds)`` where ``seconds`` is the duration of the call
        measured with ``time.perf_counter``.
    """
    start = time.perf_counter()
    total = array.sum()
    return total, time.perf_counter() - start


# In a notebook kernel ``__name__`` is "__main__", so the benchmark runs exactly
# as before and lst / tup / arr / s remain available as globals. The guard only
# keeps the 10M-element run from firing if this code is imported as a module.
if __name__ == "__main__":
    lst = list(range(N))
    tup = tuple(range(N))
    # Explicit int64: the default integer dtype is 32-bit on some platforms,
    # where the sum of 0..N-1 (about 5e13) would overflow.
    arr = np.arange(N, dtype=np.int64)

    # Closed form of 0 + 1 + ... + (N - 1); every variant must reproduce it.
    expected = N * (N - 1) // 2

    s, elapsed = timed_loop_sum(lst)
    assert s == expected, "list sum mismatch"
    print("List time:", elapsed)

    s, elapsed = timed_loop_sum(tup)
    assert s == expected, "tuple sum mismatch"
    print("Tuple time:", elapsed)

    s, elapsed = timed_numpy_sum(arr)
    assert s == expected, "NumPy sum mismatch"
    print("NumPy time:", elapsed)


List time: 0.46729469299316406
Tuple time: 0.4618096351623535
NumPy time: 0.0069561004638671875
