In [None]:
!nvidia-smi

#sudo apt update
#sudo apt install openmpi-bin openmpi-common libopenmpi-dev

Sat Feb  7 16:48:49 2026       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   51C    P8             12W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [38]:
%%writefile multiplication.cu

#include <bits/stdc++.h>
#include <cuda_runtime.h>
#include <time.h>
using namespace std;

// Square integer matrix product: c = a * b, with all three matrices stored
// row-major as flat n*n arrays in device memory.
//
// Launch layout: a 2D grid of 2D blocks in which each thread produces one
// element of c. The x dimension selects the column and the y dimension the
// row; the grid may overshoot n in either direction because surplus threads
// exit without touching memory.
//
//   a, b : input matrices (read only by this kernel)
//   c    : output matrix (every in-range element is overwritten)
//   n    : matrix side length
__global__ void multiplication(int *a, int *b, int *c, int n){
    const int col = threadIdx.x + blockDim.x * blockIdx.x;
    const int row = threadIdx.y + blockDim.y * blockIdx.y;

    // Threads that fall in the grid's padding have no element to compute.
    if(row >= n || col >= n){
        return;
    }

    // Dot product of row `row` of a with column `col` of b.
    const int *a_row = a + row * n;
    int acc = 0;
    for(int k = 0; k < n; ++k){
        acc += a_row[k] * b[k * n + col];
    }

    c[row * n + col] = acc;
}


// Host driver: multiplies two n x n all-ones matrices on the GPU with the
// `multiplication` kernel, times the kernel with CUDA events, prints the
// product matrix followed by the measured kernel time in milliseconds.
// Exits with a non-zero status if any CUDA runtime call fails.
int main(){
  // Fail fast with a readable message. CUDA errors are otherwise silent and
  // would simply show up as garbage in the printed matrix.
  auto check = [](cudaError_t status, const char *what){
    if(status != cudaSuccess){
      cerr << "CUDA error during " << what << ": "
           << cudaGetErrorString(status) << endl;
      std::exit(EXIT_FAILURE);
    }
  };

  const int n = 40;
  // Widen before multiplying so the element/byte counts cannot overflow int.
  const size_t elems = (size_t)n * (size_t)n;
  const size_t bytes = elems * sizeof(int);

  // Heap-backed, row-major host matrices (standard C++ has no VLAs, and large
  // n would overflow the stack). Inputs are all ones, so every element of the
  // product equals n.
  vector<int> a(elems, 1), b(elems, 1), c(elems, 0);

  int *d_a = nullptr, *d_b = nullptr, *d_c = nullptr;
  check(cudaMalloc(&d_a, bytes), "cudaMalloc(d_a)");
  check(cudaMalloc(&d_b, bytes), "cudaMalloc(d_b)");
  check(cudaMalloc(&d_c, bytes), "cudaMalloc(d_c)");

  check(cudaMemcpy(d_a, a.data(), bytes, cudaMemcpyHostToDevice), "copy of a to device");
  check(cudaMemcpy(d_b, b.data(), bytes, cudaMemcpyHostToDevice), "copy of b to device");

  // 4 x 16 = 64 threads per block; the grid is rounded up so it covers all
  // n x n outputs (the kernel discards the out-of-range threads).
  dim3 threads(4,16);
  dim3 blocks((n + threads.x - 1)/threads.x,(n + threads.y - 1)/threads.y);

  // Events bracket the kernel in the default stream to time it.
  cudaEvent_t start, stop;
  check(cudaEventCreate(&start), "cudaEventCreate(start)");
  check(cudaEventCreate(&stop), "cudaEventCreate(stop)");

  check(cudaEventRecord(start, 0), "cudaEventRecord(start)");
  multiplication<<<blocks, threads>>>(d_a, d_b, d_c, n);
  check(cudaEventRecord(stop, 0), "cudaEventRecord(stop)");
  // A kernel launch returns nothing; a bad configuration is only visible here.
  check(cudaGetLastError(), "kernel launch");

  // Wait for the stop event, i.e. for the kernel to finish; execution-time
  // faults inside the kernel are reported by this call.
  check(cudaEventSynchronize(stop), "cudaEventSynchronize(stop)");

  float milliseconds = 0;
  check(cudaEventElapsedTime(&milliseconds, start, stop), "cudaEventElapsedTime");

  check(cudaMemcpy(c.data(), d_c, bytes, cudaMemcpyDeviceToHost), "copy of c to host");

  check(cudaFree(d_a), "cudaFree(d_a)");
  check(cudaFree(d_b), "cudaFree(d_b)");
  check(cudaFree(d_c), "cudaFree(d_c)");

  check(cudaEventDestroy(start), "cudaEventDestroy(start)");
  check(cudaEventDestroy(stop), "cudaEventDestroy(stop)");

  for(int i = 0; i < n; i++){
    for(int j = 0; j < n; j++){
      cout << c[(size_t)i * n + j] << "  ";
    }
    cout << endl;
  }
  cout << "Kernel execution time: " << milliseconds << " ms" << endl;
  return 0;
}

Overwriting multiplication.cu


In [39]:
# Build multiplication.cu for compute capability 7.5 (compute_75 / sm_75),
# then run the resulting matrix2D binary.
!nvcc -gencode arch=compute_75,code=sm_75 multiplication.cu -o matrix2D
!./matrix2D

40  40  40  40  40  40  40  40  40  40  40  40  40  40  40  40  40  40  40  40  40  40  40  40  40  40  40  40  40  40  40  40  40  40  40  40  40  40  40  40  
40  40  40  40  40  40  40  40  40  40  40  40  40  40  40  40  40  40  40  40  40  40  40  40  40  40  40  40  40  40  40  40  40  40  40  40  40  40  40  40  
40  40  40  40  40  40  40  40  40  40  40  40  40  40  40  40  40  40  40  40  40  40  40  40  40  40  40  40  40  40  40  40  40  40  40  40  40  40  40  40  
40  40  40  40  40  40  40  40  40  40  40  40  40  40  40  40  40  40  40  40  40  40  40  40  40  40  40  40  40  40  40  40  40  40  40  40  40  40  40  40  
40  40  40  40  40  40  40  40  40  40  40  40  40  40  40  40  40  40  40  40  40  40  40  40  40  40  40  40  40  40  40  40  40  40  40  40  40  40  40  40  
40  40  40  40  40  40  40  40  40  40  40  40  40  40  40  40  40  40  40  40  40  40  40  40  40  40  40  40  40  40  40  40  40  40  40  40  40  40  40  40  
40  40  40  40  40  40  40  40  40

In [43]:
%%writefile phonebook_search.cu
#include <bits/stdc++.h>
#include <cuda_runtime.h>
using namespace std;

// Naive substring test: reports whether `pattern` occurs anywhere inside
// `text`. Lengths are passed explicitly, so neither buffer has to be
// NUL-terminated within the given length.
//
//   text, text_length       : haystack and the number of chars to consider
//   pattern, pattern_length : needle and its length
//
// Returns 1 when the pattern is found (an empty pattern always matches),
// otherwise 0.
__device__ int search_substring(const char *text, int text_length,
                                const char *pattern, int pattern_length){
    if(pattern_length == 0){
        return 1;
    }

    // Last index at which the pattern could still fit. It is negative when
    // the text is shorter than the pattern, so the scan below never starts.
    const int last_start = text_length - pattern_length;

    for(int start = 0; start <= last_start; ++start){
        int matched = 0;
        for(; matched < pattern_length; ++matched){
            if(text[start + matched] != pattern[matched]){
                break;
            }
        }
        if(matched == pattern_length){
            return 1;
        }
    }
    return 0;
}

// Kernel: one thread per phonebook record. Thread `id` scans record `id` for
// `pattern` and writes 1 (found) or 0 (not found) to result[id].
//
// Launch layout: 1D grid of 1D blocks; the grid may overshoot `count`
// because surplus threads return immediately.
//
//   phonebook  : records packed back to back, `lineWidth` bytes each; a
//                record shorter than lineWidth ends at its first '\0'
//   lineWidth  : size in bytes of one record slot
//   count      : number of records
//   pattern    : search string, `patternLen` chars long
//   result     : output array with one int per record
__global__ void search_phonebook(const char *phonebook, int lineWidth, int count,
                                 const char *pattern, int patternLen,
                                 int *result){
    int id = blockIdx.x * blockDim.x + threadIdx.x;
    if(id >= count) return;

    // Widen to 64 bits before multiplying: id * lineWidth evaluated in int
    // overflows once the packed buffer grows past 2 GiB.
    const char* line = phonebook + (long long)id * lineWidth;

    // Effective record length: up to the first '\0', capped at the slot size.
    int len = 0;
    while(len < lineWidth && line[len] != '\0') len++;

    result[id] = search_substring(line, len, pattern, patternLen);
}

// Host driver: loads phonebook1.txt, asks for a search string on stdin, runs
// the `search_phonebook` kernel with one thread per line, and prints every
// line that contains the search string. Returns 1 if the file cannot be
// opened and exits with a non-zero status if any CUDA runtime call fails.
int main(){
    // Fail fast with a readable message; an unchecked CUDA failure would
    // otherwise just produce an empty or garbage result list.
    auto check = [](cudaError_t status, const char *what){
        if(status != cudaSuccess){
            cerr << "CUDA error during " << what << ": "
                 << cudaGetErrorString(status) << "\n";
            std::exit(EXIT_FAILURE);
        }
    };

    string filename = "phonebook1.txt";
    ifstream file(filename);
    if(!file){
        cerr << "File not found\n";
        return 1;
    }

    vector<string> lines;
    string line;
    while(getline(file, line)){
        lines.push_back(line);
    }
    file.close();

    int count = (int)lines.size();
    if(count == 0){
        cout << "No lines\n";
        return 0;
    }

    // std::string + getline accepts input of any length; a fixed char buffer
    // would put cin into a failed state on over-long input.
    string name;
    cout << "Enter the name to search: ";
    getline(cin, name);
    int patternLen = (int)name.size();

    const int LINE_WIDTH = 100;

    // Pack the lines into fixed-width, zero-filled slots so the kernel can
    // locate record i at offset i * LINE_WIDTH. Copying at most
    // LINE_WIDTH - 1 chars keeps every slot NUL-terminated (longer lines are
    // truncated). Sizes are computed in size_t to avoid int overflow.
    vector<char> phonebook((size_t)count * LINE_WIDTH, 0);
    for(int i = 0; i < count; i++){
        strncpy(&phonebook[(size_t)i * LINE_WIDTH], lines[i].c_str(), LINE_WIDTH - 1);
    }

    const size_t resultBytes = (size_t)count * sizeof(int);

    char *d_phonebook = nullptr, *d_pattern = nullptr;
    int *d_result = nullptr;

    check(cudaMalloc(&d_phonebook, phonebook.size()), "cudaMalloc(d_phonebook)");
    check(cudaMalloc(&d_pattern, (size_t)patternLen + 1), "cudaMalloc(d_pattern)");
    check(cudaMalloc(&d_result, resultBytes), "cudaMalloc(d_result)");

    check(cudaMemcpy(d_phonebook, phonebook.data(), phonebook.size(), cudaMemcpyHostToDevice),
          "copy of phonebook to device");
    // +1 also copies the terminating '\0' of the search string.
    check(cudaMemcpy(d_pattern, name.c_str(), (size_t)patternLen + 1, cudaMemcpyHostToDevice),
          "copy of pattern to device");

    int threads = 256;
    int blocks = (count + threads - 1) / threads;   // ceil(count / threads)

    search_phonebook<<<blocks, threads>>>(d_phonebook, LINE_WIDTH, count,
                                          d_pattern, patternLen, d_result);
    // A launch returns nothing: configuration errors surface through
    // cudaGetLastError, in-kernel faults through the following synchronize.
    check(cudaGetLastError(), "kernel launch");
    check(cudaDeviceSynchronize(), "kernel execution");

    vector<int> result(count);
    check(cudaMemcpy(result.data(), d_result, resultBytes, cudaMemcpyDeviceToHost),
          "copy of result to host");

    for(int i = 0; i < count; i++){
        if(result[i]){
            cout << lines[i] << "\n";
        }
    }

    check(cudaFree(d_phonebook), "cudaFree(d_phonebook)");
    check(cudaFree(d_pattern), "cudaFree(d_pattern)");
    check(cudaFree(d_result), "cudaFree(d_result)");
    return 0;
}

Overwriting phonebook_search.cu


In [44]:
# Build phonebook_search.cu with -O2 for compute capability 7.5
# (compute_75 / sm_75), then run it; the program prompts for the search string.
!nvcc -O2 -gencode arch=compute_75,code=sm_75 phonebook_search.cu -o phonebook_search
!./phonebook_search


Enter the name to search: DOL
"MARIYA AZAD DOLA","016 14 266"
"DOLA RANI DEY","016 64 310"
"DOLON RANI DAS","014 38 502"
"DOLA BEGUM","016 45 463"
"NIPA RANI MONDOL","012 68 453"
