<a href="https://colab.research.google.com/github/sidhant82/Blood-Bank-MERN-Stack-Project/blob/main/Cricket_csv.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%%writefile cricket_csv.cu

#include<stdio.h>   // printf, fprintf, FILE, fopen, fgets, fclose
#include<stdlib.h>
#include<string.h>  // strtok
#include<strings.h>
#include<chrono>
#include<cmath> // for pow and sqrt
#include<iostream>
#include<cuda_runtime.h>


#define MAX_LINE_LENGTH 5000
#define MAX_RECORDS 100000


// CUDA kernel that accumulates the four sums needed for a least-squares line
// fit of runs (y) against balls (x): sum(x), sum(y), sum(x*y) and sum(x*x).
//
// Launch layout: 1-D grid of 1-D blocks with blockDim.x <= 1024, because the
//   static shared arrays below hold one slot per thread. Any grid size gives a
//   correct result (each thread steps through the input by the total number of
//   launched threads), and blockDim.x does not have to be a power of two.
// Shared memory: 4 * 1024 floats of static shared memory per block.
// Preconditions:
//   - runs and balls each point to at least n floats of device memory;
//   - x_sum, y_sum, xy_sum and xx_sum point to device floats that the caller
//     has already initialised (normally to 0); the kernel only adds to them.
// Each block adds its partial result with one atomicAdd per sum, so the float
// rounding of the final totals may depend on the order in which blocks finish.
__global__ void linear_regression(const float *runs, const float *balls, float *x_sum, float *y_sum, float *xy_sum, float *xx_sum, int n){
    //shared memory for partial reductions within each block
    __shared__ float local_x_sum[1024];
    __shared__ float local_y_sum[1024];
    __shared__ float local_xy_sum[1024];
    __shared__ float local_xx_sum[1024];

    const unsigned int tid = threadIdx.x;

    //Per-thread partial sums are kept in registers while walking the input
    float x_acc = 0.0f;
    float y_acc = 0.0f;
    float xy_acc = 0.0f;
    float xx_acc = 0.0f;

    //size_t indexing avoids overflow in blockIdx.x * blockDim.x on large grids
    const size_t total = n > 0 ? (size_t)n : 0;
    const size_t step = (size_t)gridDim.x * blockDim.x;
    for(size_t i = (size_t)blockIdx.x * blockDim.x + tid; i < total; i += step){
        const float x = balls[i];
        const float y = runs[i];
        x_acc += x;
        y_acc += y;
        xy_acc += y * x;
        xx_acc += x * x;
    }

    //Publish the per-thread results; threads with no elements publish zeros
    local_x_sum[tid] = x_acc;
    local_y_sum[tid] = y_acc;
    local_xy_sum[tid] = xy_acc;
    local_xx_sum[tid] = xx_acc;

    //All slots must be written before any thread reads a neighbour's slot
    __syncthreads();

    //Tree reduction within the block. The first stride is half of the
    //smallest power of two >= blockDim.x, and the upper-bound test skips
    //partner slots that do not exist, so odd block sizes lose no elements.
    unsigned int stride = 1;
    while(stride < blockDim.x){
        stride <<= 1;
    }
    for(stride >>= 1; stride > 0; stride >>= 1){
        if(tid < stride && tid + stride < blockDim.x){
            local_x_sum[tid] += local_x_sum[tid + stride];
            local_y_sum[tid] += local_y_sum[tid + stride];
            local_xy_sum[tid] += local_xy_sum[tid + stride];
            local_xx_sum[tid] += local_xx_sum[tid + stride];
        }
        //Barrier is outside the branch so every thread of the block reaches it
        __syncthreads();
    }

    //Accumulate the block-level sums into global memory (one atomic per sum)
    if(tid == 0)
    {
        atomicAdd(x_sum, local_x_sum[0]);
        atomicAdd(y_sum, local_y_sum[0]);
        atomicAdd(xy_sum, local_xy_sum[0]);
        atomicAdd(xx_sum, local_xx_sum[0]);
    }
}


// Stops the program with a diagnostic when a CUDA runtime call reports an error.
// `what` names the operation so the message identifies the failing step.
static void check_cuda(cudaError_t status, const char *what)
{
    if (status != cudaSuccess)
    {
        fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Extracts one record from a CSV line: the 2nd comma-separated field is read
// as runs and the 4th as balls faced; the 1st and 3rd fields are skipped
// (presumably player name and not-out count -- confirm against the CSV header).
// Returns false, leaving the outputs untouched, when the line has fewer than
// four fields. strtok() modifies `line` in place and treats consecutive commas
// as a single separator, so an empty field shifts the columns that follow it.
static bool parse_record(char *line, float *runs, float *balls)
{
    if (strtok(line, ",") == NULL) // 1st field, discarded
    {
        return false;
    }
    char *runs_field = strtok(NULL, ",");  // 2nd field
    char *skipped_field = strtok(NULL, ","); // 3rd field, discarded
    char *balls_field = strtok(NULL, ","); // 4th field
    if (runs_field == NULL || skipped_field == NULL || balls_field == NULL)
    {
        return false;
    }
    *runs = (float)atof(runs_field);
    *balls = (float)atof(balls_field);
    return true;
}

// Reads up to MAX_RECORDS (runs, balls) pairs from cricket.csv, computes the
// regression sums on the GPU with linear_regression, then derives slope,
// intercept, MSE, RMSE and R-squared on the host and prints them.
// Returns EXIT_SUCCESS on success and EXIT_FAILURE when the file cannot be
// read, contains no usable records, or a line cannot be fitted.
int main()
{
    printf("*** program to run linear regression on ODI Cricket Dataset ***\n");
    FILE *fp = fopen("cricket.csv", "r");
    if (fp == NULL)
    {
        printf("Error opening file\n");

        return EXIT_FAILURE;
    }

    //Allocate dynamic memory for runs and balls columns
    float *h_runs = (float *)malloc(MAX_RECORDS * sizeof(float));
    float *h_balls = (float *)malloc(MAX_RECORDS * sizeof(float));

    if (h_runs == NULL || h_balls == NULL)
    {
        printf("failed to allocate memory\n");
        free(h_runs); // free(NULL) is a no-op, so this is safe for either failure
        free(h_balls);
        fclose(fp);
        return EXIT_FAILURE;
    }

    char line[MAX_LINE_LENGTH];
    long record_count = 0;

    //Skip the header line, then read the Runs and Balls columns from the csv file
    if (fgets(line, MAX_LINE_LENGTH, fp) != NULL)
    {
        while (record_count < MAX_RECORDS && fgets(line, MAX_LINE_LENGTH, fp) != NULL)
        {
            //Lines with too few fields (e.g. a trailing blank line) are skipped
            if (parse_record(line, &h_runs[record_count], &h_balls[record_count]))
            {
                record_count++;
            }
        }
    }
    fclose(fp);

    //Without data there is nothing to launch and every formula below divides by n
    if (record_count == 0)
    {
        printf("No usable records found in file\n");
        free(h_runs);
        free(h_balls);
        return EXIT_FAILURE;
    }

    const size_t data_bytes = (size_t)record_count * sizeof(float);

    //Allocate memory on the GPU
    float *d_runs, *d_balls;
    float *d_x_sum, *d_y_sum, *d_xy_sum, *d_xx_sum;

    check_cuda(cudaMalloc((void **)&d_runs, data_bytes), "cudaMalloc d_runs");
    check_cuda(cudaMalloc((void **)&d_balls, data_bytes), "cudaMalloc d_balls");
    check_cuda(cudaMalloc((void **)&d_x_sum, sizeof(float)), "cudaMalloc d_x_sum");
    check_cuda(cudaMalloc((void **)&d_y_sum, sizeof(float)), "cudaMalloc d_y_sum");
    check_cuda(cudaMalloc((void **)&d_xy_sum, sizeof(float)), "cudaMalloc d_xy_sum");
    check_cuda(cudaMalloc((void **)&d_xx_sum, sizeof(float)), "cudaMalloc d_xx_sum");

    //copy data from host to device
    check_cuda(cudaMemcpy(d_runs, h_runs, data_bytes, cudaMemcpyHostToDevice), "copy runs to device");
    check_cuda(cudaMemcpy(d_balls, h_balls, data_bytes, cudaMemcpyHostToDevice), "copy balls to device");

    //Initialize the sums on the device (all-zero bytes are 0.0f)
    check_cuda(cudaMemset(d_x_sum, 0, sizeof(float)), "zero d_x_sum");
    check_cuda(cudaMemset(d_y_sum, 0, sizeof(float)), "zero d_y_sum");
    check_cuda(cudaMemset(d_xy_sum, 0, sizeof(float)), "zero d_xy_sum");
    check_cuda(cudaMemset(d_xx_sum, 0, sizeof(float)), "zero d_xx_sum");

    const int blockSize = 1024;
    const int numBlocks = (int)((record_count + blockSize - 1) / blockSize);

    auto start_time = std::chrono::high_resolution_clock::now();

    linear_regression<<<numBlocks, blockSize>>>(d_runs, d_balls, d_x_sum, d_y_sum, d_xy_sum, d_xx_sum, (int)record_count);
    check_cuda(cudaGetLastError(), "kernel launch");
    //The launch is asynchronous: wait for completion BEFORE stopping the clock,
    //otherwise only the launch overhead is measured
    check_cuda(cudaDeviceSynchronize(), "kernel execution");
    auto end_time = std::chrono::high_resolution_clock::now();

    auto duration_ns = std::chrono::duration_cast<std::chrono::nanoseconds>(end_time - start_time).count();

    std::cout << "Time taken by GPU for regression: " << duration_ns /1000 << " microseconds" << std::endl;

    //Copy the sums back to host
    float h_x_sum, h_y_sum, h_xy_sum, h_xx_sum;
    check_cuda(cudaMemcpy(&h_x_sum, d_x_sum, sizeof(float), cudaMemcpyDeviceToHost), "copy x_sum to host");
    check_cuda(cudaMemcpy(&h_y_sum, d_y_sum, sizeof(float), cudaMemcpyDeviceToHost), "copy y_sum to host");
    check_cuda(cudaMemcpy(&h_xy_sum, d_xy_sum, sizeof(float), cudaMemcpyDeviceToHost), "copy xy_sum to host");
    check_cuda(cudaMemcpy(&h_xx_sum, d_xx_sum, sizeof(float), cudaMemcpyDeviceToHost), "copy xx_sum to host");

    //Host-side arithmetic is done in double so that products such as
    //n * sum(xy) and the long error accumulations do not lose precision
    const double n = (double)record_count;
    const double sum_x = h_x_sum;
    const double sum_y = h_y_sum;
    const double sum_xy = h_xy_sum;
    const double sum_xx = h_xx_sum;

    int exit_code = EXIT_SUCCESS;
    const double denominator = n * sum_xx - sum_x * sum_x;

    if (denominator == 0.0)
    {
        //Happens when every balls value is identical (including a single record)
        printf("Cannot fit a line: the Balls values have no variance\n");
        exit_code = EXIT_FAILURE;
    }
    else
    {
        //compute slope and intercept
        const double slope = (n * sum_xy - sum_x * sum_y) / denominator;
        const double intercept = (sum_y - slope * sum_x) / n;

        //Print the results
        printf("Slope: %.2f\n", slope);
        printf("Intercept: %.2f\n", intercept);

        //performance evaluation metrics
        double ss_residual = 0.0; //Residual sum of squares
        double y_total = 0.0;
        for (long i = 0; i < record_count; ++i)
        {
            const double residual = h_runs[i] - (slope * h_balls[i] + intercept);
            ss_residual += residual * residual;
            y_total += h_runs[i];
        }

        const double mse = ss_residual / n; //Mean squared error
        const double rmse = sqrt(mse);      //Root mean squared error
        const double y_mean = y_total / n;  //Average of actual values

        //Compute R-squared
        double ss_total = 0.0; //Total sum of squares
        for (long i = 0; i < record_count; ++i)
        {
            const double deviation = h_runs[i] - y_mean;
            ss_total += deviation * deviation;
        }

        //If every runs value is identical ss_total is 0 and this prints nan
        const double r_squared = 1.0 - (ss_residual / ss_total);

        //Print evaluation metrics
        printf("Mean Squared Error (MSE): %.2f\n", mse);
        printf("Root Mean Squared Error (RMSE): %.2f\n", rmse);
        printf("R-squared: %.2f\n", r_squared);
    }

    //free allocated memory
    check_cuda(cudaFree(d_runs), "cudaFree d_runs");
    check_cuda(cudaFree(d_balls), "cudaFree d_balls");
    check_cuda(cudaFree(d_x_sum), "cudaFree d_x_sum");
    check_cuda(cudaFree(d_y_sum), "cudaFree d_y_sum");
    check_cuda(cudaFree(d_xy_sum), "cudaFree d_xy_sum");
    check_cuda(cudaFree(d_xx_sum), "cudaFree d_xx_sum");

    free(h_runs);
    free(h_balls);

    return exit_code;
}

