In [1]:
! pip install mpi4py

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting mpi4py
  Downloading mpi4py-3.1.4.tar.gz (2.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m19.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: mpi4py
  Building wheel for mpi4py (pyproject.toml) ... [?25l[?25hdone
  Created wheel for mpi4py: filename=mpi4py-3.1.4-cp39-cp39-linux_x86_64.whl size=3380630 sha256=6017256dd0fe6b7dbf2307b3438e71062c5bb0d10b9b9740452ba88c6d1fb128
  Stored in directory: /root/.cache/pip/wheels/db/81/9f/43a031fce121c845baca1c5d9a1468cad98208286aa2832de9
Successfully built mpi4py
Installing collected packages: mpi4py
Successfully installed mpi4py-3.1.4


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Point to Point

In [3]:
%%writefile LB4Point2Point.py

from mpi4py import MPI
from sklearn.preprocessing import MinMaxScaler
import pandas as pd
import numpy as np
import timeit
import warnings

warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=RuntimeWarning)

def create_clusters(centroids, num_clusters, data):
    num_data_points, _ = np.shape(data)
    cluster_idx = np.empty(num_data_points)
    for point in range(num_data_points):
        cluster_idx[point] = closest_centroid(data[point], centroids, num_clusters)
    return cluster_idx

def compute_means(cluster_idx, num_clusters, data):
    centroids = np.empty((num_clusters, data.shape[1]))
    for k in range(num_clusters):
        cluster_points = data[cluster_idx == k] 
        centroids[k] = np.mean(cluster_points, axis=0)
    return centroids

def euclidean_distance(x1, x2):
    squared_distance = np.sum(np.power(x1 - x2, 2))
    distance = np.sqrt(squared_distance)
    return distance

def closest_centroid(data, centroids, num_clusters):
    distances = [euclidean_distance(data, centroid) for centroid in centroids]
    closest_idx = np.argmin(distances)
    return closest_idx  

def initialize_random_centroids(num_clusters, data):
    m, n = np.shape(data)
    centroids = data[np.random.choice(m, size=num_clusters, replace=False)]
    return centroids

def run_Kmeans(num_clusters, data, max_iterations=50):
    centroids = initialize_random_centroids(num_clusters, data)
    for _ in range(max_iterations):
        clusters = create_clusters(centroids, num_clusters, data)
        previous_centroids = centroids
        centroids = compute_means(clusters, num_clusters, data)
        if np.allclose(previous_centroids, centroids):
            return clusters, centroids
    return clusters, centroids

def score_within_cluster_dispersion(cluster, data_clusters):
    cluster_data = data_clusters[cluster]
    cluster_size = cluster_data.shape[0]
    cluster_dispersion = np.sum(np.var(cluster_data, ddof=0, axis=0))
    return cluster_size * cluster_dispersion

def calculate_index(data):
    data_clusters = {}
    data_features = data.copy()    
    label_target = 'XCoord'
    data_target = data_features.pop(label_target)
    labels_clusters = np.unique(data_target)
    num_clusters = len(labels_clusters)

    data_frame = data.copy()

    for cluster in labels_clusters:
        data_clusters[cluster] = data_frame[data_frame[label_target] == cluster].drop(columns=label_target)

    num_observation_for_specific_cluster = {cluster: len(data_clusters[cluster])
             for cluster in labels_clusters}
    
    B = pd.DataFrame()
    data_centroids = data_frame.groupby(by=label_target).mean().T
    data_barycenter = data_features.mean()

    for cluster in labels_clusters:
        B = B.append(np.sqrt(num_observation_for_specific_cluster[cluster]) *
        (data_centroids[cluster] - data_barycenter), ignore_index=True)
    
    scatter_matrix_between_group_BG = B.T.dot(B)

    score_between_group_dispersion = np.trace(scatter_matrix_between_group_BG)

    BGSS_red = score_between_group_dispersion / (num_clusters - 1)

    score_pooled_within_cluster_dispersion = np.sum([score_within_cluster_dispersion(cluster, data_clusters) for cluster in labels_clusters])
    
    num_observations = len(data_features)
    WGSS_red = score_pooled_within_cluster_dispersion / (num_observations - num_clusters)

    index = BGSS_red / WGSS_red

    return index

def main():
    start_timer = timeit.default_timer()   

    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    size = comm.Get_size()

    if rank == 0:
        data = pd.read_csv('/content/drive/MyDrive/Datasets/brooklyn_sales_map.csv', low_memory=False)
        data.drop(data.columns.difference(['XCoord','YCoord']), 1, inplace=True)
        df = data[data.XCoord.notnull()]
        coordinates = df.head(1000)

        mind = [0 for i in range(size)]

        for i in range(size - 1):
            n = int(coordinates.size / size)
            rndperm = np.random.permutation(coordinates.shape[0])
            send_buf = coordinates.iloc[rndperm[0:500], :].copy()
            
            comm.send(n, dest = i + 1, tag = 13)
            comm.send(send_buf, dest = i + 1, tag = 12)        
            mind[i + 1] = comm.recv(tag = 12, source = i + 1)
        
        max_val = 0
        max_index = 0
        
        for i in range(size - 1):
            if (mind[i + 1] > max_val):
                max_index = i + 1
                max_val = mind[i + 1]

        print('Max index: ', max_index)
        print('Max value: ',  max_val)

        for i in range(size - 1): 
            if (i + 1 == max_index):
                comm.send(1, dest = i + 1, tag = 11)
                centroids = comm.recv(source = i + 1, tag = 11)  
                print('Centroids: ', centroids) 
            else:
                comm.send(0, dest = i + 1, tag = 11)
                ans = comm.recv(source = i + 1, tag = 11)

        time = timeit.default_timer()-start_timer
        print('Running time: {:2.4f} sec'.format(time))

    else:
        num_data = comm.recv(source=0, tag = 13)
        coordinates = np.empty(num_data, dtype='f')
        coordinates = comm.recv(source=0, tag = 12)

        scaler = MinMaxScaler()
        df = coordinates.drop(['XCoord'], axis=1)
        df = scaler.fit_transform(df)
        
        kmeans, centroids = run_Kmeans(4, df)
        ch_index = calculate_index(coordinates)
        comm.send(ch_index, dest = 0, tag = 12)

        task = comm.recv(source = 0, tag = 11)
        if (task == 1):
            comm.send(centroids, dest = 0, tag = 11)
        else:
            comm.send(0, dest = 0, tag = 11)

main()  

Writing LB4Point2Point.py


In [4]:
! mpirun -n 8 --allow-run-as-root --oversubscribe python LB4Point2Point.py

Max index:  3
Max value:  100.60982064305024
Centroids:  [[0.        ]
 [0.87093231]
 [0.9420562 ]
 [0.75915894]]
Running time: 157.5249 sec
