In [None]:
%%file kmeans.py
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.datasets import fetch_20newsgroups
from scipy.sparse import vstack,csr_matrix
from nltk.stem import PorterStemmer
from scipy.spatial import distance
from collections import Counter
from mpi4py import MPI
import pandas as pd
import numpy as np
import scipy
import time

"""Read, Preprocess, TFIDF and Vectorizing of Data"""
def read_data():
    """Fetch the 20 Newsgroups training split, stem it and vectorize it.

    Headers, footers and quotes are stripped by the fetcher.  Each document
    is split on whitespace, every token is Porter-stemmed, and the stemmed
    text is turned into TF-IDF features with English stop words removed.

    Returns:
        A tuple ``(text_vector, text_df)``: the TF-IDF matrix fitted on the
        stemmed documents, and the DataFrame holding the raw ("Text") and
        stemmed ("stemmed") columns.
    """
    corpus = fetch_20newsgroups(subset="train", remove=('headers', 'footers', 'quotes'))
    text_df = pd.DataFrame({"Text": [str(doc) for doc in corpus.data]})

    porter = PorterStemmer()

    def stem_document(document):
        # Whitespace tokenization, stem each token, glue back with spaces.
        return " ".join(porter.stem(str(token)) for token in document.split())

    text_df['stemmed'] = text_df["Text"].apply(stem_document)

    tfidf = TfidfVectorizer(stop_words='english')
    features = tfidf.fit_transform(text_df.stemmed.dropna())
    return features, text_df

def read_datacsv():
    """Build TF-IDF features from the pre-stemmed corpus in ``Stemmed_Main.csv``.

    The CSV is read from the current working directory and must contain a
    ``stemmed`` column; rows where that column is missing are dropped before
    vectorizing, and English stop words are removed by the vectorizer.

    Returns:
        The TF-IDF matrix produced by ``TfidfVectorizer.fit_transform``.
    """
    stemmed_docs = pd.read_csv("Stemmed_Main.csv").stemmed.dropna()
    tfidf = TfidfVectorizer(stop_words='english')
    return tfidf.fit_transform(stemmed_docs)

"""Finding random initial centroids from the sparse data"""
def initial_centroid(text_vector, k):
    """Pick ``k`` random rows of ``text_vector`` as the starting centroids.

    Rows are drawn *without* replacement so that no two initial centroids are
    the same data point (identical centroids would leave one cluster
    permanently empty).  Only when more centroids are requested than there
    are rows does the draw fall back to sampling with replacement.

    Args:
        text_vector: Sparse matrix of shape (n_rows, n_features).
        k: Number of centroids to draw.

    Returns:
        Dense matrix of shape (k, n_features) holding the selected rows
        (whatever ``text_vector[...].todense()`` yields for the input type).
    """
    n_rows = text_vector.shape[0]
    # Distinct rows whenever possible; duplicates only if k exceeds n_rows.
    centroid_pos = np.random.choice(n_rows, size=k, replace=k > n_rows)
    # One fancy-index + one densify instead of k row-wise concatenations.
    return text_vector[centroid_pos, :].todense()

"""Finding the euclidean distance between two instances"""
def euclidean_distance(text_vector, centroid):
    """Assign every row of ``text_vector`` to its nearest centroid.

    Uses the expansion ``||x - c||^2 = ||x||^2 - 2 x.c + ||c||^2``.  The
    ``||x||^2`` term is the same for every centroid of a given row, so it is
    left out: the argmin is unchanged and the data matrix stays sparse.

    Args:
        text_vector: Data of shape (n_rows, n_features), sparse or dense.
        centroid: Centroids of shape (k, n_features); may be an ndarray, a
            ``np.matrix`` or a scipy sparse matrix.

    Returns:
        1-D integer array of length n_rows with the index of the closest
        centroid for each row (ties resolve to the lowest index).
    """
    if scipy.sparse.issparse(centroid):
        centroid = centroid.toarray()
    centroid = np.asarray(centroid, dtype=float)

    if scipy.sparse.issparse(text_vector):
        cross = text_vector.dot(centroid.T)
    else:
        cross = np.asarray(text_vector, dtype=float).dot(centroid.T)
    cross = np.asarray(cross)

    centroid_sq_norm = np.einsum("ij,ij->i", centroid, centroid)
    # Broadcasts (k,) over (n_rows, k): relative squared distance per centroid.
    relative_sq_dist = centroid_sq_norm - 2.0 * cross
    return np.argmin(relative_sq_dist, axis=1)


"""Finding local sum by the workers"""
def centroid_mean(cluster, text_vector):
    """Sum the rows of ``text_vector`` per cluster label.

    Despite the name this returns per-cluster *sums*, not means: it is the
    local partial result a worker reports for its slice of the data.

    Args:
        cluster: Cluster label of each row; any array-like that flattens to
            one label per row (1-D, or a column vector).
        text_vector: Data of shape (n_rows, n_features), sparse or dense.

    Returns:
        CSR matrix with one row per distinct label present in ``cluster``,
        ordered by ascending label, each row being the sum of that cluster's
        members.  Labels that do not occur produce no row.  An input with no
        rows yields a (0, n_features) matrix.

    Raises:
        ValueError: If the number of labels differs from the number of rows.
    """
    labels = np.asarray(cluster).ravel()
    rows = scipy.sparse.csr_matrix(text_vector)
    if labels.size != rows.shape[0]:
        raise ValueError(
            "cluster has %d labels but text_vector has %d rows"
            % (labels.size, rows.shape[0])
        )

    # Select each cluster's members with one fancy-index instead of slicing
    # the matrix row by row in Python for every cluster.
    sums = [
        scipy.sparse.csr_matrix(rows[np.flatnonzero(labels == label)].sum(axis=0))
        for label in np.unique(labels)
    ]
    if not sums:
        return scipy.sparse.csr_matrix((0, rows.shape[1]))
    return scipy.sparse.vstack(sums, format="csr")

"""Finding global mean by the master"""
def updated_centroid(cluster, text_vector):
    """Compute the mean of the rows of ``text_vector`` for every cluster.

    Each cluster's row sum is divided by that cluster's own member count.
    (Dividing the accumulated stack instead would rescale centroids that
    were already finished by the counts of later clusters.)

    Args:
        cluster: Cluster label of each row; any array-like that flattens to
            one label per row (1-D, or a column vector).
        text_vector: Data of shape (n_rows, n_features), sparse or dense.

    Returns:
        CSR matrix with one row per distinct label present in ``cluster``,
        ordered by ascending label, each row being that cluster's mean.
        Labels that do not occur produce no row.  An input with no rows
        yields a (0, n_features) matrix.

    Raises:
        ValueError: If the number of labels differs from the number of rows.
    """
    labels = np.asarray(cluster).ravel()
    rows = scipy.sparse.csr_matrix(text_vector)
    if labels.size != rows.shape[0]:
        raise ValueError(
            "cluster has %d labels but text_vector has %d rows"
            % (labels.size, rows.shape[0])
        )

    means = []
    for label in np.unique(labels):
        members = np.flatnonzero(labels == label)
        total = np.asarray(rows[members].sum(axis=0), dtype=float)
        # members is never empty here: label came from np.unique(labels).
        means.append(scipy.sparse.csr_matrix(total / members.size))

    if not means:
        return scipy.sparse.csr_matrix((0, rows.shape[1]))
    return scipy.sparse.vstack(means, format="csr")


"""Initializing communicators"""
# Module-level MPI handles shared by the driver below.
comm = MPI.COMM_WORLD      # communicator used for every send/recv in this script
rank = comm.Get_rank()     # this process's rank within comm (the driver treats 0 as master)
size = comm.Get_size()     # number of processes in comm


if __name__ == "__main__":
    # Master/worker k-means driver.
    #
    # Rank 0 (master) loads the data, hands every worker one contiguous slice
    # of rows once, then per epoch broadcasts the current centroids, gathers
    # the workers' cluster labels and recomputes the centroids.  Workers loop
    # until the master sends ``None`` in place of centroids, so they shut down
    # cleanly whether the run converges early or exhausts all epochs.
    #
    # Message tags: 1 = data slice, 2 = centroids (None = stop),
    #               3 = worker's per-cluster sums, 4 = worker's labels.
    k = 4
    cluster = []
    time_list = []
    starttime = time.time()

    if rank == 0:
        if size < 2:
            raise SystemExit(
                "kmeans.py needs at least one worker rank: run with mpiexec -n 2 or more"
            )

        # Only the master reads the corpus and draws the initial centroids.
        text_vector = read_datacsv()
        centroid = initial_centroid(text_vector, k)
        rows = text_vector.shape[0]
        workers = size - 1

        # Integer arithmetic so the slices tile [0, rows) exactly; float
        # rounding must not be able to drop the final row.
        for i in range(1, size):
            start = (i - 1) * rows // workers
            end = i * rows // workers
            comm.send(text_vector[start:end, :], dest=i, tag=1)

        old_cluster = None
        for epoch in range(10):
            # Send to every worker before waiting on any reply, so the
            # workers compute their assignments concurrently.
            for i in range(1, size):
                comm.send(centroid, dest=i, tag=2)

            # Rebuild the label list from scratch each epoch, in rank order,
            # which matches the row order of text_vector.
            cluster = []
            for i in range(1, size):
                # NOTE(review): the per-cluster sums are received to keep the
                # message protocol balanced but are not used; the centroids
                # are recomputed from the full data below.
                comm.recv(source=i, tag=3)
                cluster.extend(np.ravel(comm.recv(source=i, tag=4)).tolist())

            centroid = updated_centroid(cluster, text_vector)
            if cluster == old_cluster:
                print('Converged')
                break
            old_cluster = cluster

        # Release the workers, which are blocked waiting for new centroids.
        for i in range(1, size):
            comm.send(None, dest=i, tag=2)

    else:
        text_vector = comm.recv(source=0, tag=1)
        while True:
            centroid = comm.recv(source=0, tag=2)
            if centroid is None:
                break
            cluster = euclidean_distance(text_vector, centroid)
            centroid_sum = centroid_mean(cluster, text_vector)
            comm.send(centroid_sum, dest=0, tag=3)
            comm.send(cluster, dest=0, tag=4)

    time_list.append(time.time() - starttime)
    if rank == 0:
        print("Elapsed seconds:", time_list[-1])

In [None]:
!mpiexec -n 3 python kmeans.py

In [None]:
!mpiexec -n 3 python kmeans.py

In [None]:
# Speedup / efficiency table restricted to the first two process counts.
# P: process counts, Tp: measured parallel run times, Ts: serial run time.
P = [1, 2, 4, 6, 8]
Tp = [124.37, 118.68, 125.22, 130.57, 132.07]
Ts = 120.5

# speedup = Ts / Tp and efficiency = speedup / P, for the first two runs only.
speedup = []
efficiency = []
for procs, elapsed in zip(P[:2], Tp[:2]):
    ratio = Ts / elapsed
    speedup.append(ratio)
    efficiency.append(ratio / procs)

print("Process", "   Tp", "\t\t", "Speedup", "\t", "Efficiency")
# zip stops at the shortest input, so only two rows are printed.
for procs, elapsed, gain, eff in zip(P, Tp, speedup, efficiency):
    print(procs, "\t", elapsed, "\t", round(gain, 2), "\t\t", round(eff, 2))

In [None]:
# Plot speedup against the first two process counts.
# Expects `P` and `speedup` to already exist in the notebook namespace, with
# `speedup` holding one value per entry of P[:2].
import matplotlib.pyplot as plt
plt.xlabel("Number of Processes")
plt.ylabel("Speedup")
plt.title("Linear Speedup")
# Kept last: in a notebook the final expression's value is what gets displayed.
plt.plot(P[:2],speedup)


In [None]:
# Speedup / efficiency table over every measured process count.
# P: process counts, Tp: measured parallel run times, Ts: serial run time.
P = [1, 2, 4, 6, 8]
Tp = [124.37, 118.68, 125.22, 130.57, 132.07]
Ts = 120.5

# speedup = Ts / Tp; efficiency = speedup / P.
speedup = [Ts / parallel_time for parallel_time in Tp]
efficiency = [gain / procs for gain, procs in zip(speedup, P)]

header = ("Process", "   Tp", "\t\t", "Speedup", "\t", "Efficiency")
print(*header)
for row in zip(P, Tp, speedup, efficiency):
    procs, parallel_time, gain, eff = row
    print(procs, "\t", parallel_time, "\t", round(gain, 2), "\t\t", round(eff, 2))

In [None]:
# Plot speedup against every measured process count.
# Expects `P` and `speedup` to already exist in the notebook namespace and to
# have the same length.
import matplotlib.pyplot as plt
plt.plot(P,speedup)
plt.xlabel("Number of Processes")
plt.ylabel("Speedup")
plt.title("Sub-linear Speedup")