In [1]:
import findspark
findspark.init()
from pyspark.sql import SparkSession
from pyspark.ml.feature import PCA
from pyspark.ml.clustering import KMeans
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neighbors import NearestNeighbors

In [2]:
# read training files and test files, initialize spark and some parameters
spark = SparkSession \
    .builder \
    .appName('Cloud Computing Assignment2') \
    .getOrCreate()

train_datafile = 'hdfs://soit-hdp-pro-1.ucc.usyd.edu.au/share/MNIST/Train-28x28.csv'
train_labelfile= 'hdfs://soit-hdp-pro-1.ucc.usyd.edu.au/share/MNIST/Train-label.csv'
test_datafile = 'hdfs://soit-hdp-pro-1.ucc.usyd.edu.au/share/MNIST/Test-28x28.csv'
test_labelfile= 'hdfs://soit-hdp-pro-1.ucc.usyd.edu.au/share/MNIST/Test-label.csv'
num_train_samples = 60000
num_test_samples = 10000

train_data = spark.read.csv(train_datafile,header=False,inferSchema="true")
train_label = spark.read.csv(train_labelfile,header=False,inferSchema="true")
test_data = spark.read.csv(test_datafile,header=False,inferSchema="true")
test_label = spark.read.csv(test_labelfile,header=False,inferSchema="true")

In [3]:
# convert the data files into vectors and the label files into nparray
assembler1 = VectorAssembler(inputCols=train_data.columns,outputCol="train_features")
train_vectors = assembler1.transform(train_data).select("train_features")

assembler2 = VectorAssembler(inputCols=train_label.columns,outputCol="train_labels")
train_label_vectors = assembler2.transform(train_label).select("train_labels")
train_label_array = np.array(train_label_vectors.collect())
train_label_array=train_label_array.reshape(num_train_samples)

assembler3 = VectorAssembler(inputCols=test_data.columns,outputCol="test_features")
test_vectors = assembler3.transform(test_data).select("test_features")

assembler4 = VectorAssembler(inputCols=test_label.columns,outputCol="test_labels")
test_label_vectors = assembler4.transform(test_label).select("test_labels")
test_label_array = np.array(test_label_vectors.collect())
test_label_array=test_label_array.reshape(num_test_samples)

In [145]:
pca = PCA(k = 84, inputCol="train_features", outputCol="train_pca")
trainmodel = pca.fit(train_vectors)
train_pca_result = trainmodel.transform(train_vectors).select('train_pca')

pca2 = PCA(k = 84, inputCol="test_features", outputCol="test_pca")
testmodel = pca2.fit(test_vectors)
test_pca_result = testmodel.transform(test_vectors).select('test_pca')

In [146]:
train_pca_array = np.array(train_pca_result.collect())
train_pca_array=train_pca_array.reshape(num_train_samples,84)

test_pca_array = np.array(test_pca_result.collect())
test_pca_array=test_pca_array.reshape(num_test_samples,84)

In [150]:
nbrs = NearestNeighbors(n_neighbors=49, algorithm='auto').fit(train_pca_array)
distances, indices = nbrs.kneighbors(test_pca_array)

In [151]:
test_result = []
for i in indices:
    result_labels = {}
    for j in i:
        if train_label_array[j] in result_labels.keys():
            result_labels[train_label_array[j]] += 1
        else:
            result_labels[train_label_array[j]] = 1
    test_result.append(max(result_labels, key=result_labels.get))
test_result = np.array(test_result)
test_result[:10]

array([7., 0., 1., 0., 0., 1., 0., 2., 5., 7.])

In [152]:
correct_count = 0
for i in np.arange(num_test_samples):
    if test_result[i]==test_label_array[i]:
        correct_count += 1
acc = correct_count/10000
acc

0.2356

In [None]:
spark.stop()