<a href="https://colab.research.google.com/github/saleh-imran/BigDataProcessing/blob/main/PySpak_on_KNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Load Data

In [1]:

# Import the necessary modules from the scikit-learn library
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

# Load the iris dataset that ships with scikit-learn
iris = load_iris()

# Hold out 20% of the samples for testing and train on the remaining 80%.
# Fixing random_state seeds the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    iris.data,
    iris.target,
    test_size=0.2,
    random_state=42,
)



Import the k-NN classifier from scikit-learn and fit it to the training data:

In [2]:

# Import the KNeighborsClassifier class from scikit-learn
from sklearn.neighbors import KNeighborsClassifier

# Number of neighbors, passed to the classifier as n_neighbors
k = 3

# Instantiate a KNeighborsClassifier configured with k neighbors
knn = KNeighborsClassifier(n_neighbors=k)

# Fit the model on the training split (features X_train, labels y_train)
knn.fit(X_train, y_train)


Make Predictions

In [4]:

# Predict a label for every sample in the held-out test set
# using the k-NN classifier fitted in the previous cell
y_pred = knn.predict(X_test)


Evaluate Performance

In [11]:

# Import the accuracy_score function from scikit-learn
from sklearn.metrics import accuracy_score

# Calculate the accuracy of the model on the test data
# by comparing the predicted labels (y_pred) to the actual labels (y_test)
accuracy = accuracy_score(y_test, y_pred)

print(accuracy)

1.0


**PySpark for KNN**

In [49]:
pip install pyspark



In [56]:
# Import necessary libraries
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.functions import vector_to_array
from pyspark.sql import Row
from pyspark.sql import SparkSession
from pyspark.sql.functions import broadcast
from pyspark.sql.functions import col
from pyspark.sql.functions import udf
from pyspark.sql.types import DoubleType


In [16]:
# Create the Spark session for this notebook, or reuse one that already exists
spark = (
    SparkSession.builder
    .appName("KNN for Iris Data")
    .getOrCreate()
)

In [17]:
# Read the iris CSV: the first row holds the column names and the column
# types are inferred from the values.
data = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("/content/iris.csv")
)


In [21]:
from pyspark.ml.feature import StringIndexer

# Show the inferred column names and types
data.printSchema()

# Replace missing values with 0; a numeric fill value only affects
# numeric columns, so the string label column is left untouched.
data = data.na.fill(0)

# Encode the class label as a numeric "label" column.
# The label column in this CSV is named "variety" (see the printed schema);
# the previous inputCol="species" does not exist in the data and would make
# indexer.fit(data) fail.
indexer = StringIndexer(inputCol="variety", outputCol="label")


root
 |-- sepal.length: double (nullable = false)
 |-- sepal.width: double (nullable = false)
 |-- petal.length: double (nullable = false)
 |-- petal.width: double (nullable = false)
 |-- variety: string (nullable = true)



In [29]:
# Prepare Data
# Use every numeric column as a feature. Selecting by type (instead of
# "all columns but the last") keeps the string label out of the feature
# vector even when columns are added or reordered. "label" is skipped because
# it is the output column configured for the StringIndexer, i.e. an encoded
# target rather than a feature.
numeric_types = {"int", "bigint", "float", "double"}
feature_cols = [
    name for name, dtype in data.dtypes
    if dtype in numeric_types and name != "label"
]
vector_assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")

# Keep the cell re-runnable: VectorAssembler refuses to write to an output
# column that already exists, so only transform when "features" is missing.
if "features" not in data.columns:
    data = vector_assembler.transform(data)

In [24]:
# Randomly split the rows into training and test sets with weights 0.8 / 0.2.
# The fixed seed makes the random assignment repeatable.
split_weights = [0.8, 0.2]
train_data, test_data = data.randomSplit(split_weights, seed=1234)


In [52]:
def euclidean_distance(arr1, arr2):
    """Return the Euclidean distance between two sequences of numbers.

    Elements are paired positionally with ``zip``, so if the inputs differ in
    length the extra trailing elements of the longer one are ignored.

    Args:
        arr1: Iterable of numeric coordinates.
        arr2: Iterable of numeric coordinates.

    Returns:
        The square root of the summed squared differences, as a ``float``.
    """
    squared_total = 0
    for left, right in zip(arr1, arr2):
        squared_total += (left - right) ** 2
    return float(squared_total) ** 0.5

# Wrap euclidean_distance as a Spark UDF with a declared DoubleType return type
distance_udf = udf(euclidean_distance, DoubleType())

In [53]:
# Build KNN model (UDF-based distance computation)
def knn_predict(test_point, k, reference_df=None, label_col="variety"):
    """Predict the class of ``test_point`` by majority vote of its k nearest rows.

    The distance from every reference row to ``test_point`` is computed with a
    Python UDF built on ``euclidean_distance``.

    Note: a later cell in this notebook defines another ``knn_predict``; once
    that cell has run, it replaces this definition.

    Args:
        test_point: Sequence of numeric feature values, in the same order as
            the entries of the ``features`` vector column.
        k: Number of nearest rows that take part in the vote; must be >= 1.
        reference_df: DataFrame holding a ``features`` column and the
            ``label_col`` column. Defaults to the notebook's ``train_data``.
        label_col: Name of the class-label column. Defaults to ``"variety"``,
            the label column shown in the notebook's printed schema.

    Returns:
        The ``label_col`` value that occurs most often among the k nearest
        rows. Ties are resolved in whatever order Spark returns them.

    Raises:
        ValueError: If ``k`` is smaller than 1 or ``reference_df`` has no rows.
    """
    if k < 1:
        raise ValueError("k must be a positive integer")
    if reference_df is None:
        # The notebook never defines `iris_data`; the training split is the
        # reference set that exists after the split cell has run.
        reference_df = train_data

    # Plain Python floats pickle cleanly into the UDF closure.
    query = [float(value) for value in test_point]

    # A UDF call accepts only columns (or column names) as arguments, so the
    # query point is bound through a closure instead of being passed directly.
    point_distance_udf = udf(
        lambda features: euclidean_distance(features, query),
        DoubleType(),
    )

    distances = reference_df.select(
        label_col,
        'features',
        point_distance_udf('features').alias('distance'),
    )
    k_nearest_neighbors = distances.orderBy('distance').limit(k)
    label_counts = k_nearest_neighbors.groupBy(label_col).count()

    top_row = label_counts.orderBy('count', ascending=False).first()
    if top_row is None:
        raise ValueError("reference_df is empty; cannot make a prediction")
    return top_row[label_col]

In [64]:
def knn_predict(test_point, k, reference_df=None, label_col="variety"):
    """Predict the class of ``test_point`` by majority vote of its k nearest rows.

    The Euclidean distance is built from native Spark column expressions (no
    Python UDF). This definition shadows the earlier ``knn_predict`` cell.

    Args:
        test_point: Non-empty sequence of numeric feature values, in the same
            order as the entries of the ``features`` vector column.
        k: Number of nearest rows that take part in the vote; must be >= 1.
        reference_df: DataFrame holding a vector ``features`` column and the
            ``label_col`` column. Defaults to the notebook's ``train_data``.
        label_col: Name of the class-label column. Defaults to ``"variety"``,
            the label column shown in the notebook's printed schema.

    Returns:
        The ``label_col`` value that occurs most often among the k nearest
        rows. Ties are resolved in whatever order Spark returns them.

    Raises:
        ValueError: If ``k`` is smaller than 1, ``test_point`` is empty, or
            ``reference_df`` has no rows.
    """
    if k < 1:
        raise ValueError("k must be a positive integer")

    # Convert to plain floats so numpy scalars (e.g. from a DenseVector) are
    # not handed to Spark column arithmetic.
    query = [float(value) for value in test_point]
    if not query:
        raise ValueError("test_point must contain at least one coordinate")

    if reference_df is None:
        # The notebook never defines `iris_data`; the training split is the
        # reference set that exists after the split cell has run.
        reference_df = train_data

    # A VectorAssembler output column cannot be indexed with [i] directly;
    # convert it to an array column first.
    feature_array = vector_to_array(col('features'))

    # Built-in sum() starts from 0 and adds Column expressions, yielding a
    # single Column holding the squared distance.
    squared_distance = sum(
        (feature_array[i] - coordinate) ** 2
        for i, coordinate in enumerate(query)
    )
    distance_col = 'distance'
    with_distance = reference_df.withColumn(distance_col, squared_distance ** 0.5)

    # Keep the k closest rows and count how often each label occurs among them
    label_counts = (
        with_distance
        .orderBy(distance_col)
        .limit(k)
        .groupBy(label_col)
        .count()
    )

    top_row = label_counts.orderBy(col('count').desc()).select(label_col).first()
    if top_row is None:
        raise ValueError("reference_df is empty; cannot make a prediction")
    return top_row[0]