In [None]:
import pandas as pd
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.mllib.classification import SVMWithSGD
from pyspark.mllib.regression import LabeledPoint
from pyspark.sql import SparkSession
from sklearn.preprocessing import LabelEncoder

In [None]:
# Reuse the active Spark session if one exists, otherwise start one named "SVM".
spark = (
    SparkSession.builder
    .appName("SVM")
    .getOrCreate()
)

In [None]:
# Load the raw CSV, drop rows with missing values, cast AREA to int and
# replace the string class names with integer codes.
# `lc` stays at notebook scope so later cells can still reach the fitted encoder.
lc = LabelEncoder()
df = (
    pd.read_csv("Date_Fruit_Datasets.csv")
    .dropna()
    .assign(
        AREA=lambda frame: frame["AREA"].astype(int),
        Class=lambda frame: lc.fit_transform(frame["Class"]),
    )
)
# Preview the first rows only instead of rendering the whole frame.
df.head()

In [None]:
# Hand the cleaned pandas frame over to Spark.
data1 = spark.createDataFrame(data=df)

In [None]:
# Bare expression: the notebook renders `data1` as this cell's output.
data1

In [None]:
# Names of the first 34 columns of the pandas frame, used below as the feature columns.
cl = df.columns[:34].tolist()

In [None]:
# Packs the selected feature columns into a single vector column named "features".
assembler = VectorAssembler(
    outputCol="features",
    inputCols=cl,
)

In [None]:
# Build the feature vector, then keep only the vector and the label column.
d = assembler.transform(data1).select("features", "Class")
d.show()

In [None]:
# Random train/test split with weights 0.8 / 0.2 and a fixed seed.
train, test = d.randomSplit(weights=[0.8, 0.2], seed=42)
train.show(n=5)

In [None]:
# Collect each split onto the driver and wrap every row as an MLlib LabeledPoint
# (label = "Class", features = the assembled vector converted to plain floats).
label_train = [
    LabeledPoint(record["Class"], list(map(float, record["features"])))
    for record in train.collect()
]
label_test = [
    LabeledPoint(record["Class"], list(map(float, record["features"])))
    for record in test.collect()
]

In [None]:
# Distribute the driver-side LabeledPoint lists as RDDs (train first, then test).
rdd_train, rdd_test = (
    spark.sparkContext.parallelize(points)
    for points in (label_train, label_test)
)


In [None]:
# `train` is a classmethod; the instance only keeps the name `svm` defined.
svm = SVMWithSGD()

# MLlib's SVMWithSGD is a binary classifier: per the MLlib docs, training
# validates that every label is 0 or 1 and aborts with "Input validation failed"
# otherwise. The labels here are LabelEncoder codes, so there may be more than
# two classes; check before training.
class_labels = sorted(rdd_train.map(lambda point: point.label).distinct().collect())

if set(class_labels) <= {0.0, 1.0}:
    # Binary labels: train a single model, exactly as before.
    svm_model = svm.train(rdd_train)
    predictions = rdd_test.map(
        lambda point: (float(svm_model.predict(point.features)), point.label)
    )
else:
    # More than two classes: one-vs-rest. Train one binary SVM per class and
    # predict the class whose model gives the largest raw margin.
    def _relabel(points, positive_label):
        """Return `points` with label 1.0 for `positive_label` and 0.0 for every other class."""
        return points.map(
            lambda point: LabeledPoint(
                1.0 if point.label == positive_label else 0.0, point.features
            )
        )

    # In this branch svm_model maps each class label to its binary SVMModel.
    svm_model = {}
    for class_label in class_labels:
        binary_model = svm.train(_relabel(rdd_train, class_label))
        # Without a threshold predict() returns the raw margin instead of 0/1,
        # which is what makes the per-class scores comparable.
        binary_model.clearThreshold()
        svm_model[class_label] = binary_model

    predictions = rdd_test.map(
        lambda point: (
            float(max(svm_model, key=lambda label: svm_model[label].predict(point.features))),
            point.label,
        )
    )

predictions_df = spark.createDataFrame(predictions, ["prediction", "label"])

In [None]:
# Score the (prediction, label) pairs by accuracy.
evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    predictionCol="prediction",
    labelCol="label",
)
accuracy = evaluator.evaluate(predictions_df)

print("Accuracy:", accuracy)
