In [None]:
# Importing necessary modules.
!pip install pyspark
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StringIndexer
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
import os
# Print the full path of every file found under the Kaggle input directory,
# walking all nested sub-directories.
INPUT_DIR = '/kaggle/input'
for dirname, _, filenames in os.walk(INPUT_DIR):
    for filename in filenames:
        print(os.path.join(dirname, filename))


In [None]:
# Load the stolen-vehicle CSV into a pandas DataFrame.
# header=None tells pandas the file has no header row: the first line is read
# as data and the columns are labelled with integers (0, 1, 2, ...).
dataset_path = "/kaggle/input/vehicle-stolen-dataset/vehicle_stolen_dataset.csv"
vehicle_data = pd.read_csv(dataset_path, header=None)

# Preview the first rows as the cell output.
vehicle_data.head()

In [None]:
# Start a Spark session (or reuse one that already exists) with the master set
# to "local[*]", i.e. Spark runs inside this machine rather than on a cluster.
spark_builder = SparkSession.builder.master("local[*]")
spark = spark_builder.getOrCreate()

In [None]:
# Convert the pandas DataFrame into a Spark DataFrame and print its first 5 rows.
vehicle_df = spark.createDataFrame(data=vehicle_data)
vehicle_df.show(n=5)

In [None]:
# Give the first five columns meaningful names. The CSV was read without a
# header, so at this point they are named "0".."4".
#
# The rename is done by position (select the leading columns, then toDF)
# instead of by the old names, so the cell can be re-run safely: once the
# columns have been renamed, col("0") no longer resolves, whereas a positional
# rename simply re-applies the same names.
#
# NOTE: "stoled" (sic) is kept as-is because later cells reference that name.
renamed_columns = ["number_plate", "brand", "color", "time", "stoled"]
vehicle_df = (
    vehicle_df
    .select(*vehicle_df.columns[:len(renamed_columns)])
    .toDF(*renamed_columns)
)

In [None]:
# Build one StringIndexer per string column; each maps the column's string
# values to numeric indices in a new output column. The target column
# "stoled" is indexed into "label".
index_columns = [
    ("brand", "brand_index"),
    ("color", "color_index"),
    ("time", "time_index"),
    ("stoled", "label"),
]
indexers = [
    StringIndexer(inputCol=source, outputCol=target)
    for source, target in index_columns
]

In [None]:
# Chain the indexers into a single pipeline, fit it on the whole dataset, and
# use the fitted pipeline to add the index columns to that same dataset.
pipeline = Pipeline(stages=indexers)
indexing_model = pipeline.fit(vehicle_df)
indexed_vehicle_df = indexing_model.transform(vehicle_df)

In [None]:
# Print the first 5 rows; truncate=False turns off the default shortening of
# long cell values so every column is shown in full.
indexed_vehicle_df.show(n=5, truncate=False)

In [None]:
# Combine the three indexed predictor columns into one vector column named
# "features", then print the first 5 rows without truncation.
feature_columns = ["brand_index", "color_index", "time_index"]
vectorAssembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
vindexed_vehicle_df = vectorAssembler.transform(indexed_vehicle_df)
vindexed_vehicle_df.show(n=5, truncate=False)


In [None]:
# Randomly split the rows into training and test sets with weights 0.6 / 0.4.
# The fixed seed (42) is passed so the sampling is seeded rather than random
# on every run.
splits = vindexed_vehicle_df.randomSplit([0.6, 0.4], seed=42)
train_df, test_df = splits

In [None]:
# Configure a multinomial Naive Bayes classifier. The feature and label column
# names are spelled out; they are the estimator's defaults and match the
# columns produced by the VectorAssembler ("features") and the target
# StringIndexer ("label").
# NOTE(review): the feature vector holds StringIndexer category indices, while
# multinomial Naive Bayes treats feature values as counts - confirm this
# modelling choice is intended.
nb = NaiveBayes(
    featuresCol="features",
    labelCol="label",
    modelType="multinomial",
)

In [None]:
# Fit the Naive Bayes estimator on the training split to obtain the model.
nbmodel = nb.fit(dataset=train_df)

In [None]:
# Run the trained model over the test split, then print the first 5 rows of
# the result with the default truncation of long values.
predictions_df = nbmodel.transform(dataset=test_df)
predictions_df.show(n=5, truncate=True)

In [None]:
# Measure accuracy on the test set by comparing the "prediction" column with
# the "label" column, and print the result.
evaluator = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="accuracy",
)
nbaccuracy = evaluator.evaluate(dataset=predictions_df)
print(f"Test accuracy = {nbaccuracy}")