In [3]:
from pyspark.sql import SparkSession
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import VectorAssembler
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.sql.functions import col
import numpy as np
import os


In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Downloading pyspark-3.4.0.tar.gz (310.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.8/310.8 MB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.4.0-py2.py3-none-any.whl size=311317145 sha256=5dfc17760514fd5574e7b1e2ce93530e15520c26292428400e6b48807c0cddac
  Stored in directory: /root/.cache/pip/wheels/7b/1b/4b/3363a1d04368e7ff0d408e57ff57966fcdf00583774e761327
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.4.0


In [5]:
os.environ["SPARK_HOME"] = "/usr/local/lib/python3.10/dist-packages/pyspark"
os.environ["HADOOP_HOME"] = "C:/winutils"

# Creating spark session
spark = SparkSession.builder.appName("Crime Analytics").getOrCreate()
spark.sparkContext.setLogLevel("ERROR")

# Loading the dataset
KCDPFinal = spark.read.format("csv").option("header", True).option("inferSchema", True).option("delimiter", ",").load("/content/drive/MyDrive/DBMS_project_final/Source Code/1-Datasets/KCPD_Crime_Data/KCcrimeForAnalytics.csv").withColumnRenamed("Firearm_Used_Flag", "label")
KCDPFinal

# Create vector assembler for feature columns
VAssembler = VectorAssembler(inputCols=KCDPFinal.columns[1:19], outputCol="features")
KCDPFinal = VAssembler.transform(KCDPFinal)

# Split the crime dataset into training and testing data sets
trainingData, testingData = KCDPFinal.select("label", "features").randomSplit([0.7, 0.3])

# Using the training set for the model traning
from pyspark.ml.classification import DecisionTreeClassifier
DecisionTreeModel = DecisionTreeClassifier()
model = DecisionTreeModel.fit(trainingData)

# Generate prediction from test dataset
CrimepredKC = model.transform(testingData)

# Evuluate the accuracy of the model
evaluator = MulticlassClassificationEvaluator()
accuracy = evaluator.evaluate(CrimepredKC)

# Show model accuracy
print("Accuracy:", accuracy)

Accuracy: 0.9696151370503945
