In [2]:
# Decision Tree Model for Tree Health Prediction

#In this notebook, we will build a decision tree model to predict the health status of trees based on various features. We will use PySpark library to handle the data processing modeling.

## Data Loading and Preprocessing

In [3]:
# Import necessary libraries
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [4]:
# Create a SparkSession
spark = SparkSession.builder.appName("DecisionTree").getOrCreate()

In [6]:
# Load the data
df = spark.read.csv("TreeData.csv", header=True, inferSchema=True)

In [7]:
# Data Preprocessing
categorical_cols = ['curb_loc', 'steward', 'guards', 'sidewalk', 'root_stone', 'root_grate', 'root_other', 'trunk_wire', 'trnk_light', 'trnk_other', 'brch_light', 'brch_shoe', 'brch_other', 'spc_common']
numeric_cols = ['tree_dbh']

In [8]:
# String Indexing for categorical features
string_indexers = [StringIndexer(inputCol=col, outputCol=col+"_indexed", handleInvalid="keep") for col in categorical_cols]

In [9]:
# Vector Assembler
assembler = VectorAssembler(inputCols=numeric_cols + [col+"_indexed" for col in categorical_cols], outputCol="features")

In [10]:
# Label Indexing
label_indexer = StringIndexer(inputCol="health", outputCol="label", handleInvalid="keep")

In [11]:
# Pipeline
pipeline = Pipeline(stages=string_indexers + [assembler, label_indexer])
piped_data = pipeline.fit(df).transform(df)

In [12]:
## Train-Test Split and Model Training

In [13]:
# Split data into training and test sets
(training_data, test_data) = piped_data.randomSplit([0.7, 0.3], seed=42)

In [14]:
# Create a DecisionTree model
dt = DecisionTreeClassifier(labelCol="label", featuresCol="features", maxDepth=5, minInstancesPerNode=10, minInfoGain=0.001)

In [16]:
# Train the model
model = dt.fit(training_data)

IllegalArgumentException: requirement failed: DecisionTree requires maxBins (= 32) to be at least as large as the number of values in each categorical feature, but categorical feature 14 has 133 values. Consider removing this and other categorical features with a large number of values, or add more training examples.

In [17]:
dt = DecisionTreeClassifier(labelCol="label", featuresCol="features", maxDepth=5, minInstancesPerNode=10, minInfoGain=0.001, maxBins=200)

In [18]:
# Train the model
model = dt.fit(training_data)

In [19]:
## Model Evaluation

In [20]:
# Make predictions
predictions = model.transform(test_data)

In [21]:
# Evaluate the model
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Accuracy = %g" % accuracy)

Accuracy = 0.82096


In [22]:
## Clean Up

In [23]:
# Stop the SparkSession
spark.stop()