In [39]:
# Importing necessary libraries

from pyspark.sql import SparkSession
from pyspark.ml.feature import StandardScaler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml import Pipeline
from pyspark.sql.functions import when
from pyspark.sql.functions import lit
from pyspark.ml.feature import VectorAssembler,StringIndexer
from pyspark.ml import Pipeline
from pyspark.sql import Row

In [30]:
spark = SparkSession.builder.appName("AReM").getOrCreate()

# Read the feature data (X) from CSV
data = spark.read.csv("data.csv", header=True, inferSchema=True)

In [31]:
data.show()

+---------+---------+---------+---------+---------+---------+-------+
|avg_rss12|var_rss12|avg_rss13|var_rss13|avg_rss23|var_rss23|  label|
+---------+---------+---------+---------+---------+---------+-------+
|    36.75|     4.44|    13.33|     2.62|     12.5|      1.5|walking|
|     33.0|     4.95|     16.5|     4.56|    13.75|     2.59|walking|
|    31.75|     5.07|    16.75|     2.86|     17.5|     2.29|walking|
|    36.75|     4.92|    15.75|     2.28|      9.5|      4.5|walking|
|     34.5|     5.02|    14.75|     1.09|     18.5|     3.57|walking|
|     26.0|     3.24|     17.0|     3.08|    17.75|     1.79|walking|
|    40.75|      3.7|      9.5|     4.97|     12.0|     2.83|walking|
|    36.25|     0.43|    14.25|      3.7|    18.67|     2.62|walking|
|     35.0|     2.92|    13.67|     3.09|     11.0|      0.0|walking|
|    33.75|     8.26|    12.25|     1.92|    12.25|     6.94|walking|
|     31.0|     1.41|     12.0|      5.1|     19.0|     0.82|walking|
|     26.0|     3.67

## There are 2 classes, bending1 and bending2 which are similar, lets call them bending

In [36]:
data = data.withColumn("label", when((data["label"] == "bending1") | (data["label"] == "bending2"), "bending").otherwise(data["label"]))

In [37]:
train_data, test_data = data.randomSplit([0.7,0.3])

In [38]:
data.groupBy("label").count().show()

+--------+-----+
|   label|count|
+--------+-----+
| walking| 7200|
| cycling| 7198|
| bending| 5760|
|standing| 7200|
| sitting| 7199|
|   lying| 7200|
+--------+-----+



### We have 6 classes now.

## In activity recognition applications, correctly identifying both positive and negative instances is important. However, depending on the problem, the cost of false positives and false negatives may not be equal. For example, in medical diagnosis, a false negative (not detecting a disease when it's present) may be more costly than a false positive (detecting a disease when it's not present).

## In the case of activity recognition, it's important to minimize false negatives (i.e., failing to detect an activity that's actually being performed) in order to ensure accurate tracking of physical activity levels. Failing to detect certain activities could lead to inaccurate assessment of the user's overall physical activity, which could have negative impacts on healthcare decision-making and outcomes.

## Justification for Selecting F1 Score:
## The F1 score is the harmonic mean of precision and recall, which makes it a suitable choice for binary classification tasks, especially when dealing with imbalanced datasets. In the case of the Random Forest model, the F1 score is slightly higher than the other metrics, indicating a balanced trade-off between correctly identifying positive cases (precision) and capturing all positive cases (recall). This balance is important when the cost of false positives and false negatives is not significantly skewed

### **As per the project requirement, we will be running Decision Tree, Random Forest and Multilayer Perceptron and eventually evaluate the results**.

## Decision Tree

In [42]:
# Define feature columns (including the "label" column)
feature_columns = data.columns

# Exclude the "label" column
feature_columns.remove("label")

# Create a VectorAssembler
vector_assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")

# Create a StringIndexer for label indexing
label_indexer = StringIndexer(inputCol="label", outputCol="indexed_label")

# Create a Decision Tree classifier
decision_tree = DecisionTreeClassifier(featuresCol="features", labelCol="indexed_label")

# Create a Pipeline
pipeline = Pipeline(stages=[vector_assembler, label_indexer, decision_tree])

# Fit the pipeline on the training data (assuming data is already split)
model = pipeline.fit(train_data)

# Make predictions on the test data
predictions = model.transform(test_data)

# Initialize the MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(labelCol="indexed_label", predictionCol="prediction")

# Calculate accuracy, F1 score, recall, and precision
accuracy = evaluator.evaluate(predictions, {evaluator.metricName: "accuracy"})
f1 = evaluator.evaluate(predictions, {evaluator.metricName: "f1"})
recall = evaluator.evaluate(predictions, {evaluator.metricName: "weightedRecall"})
precision = evaluator.evaluate(predictions, {evaluator.metricName: "weightedPrecision"})

# Create a new DataFrame with model name and evaluation metrics
results = spark.createDataFrame([Row(ModelName="DecisionTree", Accuracy=accuracy, F1=f1, Recall=recall, Precision=precision)])

# Show the results
results.show()

                                                                                

+------------+------------------+------------------+------------------+------------------+
|   ModelName|          Accuracy|                F1|            Recall|         Precision|
+------------+------------------+------------------+------------------+------------------+
|DecisionTree|0.6414507772020726|0.6375003350188141|0.6414507772020726|0.6485930520980879|
+------------+------------------+------------------+------------------+------------------+



## Random Forest

In [44]:
from pyspark.ml.classification import RandomForestClassifier

# Create a RandomForest classifier
random_forest = RandomForestClassifier(featuresCol="features", labelCol="indexed_label")

# Create a Pipeline for the RandomForest model
rf_pipeline = Pipeline(stages=[vector_assembler, label_indexer, random_forest])

# Fit the RandomForest model on the training data
rf_model = rf_pipeline.fit(train_data)

# Make predictions with the RandomForest model on the test data
rf_predictions = rf_model.transform(test_data)

# Initialize the MulticlassClassificationEvaluator for the RandomForest model
rf_evaluator = MulticlassClassificationEvaluator(labelCol="indexed_label", predictionCol="prediction")

# Calculate accuracy, F1 score, recall, and precision for the RandomForest model
rf_accuracy = rf_evaluator.evaluate(rf_predictions, {rf_evaluator.metricName: "accuracy"})
rf_f1 = rf_evaluator.evaluate(rf_predictions, {rf_evaluator.metricName: "f1"})
rf_recall = rf_evaluator.evaluate(rf_predictions, {rf_evaluator.metricName: "weightedRecall"})
rf_precision = rf_evaluator.evaluate(rf_predictions, {rf_evaluator.metricName: "weightedPrecision"})

# Create a new DataFrame with RandomForest model results
rf_results = spark.createDataFrame([Row(ModelName="RandomForest", Accuracy=rf_accuracy, F1=rf_f1, Recall=rf_recall, Precision=rf_precision)])

# Union the DataFrames to combine results
all_results = results.union(rf_results)

# Show the combined results
all_results.show()


+------------+------------------+------------------+------------------+------------------+
|   ModelName|          Accuracy|                F1|            Recall|         Precision|
+------------+------------------+------------------+------------------+------------------+
|DecisionTree|0.6414507772020726|0.6375003350188141|0.6414507772020726|0.6485930520980879|
|RandomForest|0.6761259465922679|0.6716172617717742|0.6761259465922679| 0.671193639541631|
+------------+------------------+------------------+------------------+------------------+



## Multilayer Perceptron

In [46]:
from pyspark.ml.classification import MultilayerPerceptronClassifier

# Create a Multilayer Perceptron classifier
layers = [len(feature_columns), 10, 5, len(data.select("label").distinct().collect())]
mlp = MultilayerPerceptronClassifier(featuresCol="features", labelCol="indexed_label", layers=layers, seed=1234)

# Create a Pipeline for the MLP model
mlp_pipeline = Pipeline(stages=[vector_assembler, label_indexer, mlp])

# Fit the MLP model on the training data
mlp_model = mlp_pipeline.fit(train_data)

# Make predictions with the MLP model on the test data
mlp_predictions = mlp_model.transform(test_data)

# Initialize the MulticlassClassificationEvaluator for the MLP model
mlp_evaluator = MulticlassClassificationEvaluator(labelCol="indexed_label", predictionCol="prediction")

# Calculate accuracy, F1 score, recall, and precision for the MLP model
mlp_accuracy = mlp_evaluator.evaluate(mlp_predictions, {mlp_evaluator.metricName: "accuracy"})
mlp_f1 = mlp_evaluator.evaluate(mlp_predictions, {mlp_evaluator.metricName: "f1"})
mlp_recall = mlp_evaluator.evaluate(mlp_predictions, {mlp_evaluator.metricName: "weightedRecall"})
mlp_precision = mlp_evaluator.evaluate(mlp_predictions, {mlp_evaluator.metricName: "weightedPrecision"})

# Create a new DataFrame with MLP model results
mlp_results = spark.createDataFrame([Row(ModelName="MLP", Accuracy=mlp_accuracy, F1=mlp_f1, Recall=mlp_recall, Precision=mlp_precision)])

# Union the DataFrames to combine all results
all_results = all_results.union(mlp_results)

# Show the combined results
all_results.show()


+------------+------------------+------------------+------------------+------------------+
|   ModelName|          Accuracy|                F1|            Recall|         Precision|
+------------+------------------+------------------+------------------+------------------+
|DecisionTree|0.6414507772020726|0.6375003350188141|0.6414507772020726|0.6485930520980879|
|RandomForest|0.6761259465922679|0.6716172617717742|0.6761259465922679| 0.671193639541631|
|         MLP|0.6076524511757673|0.5797847862621367|0.6076524511757673|0.6060020664164829|
+------------+------------------+------------------+------------------+------------------+



Let's analyze the performance of each model:

**Decision Tree**:

Accuracy: 64.14%
F1 Score: 63.75%
Recall: 64.14%
Precision: 64.86%

**Performance Analysis**: The Decision Tree model performs reasonably well with an accuracy of 64.14%. It has a good balance between precision and recall, as indicated by the F1 score of 63.75%. It is a suitable choice if you prefer a straightforward model with decent performance.

**Random Forest**:

Accuracy: 67.61%
F1 Score: 67.16%
Recall: 67.61%
Precision: 67.12%

**Performance Analysis**: The Random Forest model outperforms the Decision Tree with an accuracy of 67.61% and an F1 score of 67.16%. The F1 score is an excellent metric for this model because it balances precision and recall, making it suitable for binary classification tasks with imbalanced data. The Random Forest's ability to handle complex relationships in the data and reduce overfitting makes it a robust choice.

**Multilayer Perceptron (MLP)**:

Accuracy: 60.77%
F1 Score: 57.98%
Recall: 60.77%
Precision: 60.60%

**Performance Analysis**: The MLP model has the lowest accuracy and F1 score among the three models, indicating that it may not be the best choice for this particular dataset. It shows relatively lower performance in terms of precision and recall, suggesting that it might struggle to capture the underlying patterns in the data.


Additionally, the Random Forest model often performs well on imbalanced datasets, as it can handle class imbalances and reduce overfitting. Therefore, the F1 score is a robust choice for evaluating and selecting the Random Forest model as the best model for this specific problem.

In summary, the **Random Forest** model with an F1 score of 67.16% is selected as the best model for this classification task due to its balanced performance and ability to handle imbalanced data.