# Cancer Diagnosis using Machine Learning (PySpark)

In [21]:
# Imports
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.ml.classification import RandomForestClassifier, LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml import Pipeline
from pyspark.sql.functions import col

In [22]:
# Initialize SparkContext
sc = SparkContext("local", "CancerDiagnosis")

# Initialize Spark session using SparkContext
spark = SparkSession(sc)

### Getting the data ready

Import the cancer data.

In [23]:
cancer_data = spark.read.csv("../data/cancer_data.csv", header=True, inferSchema=True)
cancer_data.limit(5).toPandas()

Unnamed: 0,id,diagnosis,Radius_mean,Texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,842517,M,20.57,21.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


Check if there are any missing values.

In [24]:
# Get the sum of all missing values
{col: cancer_data.filter(cancer_data[col].isNull()).count() for col in cancer_data.columns}

{'id': 0,
 'diagnosis': 0,
 'Radius_mean': 0,
 'Texture_mean': 0,
 'perimeter_mean': 0,
 'area_mean': 0,
 'smoothness_mean': 0,
 'compactness_mean': 0,
 'concavity_mean': 0,
 'concave points_mean': 0,
 'symmetry_mean': 0,
 'fractal_dimension_mean': 0,
 'radius_se': 0,
 'texture_se': 0,
 'perimeter_se': 0,
 'area_se': 0,
 'smoothness_se': 0,
 'compactness_se': 0,
 'concavity_se': 0,
 'concave points_se': 0,
 'symmetry_se': 0,
 'fractal_dimension_se': 0,
 'radius_worst': 0,
 'texture_worst': 0,
 'perimeter_worst': 0,
 'area_worst': 0,
 'smoothness_worst': 0,
 'compactness_worst': 0,
 'concavity_worst': 0,
 'concave points_worst': 0,
 'symmetry_worst': 0,
 'fractal_dimension_worst': 0}

Convert the "diagnosis" column to numeric using StringIndexer

In [25]:
indexer = StringIndexer(inputCol="diagnosis", outputCol="label")
cancer_data_indexed = indexer.fit(cancer_data).transform(cancer_data)

Create the feature columns and feature vector

In [26]:
feature_columns = [col for col in cancer_data.columns if col != "diagnosis"]
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
cancer_data_assembled = assembler.transform(cancer_data_indexed)

Split the data into training and test sets.

In [27]:
(training_data, test_data) = cancer_data_assembled.randomSplit([0.75, 0.25], seed=42)

## Algorithm Comparison: Random Forest and Logistic Regression

Define the models.

In [28]:
rf_classifier = RandomForestClassifier(labelCol="label", featuresCol="features", seed=42)
lr_classifier = LogisticRegression(labelCol="label", featuresCol="features", maxIter=10, regParam=0.3, elasticNetParam=0.8)

Create pipelines.

In [29]:
rf_pipeline = Pipeline(stages=[rf_classifier])
lr_pipeline = Pipeline(stages=[lr_classifier])

Train the models.

In [30]:
rf_model = rf_pipeline.fit(training_data)
lr_model = lr_pipeline.fit(training_datarf_predictions = rf_model.transform(test_data)
lr_predictions = lr_model.transform(test_data))

SyntaxError: invalid syntax. Perhaps you forgot a comma? (3433136368.py, line 2)

Make predictions.

In [16]:
rf_predictions = rf_model.transform(test_data)
lr_predictions = lr_model.transform(test_data)

Evaluate the models.

In [17]:
evaluator = BinaryClassificationEvaluator(labelCol="label")

rf_auc = evaluator.evaluate(rf_predictions)
lr_auc = evaluator.evaluate(lr_predictions)

print(f"Random Forest AUC: {rf_auc}")
print(f"Logistic Regression AUC: {lr_auc}")

Random Forest AUC: 0.9934375
Logistic Regression AUC: 0.9906250000000001


In [20]:
# Stop SparkContext
sc.stop()