# Cancer Diagnosis using Machine Learning (PySpark)

In [41]:
# Imports
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.ml.classification import RandomForestClassifier, LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml import Pipeline
from pyspark.sql.functions import col

# Convert the "diagnosis" column to numeric using StringIndexer
indexer = StringIndexer(inputCol="diagnosis", outputCol="label")
cancer_data_indexed = indexer.fit(cancer_data_assembled).transform(cancer_data_assembled)

# Now "label" is the numeric representation of "diagnosis"
(training_data, test_data) = cancer_data_indexed.randomSplit([0.75, 0.25], seed=42)

In [26]:
# Initialize SparkContext
sc = SparkContext("local", "CancerDiagnosis")

# Initialize Spark session using SparkContext
spark = SparkSession(sc)

### Getting the data ready

Import the cancer data.

In [27]:
cancer_data = spark.read.csv("../data/cancer_data.csv", header=True, inferSchema=True)
cancer_data.limit(5).toPandas()

Unnamed: 0,id,diagnosis,Radius_mean,Texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,842517,M,20.57,21.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


Check if there are any missing values.

In [28]:
# Get the sum of all missing values
{col: cancer_data.filter(cancer_data[col].isNull()).count() for col in cancer_data.columns}

{'id': 0,
 'diagnosis': 0,
 'Radius_mean': 0,
 'Texture_mean': 0,
 'perimeter_mean': 0,
 'area_mean': 0,
 'smoothness_mean': 0,
 'compactness_mean': 0,
 'concavity_mean': 0,
 'concave points_mean': 0,
 'symmetry_mean': 0,
 'fractal_dimension_mean': 0,
 'radius_se': 0,
 'texture_se': 0,
 'perimeter_se': 0,
 'area_se': 0,
 'smoothness_se': 0,
 'compactness_se': 0,
 'concavity_se': 0,
 'concave points_se': 0,
 'symmetry_se': 0,
 'fractal_dimension_se': 0,
 'radius_worst': 0,
 'texture_worst': 0,
 'perimeter_worst': 0,
 'area_worst': 0,
 'smoothness_worst': 0,
 'compactness_worst': 0,
 'concavity_worst': 0,
 'concave points_worst': 0,
 'symmetry_worst': 0,
 'fractal_dimension_worst': 0}

Convert the "diagnosis" column to numeric using StringIndexer

Create the feature columns and feature vector

In [31]:
feature_columns = [col for col in cancer_data.columns if col != "diagnosis"]
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
cancer_data_assembled = assembler.transform(cancer_data)

Split the data into training and test sets.

In [36]:
(training_data, test_data) = cancer_data_indexed.randomSplit([0.75, 0.25], seed=42)

## Algorithm Comparison: Random Forest and Logistic Regression

In [37]:
# Define the models
rf_classifier = RandomForestClassifier(labelCol="diagnosis", featuresCol="features", seed=42)
lr_classifier = LogisticRegression(labelCol="diagnosis", featuresCol="features", maxIter=10, regParam=0.3, elasticNetParam=0.8)

In [38]:
# Create pipelines
rf_pipeline = Pipeline(stages=[rf_classifier])
lr_pipeline = Pipeline(stages=[lr_classifier])

In [39]:
# Train the models
rf_model = rf_pipeline.fit(training_data)
lr_model = lr_pipeline.fit(training_data)

IllegalArgumentException: requirement failed: Column diagnosis must be of type numeric but was actually of type string.

In [42]:
# Stop SparkContext
sc.stop()