In [0]:
#pip install pyspark

In [0]:
# Part 2: logistic regression trained by batch gradient descent on an RDD

In [0]:
import sys
from typing import Iterable, List
import numpy as np
from pyspark.sql import SparkSession

# Set the number of dimensions to 32 for the Breast Cancer dataset features
# NOTE(review): Part 3 of this notebook selects 30 feature columns; confirm that
# 32 really is the feature count of the file read in Part 2.
D = 32

def readPointBatch(iterator: Iterable[str]) -> List[np.ndarray]:
    """Parse one partition of CSV lines into a single (rows, D + 1) matrix.

    Intended for use with ``RDD.mapPartitions``: the whole partition becomes
    ONE RDD element, so downstream code can work on a matrix instead of on
    individual rows.

    Args:
        iterator: Text lines of one partition. Each line must hold exactly
            ``D + 1`` comma-separated numbers, label first, then D features
            (no header line).

    Returns:
        A one-element list containing the float64 matrix for this partition.
        An empty partition yields a ``(0, D + 1)`` matrix.

    Raises:
        ValueError: If a line does not parse into exactly ``D + 1`` numbers.
    """
    strs = list(iterator)
    matrix = np.zeros((len(strs), D + 1))  # D features + 1 label column
    for i, s in enumerate(strs):
        row = np.fromstring(s, dtype=np.float32, sep=',')
        # Without this check a short parse (e.g. a non-numeric field stops the
        # parser after one value) would be silently broadcast across the row.
        if row.size != D + 1:
            raise ValueError(
                f"expected {D + 1} comma-separated numeric fields, "
                f"got {row.size} in line {s[:80]!r}"
            )
        matrix[i] = row
    # mapPartitions flattens whatever iterable is returned. Returning the bare
    # matrix would emit its rows one by one; wrapping it in a list keeps the
    # matrix intact as a single element.
    return [matrix]
    


# The original command-line guard (`len(sys.argv) != 3` -> sys.exit) was
# dropped: the input path and the iteration count are hard-coded below, so the
# check could only abort the notebook, never configure it.
spark = SparkSession.builder.appName("PythonLR").getOrCreate()
sc = spark.sparkContext

# One matrix per partition (see readPointBatch); lines starting with 'id' are
# treated as the CSV header and skipped. Cached because every gradient-descent
# iteration re-reads the same data.
points = (
    sc.textFile('/FileStore/tables/breastcancer_data.csv')
    .filter(lambda line: isinstance(line, str) and not line.startswith('id'))
    .mapPartitions(readPointBatch)
    .cache()
)

iterations = 100

# Initialize weights w uniformly in [-1, 1). The vector is 1-D with length D so
# that `X.dot(w)` yields one value per row; a (1, D) array cannot be multiplied
# with an (n, D) feature matrix. Seeded so that re-running the cell reproduces
# the same starting point.
np.random.seed(42)
w = 2 * np.random.rand(D) - 1

print("Initial w: " + str(w))

def gradient(matrix: np.ndarray, w: np.ndarray) -> np.ndarray:
    """Sum of per-row logistic-loss gradients for one batch of points.

    Args:
        matrix: 2-D array of shape (n, D + 1); column 0 is the label, the
            remaining columns are the features.
        w: Weight vector with D entries. Both shape (D,) and shape (1, D) are
            accepted; it is flattened before use.

    Returns:
        1-D array of length D: the sum over all rows of
        ``(sigmoid(y * x.w) - 1) * y * x``. An empty batch yields zeros.

    Note:
        This expression is the gradient of ``log(1 + exp(-y * x.w))``, which
        presumes labels encoded as -1 / +1 -- TODO confirm the label encoding
        of the input file.
    """
    Y = matrix[:, 0]   # labels, shape (n,)
    X = matrix[:, 1:]  # features, shape (n, D)

    # Flatten so a (1, D) weight array works in the dot product below.
    weights = np.asarray(w).reshape(-1)

    margin = Y * X.dot(weights)                         # shape (n,)
    coeff = (1.0 / (1.0 + np.exp(-margin)) - 1.0) * Y   # shape (n,)

    # Scale each feature row by its own coefficient. The coefficient must be
    # fully reduced to (n,) BEFORE adding the trailing axis; multiplying an
    # (n,) array by Y[:, None] would broadcast to (n, n).
    return (coeff[:, None] * X).sum(0)
    
def add(x: np.ndarray, y: np.ndarray) -> np.ndarray:
    """Accumulate ``y`` into ``x`` in place and hand back ``x``.

    Used as the combiner for ``reduce``: the left operand is reused as the
    running total, so no new array is allocated per merge.
    """
    return np.add(x, y, out=x)

# Batch gradient descent: every iteration computes the gradient of each
# partition's matrix on the executors, sums the partial gradients with `add`,
# and subtracts the total from the weights (implicit step size of 1).
for i in range(iterations):
    print(f"On iteration {i + 1}")
    w -= points.map(lambda m: gradient(m, w)).reduce(add)

# Report the weights once, after the last iteration (previously this line sat
# inside the loop and printed "Final w" on every pass).
print("Final w: " + str(w))

# NOTE(review): on managed notebook platforms that share one Spark session
# across cells, stopping it here may break later cells -- confirm this call is
# wanted in that environment.
spark.stop()

In [0]:
# Part 3: logistic regression with the Spark ML pipeline API

In [0]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import (
    BinaryClassificationEvaluator,
    MulticlassClassificationEvaluator,
)

# Initialize a Spark session
spark = SparkSession.builder.appName("LogisticRegressionBreastCancer").getOrCreate()

# Read the breast cancer dataset
data = spark.read.csv("/FileStore/tables/breast_cancer.csv", header=True, inferSchema=True)

# Target column and the 30 numeric feature columns. The list is defined once
# and reused for both the projection and the vector assembler so the two can
# never drift apart.
label_col = "diagnosis"
feature_cols = ["radius_mean", "texture_mean", "perimeter_mean", "area_mean",
                "smoothness_mean", "compactness_mean", "concavity_mean", "concave points_mean",
                "symmetry_mean", "fractal_dimension_mean", "radius_se", "texture_se", "perimeter_se",
                "area_se", "smoothness_se", "compactness_se", "concavity_se", "concave points_se",
                "symmetry_se", "fractal_dimension_se", "radius_worst", "texture_worst",
                "perimeter_worst", "area_worst", "smoothness_worst", "compactness_worst",
                "concavity_worst", "concave points_worst", "symmetry_worst", "fractal_dimension_worst"]

# Keep only the label and the features, then drop rows with missing values
selected_data = data.select(label_col, *feature_cols).na.drop()

# LogisticRegression needs a numeric label. If the schema inference produced a
# string column (presumably categorical codes such as "M"/"B" -- verify against
# the file), index it into 0.0 / 1.0. StringIndexer assigns index 0 to the most
# frequent value by default.
if dict(selected_data.dtypes)[label_col] == "string":
    label_indexer = StringIndexer(inputCol=label_col, outputCol="label")
    selected_data = label_indexer.fit(selected_data).transform(selected_data)
    label_col = "label"

# Assemble features into a vector
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
assembled_data = assembler.transform(selected_data)

# Split the data into training and testing sets
train_data, test_data = assembled_data.randomSplit([0.7, 0.3], seed=42)

# Initialize the Logistic Regression model
lr = LogisticRegression(featuresCol="features", labelCol=label_col)

# Fit the model to the training data
lr_model = lr.fit(train_data)

# Make predictions on both the train and test data
train_predictions = lr_model.transform(train_data)
test_predictions = lr_model.transform(test_data)

# Accuracy is a multiclass metric in Spark ML. BinaryClassificationEvaluator
# reports area under the ROC curve by default, so using it for the "accuracy"
# figures mislabelled the result.
accuracy_evaluator = MulticlassClassificationEvaluator(
    labelCol=label_col, predictionCol="prediction", metricName="accuracy"
)
train_accuracy = accuracy_evaluator.evaluate(train_predictions)
test_accuracy = accuracy_evaluator.evaluate(test_predictions)

# Area under ROC, reported separately under its real name
evaluator = BinaryClassificationEvaluator(labelCol=label_col)
train_auc = evaluator.evaluate(train_predictions)
test_auc = evaluator.evaluate(test_predictions)

# Print the metrics
print(f"Train Accuracy: {train_accuracy:.4f}")
print(f"Test Accuracy: {test_accuracy:.4f}")
print(f"Train AUC: {train_auc:.4f}")
print(f"Test AUC: {test_auc:.4f}")

# Stop the Spark session
spark.stop()
