In [1]:
# Import necessary PySpark libraries
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import VectorAssembler
from pyspark.sql import SparkSession

# Start the Spark session for this notebook, or reuse one that is already running
spark = (
    SparkSession.builder
    .appName("Sonar Rock vs Mine")
    .getOrCreate()
)

In [2]:
# Read the headerless CSV; Spark auto-names the columns (_c0, _c1, ...) and
# infers each column's type from the data
sonar_data = (
    spark.read
    .options(header=False, inferSchema=True)
    .csv("sonar data.csv")
)

# Preview the first five rows
sonar_data.show(n=5)

+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+----+
|   _c0|   _c1|   _c2|   _c3|   _c4|   _c5|   _c6|   _c7|   _c8|   _c9|  _c10|  _c11|  _c12|  _c13|  _c14|  _c15|  _c16|  _c17|  _c18|  _c19|  _c20|  _c21|  _c22|  _c23|  _c24|  _c25|  _c26|  _c27|  _c28|  _c29|  _c30|  _c31|  _c32|  _c33|  _c34|  _c35|  _c36|  _c37|  _c38|  _c39|  _c40|  _c41|  _c42|  _c43|  _c44|  _c45|  _c46|  _c47|  _c48|  _c49|  _c50|  _c51|  _c52|  _c53|  _c54|  _c55|  _c56|  _c57|  _c58|  _c59|_c60|
+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+-----

In [3]:
# Dataset dimensions: count() runs a Spark job, the column list is metadata only
num_rows, num_columns = sonar_data.count(), len(sonar_data.columns)
print(f"Rows: {num_rows}, Columns: {num_columns}")


Rows: 208, Columns: 61


In [4]:
# Summary statistics for the columns of the dataset
summary_stats = sonar_data.describe()
summary_stats.show()


+-------+--------------------+-------------------+-------------------+-------------------+-------------------+-------------------+-------------------+-------------------+-------------------+-------------------+------------------+-------------------+------------------+-------------------+-------------------+-------------------+-------------------+-------------------+------------------+------------------+-------------------+------------------+-------------------+-------------------+-------------------+-------------------+-------------------+-------------------+-------------------+-------------------+-------------------+-------------------+-------------------+-------------------+------------------+-------------------+-------------------+-------------------+------------------+------------------+-------------------+-------------------+-------------------+-------------------+-------------------+-------------------+-------------------+-------------------+-------------------+------------------

In [5]:
# Class balance: number of rows for each distinct value of the last column (_c60)
label_counts = sonar_data.groupBy("_c60").count()
label_counts.show()


+----+-----+
|_c60|count|
+----+-----+
|   M|  111|
|   R|   97|
+----+-----+



In [6]:
# Per-class averages: mean of the numeric columns within each _c60 group
class_means = sonar_data.groupBy("_c60").mean()
class_means.show()


+----+--------------------+--------------------+--------------------+--------------------+--------------------+-------------------+-------------------+-------------------+-------------------+-------------------+------------------+-------------------+-------------------+-------------------+------------------+------------------+-------------------+-------------------+-------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+-------------------+-------------------+-------------------+-------------------+-------------------+-------------------+------------------+-------------------+-------------------+-------------------+------------------+-------------------+-------------------+-------------------+-------------------+--------------------+--------------------+------

In [7]:
# Encode the string class column (_c60) as a numeric "label" column.
# StringIndexer is imported with the other PySpark imports in the first cell.
label_encoder = StringIndexer(inputCol="_c60", outputCol="label")

# Keep the fitted indexer so the string -> index mapping can be inspected
# (label_encoder_model.labels lists the strings in index order).
# NOTE(review): the default ordering is by descending frequency, so with the
# class counts shown earlier (M=111, R=97) "M" should map to 0.0 and "R" to
# 1.0 -- verify against label_encoder_model.labels.
label_encoder_model = label_encoder.fit(sonar_data)
sonar_data_with_labels = label_encoder_model.transform(sonar_data)

# Show the encoded labels for the first five rows
Y_encoded = sonar_data_with_labels.select("label")
Y_encoded.show(5)


+-----+
|label|
+-----+
|  1.0|
|  1.0|
|  1.0|
|  1.0|
|  1.0|
+-----+
only showing top 5 rows



In [8]:
# Hold out roughly 10% of the rows for testing (weights are 90% / 10%);
# a fixed seed is passed so the split can be reproduced
split_weights = [0.9, 0.1]
train_data, test_data = sonar_data_with_labels.randomSplit(split_weights, seed=1)


In [9]:
# Every original column except the last one (the class column) is a feature;
# pack them into a single "features" vector column on both splits
feature_columns = sonar_data.columns[:-1]
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")

train_data_transformed = assembler.transform(train_data)
test_data_transformed = assembler.transform(test_data)

In [10]:
# Fit a logistic regression classifier on the training split only
lr = LogisticRegression(
    labelCol="label",
    featuresCol="features",
)
model = lr.fit(train_data_transformed)

In [11]:
# Apply the fitted model to both splits to produce their prediction DataFrames
train_predictions, test_predictions = (
    model.transform(dataset)
    for dataset in (train_data_transformed, test_data_transformed)
)

In [12]:
# Evaluate classification accuracy on both splits.
# MulticlassClassificationEvaluator is already imported in the first cell,
# so it is not imported again here.
evaluator = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="accuracy",
)

# Score the training split and the held-out split with the same evaluator
train_accuracy = evaluator.evaluate(train_predictions)
test_accuracy = evaluator.evaluate(test_predictions)

# NOTE(review): the recorded run reports 1.0 on training vs 0.85 on the small
# test split, which may indicate overfitting -- consider regularization
# (e.g. LogisticRegression regParam) and a larger or cross-validated test set.
print(f"Training Data Accuracy: {train_accuracy}")
print(f"Testing Data Accuracy: {test_accuracy}")


Training Data Accuracy: 1.0
Testing Data Accuracy: 0.85
