In [416]:
#Linear SVM Using Spark MLlib
from pyspark.sql import SparkSession
from pyspark.ml.classification import LinearSVC
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.sql.functions import col, when
from pyspark.ml import Pipeline
from pyspark.sql import functions as F


In [417]:
# Initialize the Spark session for this notebook (application name "ChurnSVM")
spark = (
    SparkSession.builder
    .appName("ChurnSVM")
    .getOrCreate()
)

In [418]:
# Load the dataset: the first CSV row holds the column names and column
# types are inferred by Spark.
churn = spark.read.csv(
    "/Cell Phone Company Churn data-2.csv",
    header=True,
    inferSchema=True,
)

# Recode the target column: rows whose churn value equals "TRUE" become 1,
# every other row becomes 0.
churn = churn.withColumn(
    "churn",
    F.when(F.col("churn") == "TRUE", 1).otherwise(0),
)

# Show the class distribution after recoding.
churn.groupBy("churn").count().show()

+-----+-----+
|churn|count|
+-----+-----+
|    1|  483|
|    0| 2850|
+-----+-----+



In [419]:
# Split the DataFrame by class label (0 = majority, 1 = minority in this data).
majority_df = churn.where(col("churn") == 0)
minority_df = churn.where(col("churn") == 1)

# Class sizes drive the sampling fraction below.
majority_count = majority_df.count()
minority_count = minority_df.count()

# Fraction > 1: with replacement, the expected size of the resampled minority
# class then matches the majority class (the result is approximate, not exact).
oversample_ratio = majority_count / minority_count

# Oversample the minority class; sampling with replacement duplicates rows.
# NOTE(review): the train/test split is performed later on data derived from
# `balanced_data`, so copies of the same minority row can end up in both
# splits, which can inflate the test metrics. Consider splitting first and
# oversampling only the training portion.
minority_oversampled_df = minority_df.sample(
    withReplacement=True,
    fraction=oversample_ratio,
    seed=42,
)

# Stack the untouched majority rows on top of the resampled minority rows.
balanced_data = majority_df.union(minority_oversampled_df)

# Print the per-class counts to confirm the classes are roughly balanced.
print("Balanced dataset counts:")
balanced_data.groupBy("churn").count().show()

Balanced dataset counts:
+-----+-----+
|churn|count|
+-----+-----+
|    0| 2850|
|    1| 2894|
+-----+-----+



In [420]:
#Features
# Columns kept out of the feature vector: the label itself plus the
# identifier/plan columns that are not used as model inputs.
# The label is excluded by name rather than by position, so the selection does
# not depend on "churn" being the last column, and the loop variable no longer
# reuses the name of the imported pyspark `col` function.
non_feature_cols = {"churn", "state", "phone number", "international plan", "voice mail plan"}
feature = [name for name in balanced_data.columns if name not in non_feature_cols]

# Pack the selected columns into a single vector column named "features".
assembler = VectorAssembler(inputCols=feature, outputCol="features")

In [421]:
# Preprocessing pipeline: assemble the feature vector, then scale it.

# The scaler reads the assembled "features" column and writes "scaledFeatures".
scaler = StandardScaler(
    inputCol="features",
    outputCol="scaledFeatures",
)

# Stages run in list order: assembler first, scaler second.
preprocessing_stages = [assembler, scaler]
pipeline = Pipeline(stages=preprocessing_stages)

In [422]:
# Fit the preprocessing pipeline and apply it to the same data.
# NOTE(review): the pipeline (including the scaler) is fit on all of
# `balanced_data`, i.e. before the train/test split, so the scaling statistics
# also reflect rows that later land in the test split.

pipelineModel = pipeline.fit(dataset=balanced_data)
processed = pipelineModel.transform(dataset=balanced_data)

In [423]:
# Random 80/20 train/test split of the preprocessed rows, with a fixed seed.

train, test = processed.randomSplit(weights=[0.8, 0.2], seed=42)


In [424]:
# Linear SVM trained on the scaled feature vector with "churn" as the label.

lsvs = LinearSVC(
    featuresCol="scaledFeatures",
    labelCol="churn",
    maxIter=10,
    regParam=0.1,
)

# Fit on the training split only.
model = lsvs.fit(dataset=train)

In [425]:
# Score both splits with the fitted model (training split first, then test split).
train_predict, test_predict = (
    model.transform(split_df) for split_df in (train, test)
)

In [426]:
#Function to calculate metrics

def calculate_metrics(predictions):
  """Compute binary classification metrics from a predictions DataFrame.

  Args:
    predictions: DataFrame with a "churn" label column and a "prediction"
      column, where 1 marks the positive class and 0 the negative class.

  Returns:
    Tuple ``(accuracy, sensitivity, specificity, balanced_accuracy)``.
    Any ratio whose denominator is zero is reported as 0.
  """
  # One grouped aggregation replaces four separate filter+count jobs.
  # Keys are (churn, prediction); a float prediction such as 1.0 compares and
  # hashes equal to the integer 1, so integer lookups below match either type.
  cell_counts = {
      (row["churn"], row["prediction"]): row["count"]
      for row in predictions.groupBy("churn", "prediction").count().collect()
  }
  tp = cell_counts.get((1, 1), 0)
  tn = cell_counts.get((0, 0), 0)
  fp = cell_counts.get((0, 1), 0)  # actual 0, predicted 1
  fn = cell_counts.get((1, 0), 0)  # actual 1, predicted 0

  #Calculate Metrics
  total = tp + tn + fp + fn
  # Guard accuracy the same way as the other ratios so empty input does not raise.
  accuracy = (tp + tn) / total if total != 0 else 0
  sensitivity = tp / (tp + fn) if (tp + fn) != 0 else 0
  specificity = tn / (tn + fp) if (tn + fp) != 0 else 0
  balanced_accuracy = (sensitivity + specificity) / 2
  return accuracy, sensitivity, specificity, balanced_accuracy

In [427]:
# Compute the four metrics on the training predictions and print one per line.

(
    train_accuracy,
    train_sensitivity,
    train_specificity,
    train_balanced_accuracy,
) = calculate_metrics(train_predict)

print(
    f"Training Accuracy: {train_accuracy}",
    f"Training Sensitivity: {train_sensitivity}",
    f"Training Specificity: {train_specificity}",
    f"Training Balanced Accuracy: {train_balanced_accuracy}",
    sep="\n",
)

Training Accuracy: 0.7177177177177178
Training Sensitivity: 0.7182675814751286
Training Specificity: 0.7171673819742489
Training Balanced Accuracy: 0.7177174817246887


In [428]:
# Compute the four metrics on the test predictions and print one per line.

(
    test_accuracy,
    test_sensitivity,
    test_specificity,
    test_balanced_accuracy,
) = calculate_metrics(test_predict)

print(
    f"Test Accuracy: {test_accuracy}",
    f"Test Sensitivity: {test_sensitivity}",
    f"Test Specificity: {test_specificity}",
    f"Test Balanced Accuracy: {test_balanced_accuracy}",
    sep="\n",
)



Test Accuracy: 0.7024029574861368
Test Sensitivity: 0.708185053380783
Test Specificity: 0.6961538461538461
Test Balanced Accuracy: 0.7021694497673145


In [429]:
# Function to display confusion matrix

def display_confusion_matrix(predictions):
  """Show the cross-tabulation of the "Churn" column against "prediction".

  Args:
    predictions: object exposing ``crosstab(col1, col2)`` whose result can be
      shown (here, a DataFrame of model predictions).

  Returns:
    None; the table is written out by ``show()``.
  """
  predictions.crosstab("Churn", "prediction").show()

# Print a heading and the confusion matrix for each split, training first.
for heading, predicted_df in (
    ("Training Confusion Matrix:", train_predict),
    ("Test Confusion Matrix:", test_predict),
):
  print(heading)
  display_confusion_matrix(predicted_df)

Training Confusion Matrix:
+----------------+----+----+
|Churn_prediction| 0.0| 1.0|
+----------------+----+----+
|               0|1671| 659|
|               1| 657|1675|
+----------------+----+----+

Test Confusion Matrix:
+----------------+---+---+
|Churn_prediction|0.0|1.0|
+----------------+---+---+
|               0|362|158|
|               1|164|398|
+----------------+---+---+



In [430]:
# Optional: show the first five test predictions next to their labels and features.
sample_columns = ["Churn", "prediction", "scaledFeatures"]
test_predict.select(sample_columns).show(n=5)

# Stop the Spark session; the Spark part of the notebook ends here.
spark.stop()



+-----+----------+--------------------+
|Churn|prediction|      scaledFeatures|
+-----+----------+--------------------+
|    0|       0.0|[1.02772891106296...|
|    0|       1.0|[1.30346105793351...|
|    0|       0.0|[1.30346105793351...|
|    0|       1.0|[1.52906008719123...|
|    0|       1.0|[1.95519158690026...|
+-----+----------+--------------------+
only showing top 5 rows



In [431]:
# Non-linear SVM using scikit-learn (with SMOTE from imbalanced-learn)
import numpy as np
import pandas as pd
from sklearn.svm import SVC # Corrected import
from imblearn.over_sampling import SMOTE
from collections import Counter
from sklearn.metrics import confusion_matrix, accuracy_score, balanced_accuracy_score, recall_score, precision_score

In [432]:
# Load the dataset into a pandas DataFrame.
# Note: this rebinds `churn`, which earlier in the notebook held the Spark DataFrame.
churn = pd.read_csv(filepath_or_buffer="/Cell Phone Company Churn data-2.csv")

In [433]:
# Separate predictors from the target.
# X: every column except the target and the four columns listed below.
X = churn.drop(
    columns=['churn', 'state', 'phone number', 'international plan', 'voice mail plan'],
)
# y: the target column.
y = churn['churn']

In [434]:
# Hold out 20% of the rows as a test set (fixed seed for repeatability).

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Balance the classes with SMOTE. Only the training portion is resampled;
# X_test / y_test are not passed to the sampler.
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Per-class counts after resampling.
print("Resampled training dataset shape:", Counter(y_train_resampled))

Resampled training dataset shape: Counter({False: 2284, True: 2284})


In [435]:
# Standardize the features.
# Note: this import rebinds `StandardScaler`, which the first cell imported
# from pyspark.ml.feature; from here on the name refers to the sklearn class.

from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the resampled training data only, then
# apply the same fitted scaler to both the training and the test features.
scaler = StandardScaler()
scaler.fit(X_train_resampled)
X_train_scaled = scaler.transform(X_train_resampled)
X_test_scaled = scaler.transform(X_test)



In [436]:
# Train a non-linear SVM: support vector classifier with an RBF kernel,
# fit on the scaled, resampled training data.
# Note: this rebinds `model`, which earlier held the Spark LinearSVC model.
model = SVC(kernel="rbf")
model.fit(X=X_train_scaled, y=y_train_resampled)



In [437]:
# Predict labels for the (resampled) training features and for the test features.
y_train_pred, y_test_pred = (
    model.predict(scaled_features)
    for scaled_features in (X_train_scaled, X_test_scaled)
)



In [438]:
# Function to calculate metrics
# Note: this redefines `calculate_metrics`; the earlier definition in this
# notebook took a Spark predictions DataFrame and is shadowed from here on.

def calculate_metrics(y_true, y_pred):
  """Compute binary classification metrics from true and predicted labels.

  Args:
    y_true: true labels, encoded as 0/1 or False/True.
    y_pred: predicted labels, same encoding as ``y_true``.

  Returns:
    Tuple ``(accuracy, sensitivity, specificity, balanced_accuracy, cm)``
    where ``cm`` is the 2x2 confusion matrix with rows = true class and
    columns = predicted class, in the order [negative, positive].
  """
  # Pin both labels so the matrix is always 2x2. Without `labels`, a split in
  # which only one class occurs yields a 1x1 matrix and the four-way unpack
  # below raises ValueError. False/True compare equal to 0/1.
  cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
  tn, fp, fn, tp = cm.ravel()
  accuracy = accuracy_score(y_true, y_pred)
  sensitivity = recall_score(y_true, y_pred)
  # Report 0 instead of dividing by zero when there are no actual negatives,
  # matching the guard used by the Spark version of this function.
  specificity = tn / (tn + fp) if (tn + fp) != 0 else 0
  balanced_accuracy = balanced_accuracy_score(y_true, y_pred)
  return accuracy, sensitivity, specificity, balanced_accuracy, cm

In [439]:
# Compute the metrics on the resampled training data and print them,
# followed by the confusion matrix.

(
    train_accuracy,
    train_sensitivity,
    train_specificity,
    train_balanced_accuracy,
    train_cm,
) = calculate_metrics(y_train_resampled, y_train_pred)

print(
    f"Training Accuracy: {train_accuracy}",
    f"Training Sensitivity: {train_sensitivity}",
    f"Training Specificity: {train_specificity}",
    f"Training Balanced Accuracy: {train_balanced_accuracy}",
    "Training Confusion Matrix:",
    train_cm,
    sep="\n",
)

Training Accuracy: 0.899737302977233
Training Sensitivity: 0.8879159369527145
Training Specificity: 0.9115586690017513
Training Balanced Accuracy: 0.8997373029772329
Training Confusion Matrix:
[[2082  202]
 [ 256 2028]]


In [440]:
# Compute the metrics on the held-out test data and print them,
# followed by the confusion matrix.

(
    test_accuracy,
    test_sensitivity,
    test_specificity,
    test_balanced_accuracy,
    test_cm,
) = calculate_metrics(y_test, y_test_pred)

print(
    f"Test Accuracy: {test_accuracy}",
    f"Test Sensitivity: {test_sensitivity}",
    f"Test Specificity: {test_specificity}",
    f"Test Balanced Accuracy: {test_balanced_accuracy}",
    "Test Confusion Matrix:",
    test_cm,
    sep="\n",
)



Test Accuracy: 0.8575712143928036
Test Sensitivity: 0.6732673267326733
Test Specificity: 0.8904593639575972
Test Balanced Accuracy: 0.7818633453451352
Test Confusion Matrix:
[[504  62]
 [ 33  68]]
