# Milestone 4

In [1]:
from pyspark.sql import SparkSession, Row
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import LogisticRegression, DecisionTreeClassifier, RandomForestClassifier, GBTClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from pyspark.ml import Pipeline
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

In [2]:
###### Define train-test split ######
# Fractions handed to DataFrame.randomSplit: 80% for training, 20% for testing.
train_split, test_split = 0.8, 0.2

In [11]:
###### Set up session & data ######
# Start (or reuse) the Spark session for this notebook.
spark = (
    SparkSession.builder
    .appName("DecisionTreeJob")
    .getOrCreate()
)

# Read the encoded flight CSV from GCS as an RDD of raw text lines.
csv_rdd = spark.sparkContext.textFile(
    "gs://bigdataprojectbucketsnew1/flight_encoded_rdd.csv/part-00000-555f6d26-eb7c-4074-8f48-1da198d98ad1-c000.csv"
)

# Split every line on commas; the first record holds the column names.
csv_rows = csv_rdd.map(lambda line: line.split(","))
header = csv_rows.first()

# Drop the header record, then turn each remaining record into a Row whose
# fields are named after the header and whose values are parsed as floats.
csv_data = (
    csv_rows
    .filter(lambda row: row != header)
    .map(lambda row: Row(**{name: float(row[i]) for i, name in enumerate(header)}))
)

In [12]:
# Build the working DataFrame from the RDD of Rows (schema is inferred from the Rows)
sampled_df = spark.createDataFrame(data=csv_data)

# Optional: uncomment to work on a 50-row subset while debugging
#sampled_df = sampled_df.limit(50)

In [13]:
# Number of rows in the loaded DataFrame (3202 when this cell was last run)
sampled_df.count()
#3202

3202

In [15]:
# Define the target column
target_column = 'Delay'

# Define feature columns: every column except the target.
# Selecting by name (rather than taking columns[:-1]) keeps the label out of
# the feature vector even when 'Delay' is not the last column of the CSV;
# a positional slice would then leak the label into the model inputs.
feature_columns = [column for column in sampled_df.columns if column != target_column]

In [16]:
# Assemble the individual feature columns into a single vector column
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")

# Wrap the assembler in a (single-stage) pipeline
pipeline = Pipeline(stages=[assembler])

# Fit the pipeline to the data
pipelineModel = pipeline.fit(sampled_df)

# Transform the data with the pipeline
sampled_df_transformed = pipelineModel.transform(sampled_df)

# Keep only what the models need: the feature vector and the target
sampled_df_model = sampled_df_transformed.select("features", target_column)

# Split data into training and test sets.
# A fixed seed makes the split repeatable, so metrics from different runs
# and different models are evaluated on the same rows.
train_df, test_df = sampled_df_model.randomSplit([train_split, test_split], seed=42)

In [28]:
# Total number of columns in the loaded DataFrame (this count includes the 'Delay' target)
len(sampled_df.columns)

115

In [27]:
# Display the training DataFrame's column names and types
train_df

DataFrame[features: vector, Delay: double]

# 1. Logistic Regression

In [16]:
# Fit a logistic regression classifier (library-default hyperparameters)
# on the training split
lr = LogisticRegression(labelCol=target_column, featuresCol='features')
lr_Model = lr.fit(train_df)

24/04/09 10:22:50 WARN com.github.fommil.netlib.BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS
24/04/09 10:22:50 WARN com.github.fommil.netlib.BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS


In [17]:
# Accuracy evaluator shared by the training-set and test-set checks below
evaluator_acc = MulticlassClassificationEvaluator(
    labelCol=target_column,
    predictionCol="prediction",
    metricName="accuracy",
)

# Accuracy on the data the model was fitted on
train_predictions = lr_Model.transform(train_df)
accuracy = evaluator_acc.evaluate(train_predictions)
print(f"Training Accuracy: {accuracy}")

# Accuracy on the held-out test split
predictions = lr_Model.transform(test_df)
accuracy = evaluator_acc.evaluate(predictions)
print(f"Test Accuracy: {accuracy}")

Training Accuracy: 1.0
Test Accuracy: 1.0


## 1.1 Hyperparameter Tuning

In [23]:
# Candidate settings for logistic regression:
# 3 regularisation strengths x 3 elastic-net mixing values
paramGrid = (
    ParamGridBuilder()
    .addGrid(lr.regParam, [0.01, 0.1, 1.0])
    .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0])
    .build()
)

In [24]:
# Define evaluator: candidates are ranked by area under the ROC curve.
# The evaluator reads the continuous scores in "rawPrediction"; pointing it at
# the hard 0/1 labels in "prediction" collapses the ROC curve to a single
# operating point and gives a misleading AUC.
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction",
                                          labelCol=target_column,
                                          metricName="areaUnderROC")

# Set up 5-fold cross-validation over the logistic regression grid
crossval = CrossValidator(estimator=lr,
                          estimatorParamMaps=paramGrid,
                          evaluator=evaluator,
                          numFolds=5)

# Run cross-validation, and choose the best set of parameters
cvModel = crossval.fit(train_df)

# Fetch the best model (refitted on all of train_df by CrossValidator)
bestModel = cvModel.bestModel

# Make predictions on the test set
predictions = bestModel.transform(test_df)

24/04/09 10:35:10 WARN org.apache.spark.sql.execution.CacheManager: Asked to cache already cached data.
24/04/09 10:35:10 WARN org.apache.spark.sql.execution.CacheManager: Asked to cache already cached data.
                                                                                

In [25]:
# Area under the ROC curve (AUC) of the tuned model on the test set;
# the metric is requested explicitly through a per-call parameter override.
auc = evaluator.evaluate(predictions, params={evaluator.metricName: "areaUnderROC"})
print(f"Area Under ROC: {auc}")

Area Under ROC: 1.0


In [28]:
# Evaluate the best model's accuracy on the test set.
# A dedicated accuracy evaluator is built here because `evaluator` from the
# cross-validation cell is a BinaryClassificationEvaluator, which reports
# area under ROC rather than accuracy.
best_model_acc_evaluator = MulticlassClassificationEvaluator(labelCol=target_column,
                                                             predictionCol="prediction",
                                                             metricName="accuracy")
accuracy = best_model_acc_evaluator.evaluate(predictions)
print(f"Best Model Accuracy: {accuracy}")

# Optional: view the hyperparameters selected by cross-validation,
# read from the underlying JVM model object
best_regParam = bestModel._java_obj.getRegParam()
best_elasticNetParam = bestModel._java_obj.getElasticNetParam()
print(f"Best Reg Param: {best_regParam}, Best Elastic Net Param: {best_elasticNetParam}")

Best Model Accuracy: 1.0
Best Max Depth: 0.01, Best Max Bins: 0.0


# 2. Decision Tree

In [34]:
###### Modeling ######
# Initialize the DecisionTreeClassifier
dt = DecisionTreeClassifier(labelCol=target_column, featuresCol="features")

# Train the decision tree model
dt_model = dt.fit(train_df)

# Accuracy evaluator used for both the training and the test split
evaluator_acc = MulticlassClassificationEvaluator(labelCol=target_column, predictionCol="prediction", metricName="accuracy")

# Evaluate the model on the training set by accuracy
train_predictions = dt_model.transform(train_df)
accuracy = evaluator_acc.evaluate(train_predictions)
print(f"Training Accuracy: {accuracy}")

# Evaluate the model on the test set by accuracy
predictions = dt_model.transform(test_df)
accuracy = evaluator_acc.evaluate(predictions)
print(f"Test Accuracy: {accuracy}")

# Calculate the area under the ROC curve (AUC).
# The AUC evaluator is created in this cell instead of reusing a variable named
# `evaluator` from another cell: that name is rebound to different evaluator
# types throughout the notebook, so the result would depend on execution order.
# AUC is computed from the continuous "rawPrediction" scores.
evaluator_auc = BinaryClassificationEvaluator(labelCol=target_column, rawPredictionCol="rawPrediction", metricName="areaUnderROC")
auc = evaluator_auc.evaluate(predictions)
print(f"Area Under ROC: {auc}")

Training Accuracy: 1.0
Test Accuracy: 1.0
Area Under ROC: 1.0


## 2.1 Hyperparameter Tuning

In [12]:
# Candidate decision-tree settings: 3 depths x 3 bin counts
param_grid = (
    ParamGridBuilder()
    .addGrid(dt.maxDepth, [5, 10, 15])
    .addGrid(dt.maxBins, [20, 30, 40])
    .build()
)

# Rank the candidates by accuracy using 3-fold cross-validation
# (the number of folds can be adjusted as needed)
evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    predictionCol="prediction",
    labelCol="Delay",
)
crossval = CrossValidator(
    numFolds=3,
    estimator=dt,
    estimatorParamMaps=param_grid,
    evaluator=evaluator,
)

# Search the grid on the training split
cv_model = crossval.fit(train_df)

# Score the test split with the winning model
best_model = cv_model.bestModel
predictions = best_model.transform(test_df)

                                                                                

In [13]:
# Test-set accuracy of the cross-validated decision tree
accuracy = evaluator.evaluate(predictions)
print(f"Best Model Accuracy: {accuracy}")

# Hyperparameters picked by cross-validation, read from the underlying JVM model
dt_java_model = best_model._java_obj
best_maxDepth = dt_java_model.getMaxDepth()
best_maxBins = dt_java_model.getMaxBins()
print(f"Best Max Depth: {best_maxDepth}, Best Max Bins: {best_maxBins}")

Best Model Accuracy: 1.0
Best Max Depth: 5, Best Max Bins: 20


# 3. Random Forest Classifier

In [30]:
#RandomForestClassifier

In [35]:
# Fit a random forest (library-default hyperparameters) on the training split
rf = RandomForestClassifier(featuresCol="features", labelCol=target_column)
rf_model = rf.fit(train_df)

In [36]:
# Evaluators: accuracy on hard predictions, AUC on the raw scores
evaluator_acc = MulticlassClassificationEvaluator(
    labelCol=target_column,
    predictionCol="prediction",
    metricName="accuracy",
)
evaluator_auc = BinaryClassificationEvaluator(
    labelCol=target_column,
    rawPredictionCol="rawPrediction",
    metricName="areaUnderROC",
)

# Accuracy on the training split
train_predictions = rf_model.transform(train_df)
train_accuracy = evaluator_acc.evaluate(train_predictions)
print(f"Training Accuracy: {train_accuracy}")

# Accuracy on the test split
test_predictions = rf_model.transform(test_df)
test_accuracy = evaluator_acc.evaluate(test_predictions)
print(f"Test Accuracy: {test_accuracy}")

# Area under the ROC curve on the test split
auc = evaluator_auc.evaluate(test_predictions)
print(f"Area Under ROC: {auc}")

Training Accuracy: 0.9737151824244802
Test Accuracy: 0.9509954058192955
Area Under ROC: 0.9993418743418743


## 3.1 Feature importances analysis

In [40]:
# Importance score of each input feature, as estimated by the random forest
feature_importances = rf_model.featureImportances

# Features scoring at or above this value are kept as "important"
threshold = 0.01
important_features = []

# Print every (feature, importance) pair and collect the important ones
# in the same pass; names and scores are matched by position.
print("Feature Importances:")
for feature, importance in zip(feature_columns, feature_importances):
    print(f"{feature}: {importance}")
    if importance >= threshold:
        important_features.append(feature)

print("Important Features:", important_features)

Feature Importances:
departure_airport_avg_altitude: 0.0
departure_airport_avg_bearing: 0.014632612571395651
departure_airport_avg_knots: 0.009129679230682138
departure_airport_avg_temperature: 0.018308213260391063
departure_airport_min_altitude: 0.0
departure_airport_min_bearing: 0.008913779802424453
departure_airport_min_knots: 0.0
departure_airport_min_temperature: 0.023004848269726666
departure_airport_max_altitude: 0.0
departure_airport_max_bearing: 0.0
departure_airport_max_knots: 0.008181979464980732
departure_airport_max_temperature: 0.004083018514116807
arrival_airport_avg_altitude: 0.0
arrival_airport_avg_bearing: 0.0038677992100691998
arrival_airport_avg_knots: 0.008264721418263976
arrival_airport_avg_temperature: 0.02530843404603288
arrival_airport_min_altitude: 0.0
arrival_airport_min_bearing: 0.0005166539383457392
arrival_airport_min_knots: 0.0
arrival_airport_min_temperature: 0.005740648292316566
arrival_airport_max_altitude: 0.0
arrival_airport_max_bearing: 0.0
arrival_

## 3.2 Important feature selection

In [45]:
# Use important features for further analysis or model training.
# The target is filtered out of the feature list first, so it is selected
# exactly once (as the last column) and can never reappear as a feature.
selected_features = [feature for feature in important_features if feature != target_column]
selected_data = sampled_df.select(selected_features + [target_column])

In [46]:
######## Process the data & do train-test split #######

# Define the target column
target_column = 'Delay'

# Define feature columns: every selected column except the target.
# Filtering by name (instead of slicing off the last column) guarantees the
# label is never assembled into the feature vector.
feature_columns = [column for column in selected_data.columns if column != target_column]

# Assemble features into a single vector column
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")

# Define a simple (single-stage) pipeline
pipeline = Pipeline(stages=[assembler])

# Fit the pipeline to the data; sampled_df contains every selected column
pipelineModel = pipeline.fit(sampled_df)

# Transform the data with the pipeline
sampled_df_transformed = pipelineModel.transform(sampled_df)

# Select features and target column for modeling
sampled_df_model = sampled_df_transformed.select("features", target_column)

# Split data into training and test sets.
# A fixed seed makes the split repeatable, so a change in the metrics can be
# attributed to the feature selection rather than to a different random split.
train_df, test_df = sampled_df_model.randomSplit([train_split, test_split], seed=42)

In [47]:
######## train rf again ########
# Refit the random forest, this time on the reduced feature set
rf = RandomForestClassifier(featuresCol="features", labelCol=target_column)
rf_model = rf.fit(train_df)

######## get the accuracy & AUC for model trained on important features ########
# Evaluators: accuracy on hard predictions, AUC on the raw scores
evaluator_acc = MulticlassClassificationEvaluator(
    labelCol=target_column,
    predictionCol="prediction",
    metricName="accuracy",
)
evaluator_auc = BinaryClassificationEvaluator(
    labelCol=target_column,
    rawPredictionCol="rawPrediction",
    metricName="areaUnderROC",
)

# Accuracy on the training split
train_predictions = rf_model.transform(train_df)
train_accuracy = evaluator_acc.evaluate(train_predictions)
print(f"Training Accuracy: {train_accuracy}")

# Accuracy on the test split
test_predictions = rf_model.transform(test_df)
test_accuracy = evaluator_acc.evaluate(test_predictions)
print(f"Test Accuracy: {test_accuracy}")

# Area under the ROC curve on the test split
auc = evaluator_auc.evaluate(test_predictions)
print(f"Area Under ROC: {auc}")

Training Accuracy: 1.0
Test Accuracy: 1.0
Area Under ROC: 1.0


After selecting the most important features, the training accuracy, test accuracy, and test AUC are all 1. This looks good, but the improvement might not be due to the feature selection alone. It **might also be caused by the new train-test split** that was made after selecting the important features.

## 3.3 Hyperparameter Tuning

### 3.3.1 After important feature selection

In [54]:
# Candidate random-forest settings: 3 forest sizes x 2 tree depths
param_grid = (
    ParamGridBuilder()
    .addGrid(rf.numTrees, [10, 20, 30])
    .addGrid(rf.maxDepth, [5, 10])
    .build()
)

# Rank the candidates by accuracy using 5-fold cross-validation
evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    predictionCol="prediction",
    labelCol=target_column,
)
cross_val = CrossValidator(
    numFolds=5,
    estimator=rf,
    estimatorParamMaps=param_grid,
    evaluator=evaluator,
)

# Search the grid on the training split
cv_model = cross_val.fit(train_df)

# Score the test split with the winning model
best_model = cv_model.bestModel
predictions = best_model.transform(test_df)

24/04/09 19:25:30 WARN org.apache.spark.sql.execution.CacheManager: Asked to cache already cached data.
24/04/09 19:25:30 WARN org.apache.spark.sql.execution.CacheManager: Asked to cache already cached data.


In [57]:
# Test-set accuracy of the cross-validated random forest (selected features)
accuracy = evaluator.evaluate(predictions)
print(f"Best Model Accuracy: {accuracy}")

# Hyperparameters picked by cross-validation, read from the underlying JVM model
rf_java_model = best_model._java_obj
best_numTrees = rf_java_model.getNumTrees()
best_maxDepth = rf_java_model.getMaxDepth()
print(f"Best Num Trees: {best_numTrees}, Best Max Depth: {best_maxDepth}")

Best Model Accuracy: 1.0
Best Num Trees: 10, Best Max Depth: 5


### 3.3.2 With full features

In [59]:
# Define the target column
target_column = 'Delay'

# Define feature columns: every column of the full dataset except the target.
# Selecting by name (rather than taking columns[:-1]) keeps the label out of
# the feature vector regardless of where 'Delay' sits in the CSV.
feature_columns = [column for column in sampled_df.columns if column != target_column]

# Assemble features into a single vector column
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")

# Define a simple (single-stage) pipeline
pipeline = Pipeline(stages=[assembler])

# Fit the pipeline to the data
pipelineModel = pipeline.fit(sampled_df)

# Transform the data with the pipeline
sampled_df_transformed = pipelineModel.transform(sampled_df)

# Select features and target column for modeling
sampled_df_model = sampled_df_transformed.select("features", target_column)

# Split data into training and test sets.
# A fixed seed makes the split repeatable, so results with the full feature
# set can be compared against other runs on the same rows.
train_df, test_df = sampled_df_model.randomSplit([train_split, test_split], seed=42)

In [60]:
# Same random-forest search space as before, now on the full feature set:
# 3 forest sizes x 2 tree depths
grid_builder = ParamGridBuilder()
grid_builder = grid_builder.addGrid(rf.numTrees, [10, 20, 30])
grid_builder = grid_builder.addGrid(rf.maxDepth, [5, 10])
param_grid = grid_builder.build()

# Accuracy is the selection metric; folds come from 5-fold cross-validation
evaluator = MulticlassClassificationEvaluator(
    labelCol=target_column,
    predictionCol="prediction",
    metricName="accuracy",
)
cross_val = CrossValidator(
    estimator=rf,
    estimatorParamMaps=param_grid,
    evaluator=evaluator,
    numFolds=5,
)

# Search the grid on the training split
cv_model = cross_val.fit(train_df)

# Score the test split with the winning model
best_model = cv_model.bestModel
predictions = best_model.transform(test_df)

In [61]:
# Test-set accuracy of the cross-validated random forest (full feature set)
accuracy = evaluator.evaluate(predictions)
print(f"Best Model Accuracy: {accuracy}")

# Hyperparameters picked by cross-validation, read from the underlying JVM model
best_numTrees, best_maxDepth = (
    best_model._java_obj.getNumTrees(),
    best_model._java_obj.getMaxDepth(),
)
print(f"Best Num Trees: {best_numTrees}, Best Max Depth: {best_maxDepth}")

Best Model Accuracy: 0.9940029985007496
Best Num Trees: 20, Best Max Depth: 10


# 4. Gradient Boosted Decision Trees

In [62]:
# Gradient-boosted trees classifier with library-default hyperparameters
gbt = GBTClassifier(featuresCol="features", labelCol=target_column)

# Fit it on the training split
gbt_model = gbt.fit(train_df)

In [63]:
# Evaluators: accuracy on hard predictions, AUC on the raw scores
evaluator_acc = MulticlassClassificationEvaluator(
    labelCol=target_column,
    predictionCol="prediction",
    metricName="accuracy",
)
evaluator_auc = BinaryClassificationEvaluator(
    labelCol=target_column,
    rawPredictionCol="rawPrediction",
    metricName="areaUnderROC",
)

# Accuracy on the training split
train_predictions = gbt_model.transform(train_df)
train_accuracy = evaluator_acc.evaluate(train_predictions)
print(f"Training Accuracy: {train_accuracy}")

# Accuracy on the test split
test_predictions = gbt_model.transform(test_df)
test_accuracy = evaluator_acc.evaluate(test_predictions)
print(f"Test Accuracy: {test_accuracy}")

# Area under the ROC curve on the test split
auc = evaluator_auc.evaluate(test_predictions)
print(f"Area Under ROC: {auc}")

Training Accuracy: 1.0
Test Accuracy: 1.0
Area Under ROC: 1.0


## 4.1 Hyperparameter Tuning

In [65]:
# Candidate GBT settings: 2 tree depths x 2 boosting-iteration counts
param_grid = (
    ParamGridBuilder()
    .addGrid(gbt.maxDepth, [5, 10])
    .addGrid(gbt.maxIter, [10, 20])
    .build()
)

# Rank the candidates by accuracy using 5-fold cross-validation
evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    predictionCol="prediction",
    labelCol=target_column,
)
cross_val = CrossValidator(
    numFolds=5,
    estimator=gbt,
    estimatorParamMaps=param_grid,
    evaluator=evaluator,
)

# Search the grid on the training split
cv_model = cross_val.fit(train_df)

# Score the test split with the winning model
best_model = cv_model.bestModel
predictions = best_model.transform(test_df)

In [66]:
# Test-set accuracy of the cross-validated GBT model
accuracy = evaluator.evaluate(predictions)
print(f"Best Model Accuracy: {accuracy}")

# Hyperparameters picked by cross-validation, read from the underlying JVM model
gbt_java_model = best_model._java_obj
best_maxDepth = gbt_java_model.getMaxDepth()
best_maxIter = gbt_java_model.getMaxIter()
print(f"Best Max Depth: {best_maxDepth}, Best Max Iter: {best_maxIter}")

Best Model Accuracy: 1.0
Best Max Depth: 5, Best Max Iter: 10


## 4.2 Feature importances

In [70]:
# Importance score of each input feature, as estimated by the GBT model
feature_importances = gbt_model.featureImportances

# List every feature next to its score (names and scores are matched by position)
print("Feature Importances:")
for feature, importance in zip(feature_columns, feature_importances):
    print(f"{feature}: {importance}")

# Keep the features whose score reaches the threshold
threshold = 0.001
important_features = []
for feature, importance in zip(feature_columns, feature_importances):
    if importance >= threshold:
        important_features.append(feature)
print("Important Features:", important_features)

Feature Importances:
departure_airport_avg_altitude: 0.0
departure_airport_avg_bearing: 3.3635742105411793e-18
departure_airport_avg_knots: 0.0
departure_airport_avg_temperature: 0.0
departure_airport_min_altitude: 0.0
departure_airport_min_bearing: 3.463430319916567e-17
departure_airport_min_knots: 0.0
departure_airport_min_temperature: 1.0695114872580156e-15
departure_airport_max_altitude: 0.0
departure_airport_max_bearing: 0.0
departure_airport_max_knots: 0.0
departure_airport_max_temperature: 8.072578105298831e-17
arrival_airport_avg_altitude: 0.0
arrival_airport_avg_bearing: 0.0
arrival_airport_avg_knots: 2.6406159786629843e-15
arrival_airport_avg_temperature: 0.0
arrival_airport_min_altitude: 0.0
arrival_airport_min_bearing: 3.602046366483853e-16
arrival_airport_min_knots: 0.0
arrival_airport_min_temperature: 7.995215898456383e-15
arrival_airport_max_altitude: 0.0
arrival_airport_max_bearing: 0.0
arrival_airport_max_knots: 5.789552109894005e-16
arrival_airport_max_temperature: 3.

The feature importances show that, for the Gradient Boosted Decision Trees model, no feature is more important than the label itself, which suggests that the label column may be included among the input features.

# 5 Multilayer perceptron (MLP) 

In [30]:
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator


# Network topology. The input layer must match the length of the assembled
# feature vector, so it is read from the data instead of being hard-coded;
# two hidden layers (64 and 32 units) feed a 2-unit output layer, one unit
# per class of the binary target.
input_size = len(train_df.select("features").first()[0])
layers = [input_size, 64, 32, 2]

# Fixed seed so the weight initialisation is repeatable
mlp = MultilayerPerceptronClassifier(maxIter=100, layers=layers, blockSize=128, seed=1234)



In [32]:
# The MLP was created without a labelCol, so it looks for a column named
# "label"; rename the target in both splits to match.
train_df, test_df = (
    train_df.withColumnRenamed("Delay", "label"),
    test_df.withColumnRenamed("Delay", "label"),
)

# Fit the network on the training split
model = mlp.fit(train_df)


In [34]:
# Score the test split with the fitted network
result = model.transform(test_df)

# Only the predicted and the true class are needed for accuracy
predictionAndLabels = result.select("prediction", "label")

# Accuracy evaluator using its default column names
evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
accuracy = evaluator.evaluate(predictionAndLabels)

print("Test set accuracy = " + str(accuracy))

Test set accuracy = 0.9554140127388535


In [40]:
# Area under the ROC curve (AUC) of the MLP on the test split,
# computed from the "rawPrediction" column of the scored data
evaluator_auc = BinaryClassificationEvaluator(
    metricName="areaUnderROC",
    rawPredictionCol="rawPrediction",
    labelCol='label',
)
auc = evaluator_auc.evaluate(result)
print(f"Area Under ROC: {auc}")

Area Under ROC: 0.5


## 5.1 Hyperparameter Tuning

In [68]:
# Length of the assembled feature vector, taken from the first training row
first_training_row = train_df.select("features").first()
num_features = len(first_training_row[0])
print(f"Number of features: {num_features}")


Number of features: 114


In [70]:
# Candidate MLP settings: 2 iteration limits x 2 block sizes x 2 topologies.
# The input layer uses `num_features` (measured from the training data in the
# preceding cell) so the topologies stay valid if the feature set changes.
paramGrid = ParamGridBuilder() \
    .addGrid(mlp.maxIter, [100, 200]) \
    .addGrid(mlp.blockSize, [128, 256]) \
    .addGrid(mlp.layers, [[num_features, 5, 4, 2], [num_features, 6, 3, 2]]) \
    .build()

# Rank the candidates by accuracy (default "label"/"prediction" columns)
evaluator = MulticlassClassificationEvaluator(metricName="accuracy")

# 5-fold cross-validation over the grid
crossval = CrossValidator(estimator=mlp,
                          estimatorParamMaps=paramGrid,
                          evaluator=evaluator,
                          numFolds=5)
cvModel = crossval.fit(train_df)

# Winning model and the parameters it exposes on the Python side
bestModel = cvModel.bestModel
print("Best model parameters:")

params = bestModel.extractParamMap()
for param, value in params.items():
    print(f"{param.name}: {value}")



Best model parameters:
featuresCol: features
labelCol: label
predictionCol: prediction
probabilityCol: probability
rawPredictionCol: rawPrediction


In [72]:
# Pair every candidate parameter map with its mean cross-validation metric
params_and_metrics = list(zip(cvModel.getEstimatorParamMaps(), cvModel.avgMetrics))

# Start from the first candidate and keep whichever later candidate has a
# strictly higher metric (ties therefore resolve to the earliest candidate)
best_params, best_metric = params_and_metrics[0]
for params, metric in params_and_metrics[1:]:
    if metric > best_metric:
        best_params, best_metric = params, metric

print("Best model parameters and metric:")
print(best_params)
print(f"Best metric (accuracy): {best_metric}")


Best model parameters and metric:
{Param(parent='MultilayerPerceptronClassifier_05f02a343e92', name='maxIter', doc='max number of iterations (>= 0).'): 100, Param(parent='MultilayerPerceptronClassifier_05f02a343e92', name='blockSize', doc='Block size for stacking input data in matrices. Data is stacked within partitions. If block size is more than remaining data in a partition then it is adjusted to the size of this data. Recommended size is between 10 and 1000, default is 128.'): 128, Param(parent='MultilayerPerceptronClassifier_05f02a343e92', name='layers', doc='Sizes of layers from input layer to output layer E.g., Array(780, 100, 10) means 780 inputs, one hidden layer with 100 neurons and output layer of 10 neurons.'): [114, 5, 4, 2]}
Best metric (accuracy): 0.9609760183726678


# 6 Linear Support Vector Machine

In [48]:
from pyspark.ml.classification import LinearSVC
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.feature import VectorAssembler

In [49]:
# Linear SVM: at most 10 optimizer iterations, regularisation strength 0.1
lsvc = LinearSVC(regParam=0.1, maxIter=10)

# Fit on the training split (default "features" / "label" columns)
lsvcModel = lsvc.fit(train_df)

In [50]:
# Score the test split with the fitted linear SVM
predictions = lsvcModel.transform(test_df)

# `evaluator` stays a BinaryClassificationEvaluator because the AUC cell
# further down reuses it.
evaluator = BinaryClassificationEvaluator()

# BinaryClassificationEvaluator reports area under ROC by default, not
# accuracy, so the accuracy printed below is computed with a
# MulticlassClassificationEvaluator configured for the "accuracy" metric.
accuracy = MulticlassClassificationEvaluator(metricName="accuracy").evaluate(predictions)

print("Test set accuracy = " + str(accuracy))


Test set accuracy = 1.0


In [53]:
# Evaluate the model on the training set by accuracy
train_predictions = lsvcModel.transform(train_df)

# Use an accuracy evaluator: a BinaryClassificationEvaluator with default
# settings would report area under ROC, which is not what is printed below.
evaluator_acc = MulticlassClassificationEvaluator(metricName="accuracy")
accuracy = evaluator_acc.evaluate(train_predictions)
print(f"Training Accuracy: {accuracy}")

Training Accuracy: 1.0


In [51]:
# Area under the ROC curve (AUC) of the linear SVM on the test split;
# the metric is requested explicitly through a per-call parameter override.
auc = evaluator.evaluate(predictions, params={evaluator.metricName: "areaUnderROC"})
print(f"Area Under ROC: {auc}")

Area Under ROC: 1.0


## 6.1 Hyperparameter Tuning

In [62]:
# Candidate linear-SVM settings: 3 iteration limits x 3 regularisation strengths
paramGrid = (
    ParamGridBuilder()
    .addGrid(lsvc.maxIter, [10, 100, 1000])
    .addGrid(lsvc.regParam, [0.01, 0.1, 0.5])
    .build()
)


In [None]:
# Rank the candidates by area under the ROC curve
evaluator = BinaryClassificationEvaluator(metricName="areaUnderROC")

# 3-fold cross-validation over the linear-SVM grid
crossval = CrossValidator(
    numFolds=3,
    estimator=lsvc,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
)
cvModel = crossval.fit(train_df)

# Score the test split with the winning model
bestModel = cvModel.bestModel
predictions = bestModel.transform(test_df)

# Report its AUC on the test split
test_auc = evaluator.evaluate(predictions)
print(f"Test set AUC after hyperparameter tuning = {test_auc}")


Test set AUC after hyperparameter tuning = 1.0


## 'LinearSVCModel' has no 'featureImportances'

# 7 OneVsRest

In [54]:
from pyspark.ml.classification import OneVsRest, LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [55]:
# Base binary learner for the one-vs-rest reduction
classifier = LogisticRegression(fitIntercept=True, tol=1E-6, maxIter=10)

# Wrap it in OneVsRest and fit on the training split
ovr = OneVsRest(classifier=classifier)
ovrModel = ovr.fit(train_df)


In [56]:
# Accuracy of the one-vs-rest model on the test split
evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
predictions = ovrModel.transform(test_df)
accuracy = evaluator.evaluate(predictions)

print(f"Test set accuracy = {accuracy}")


Test set accuracy = 1.0


In [57]:
# Accuracy of the one-vs-rest model on the training split
train_evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
train_predictions = ovrModel.transform(train_df)
train_accuracy = train_evaluator.evaluate(train_predictions)

print(f"Training set accuracy = {train_accuracy}")


Training set accuracy = 1.0


In [61]:
# Area under the ROC curve on the test split.
# NOTE(review): the hard class labels in "prediction" are passed to the
# evaluator as if they were scores, so the ROC curve has a single operating
# point. Confirm whether a rawPrediction column is available from OneVsRest
# in the Spark version in use and prefer it if so.
auc_evaluator = BinaryClassificationEvaluator(
    metricName="areaUnderROC",
    rawPredictionCol="prediction",
)
test_auc = auc_evaluator.evaluate(predictions)

print(f"Test set AUC = {test_auc}")

Test set AUC = 1.0


##  7.1 Hyperparameter Tuning

In [66]:
# Candidate settings for the base logistic regression inside OneVsRest:
# 2 iteration limits x 2 regularisation strengths
paramGrid = (
    ParamGridBuilder()
    .addGrid(classifier.maxIter, [10, 50])
    .addGrid(classifier.regParam, [0.01, 0.1])
    .build()
)


In [67]:
# Rank the candidates by accuracy
evaluator = MulticlassClassificationEvaluator(metricName="accuracy")

# 3-fold cross-validation over the one-vs-rest grid
crossval = CrossValidator(
    numFolds=3,
    estimator=ovr,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
)
cvModel = crossval.fit(train_df)

# Score the test split with the winning model
bestModel = cvModel.bestModel
predictions = bestModel.transform(test_df)

# Report its accuracy on the test split
test_accuracy = evaluator.evaluate(predictions)
print(f"Test set accuracy after hyperparameter tuning = {test_accuracy}")


Test set accuracy after hyperparameter tuning = 1.0
