In [0]:
%pyspark

# Import necessary libraries
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, regexp_extract
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import RegressionEvaluator

# Obtain (or reuse) the Spark session. Hadoop's default filesystem is pointed
# at the namenode and the catalog is kept in memory.
spark = (
    SparkSession.builder
    .appName("NBA Analysis with Spark")
    .config("spark.hadoop.fs.defaultFS", "hdfs://namenode:9000")
    .config("spark.sql.catalogImplementation", "in-memory")
    .getOrCreate()
)

# Location of the cleaned CSV data on HDFS
hdfs_path = "hdfs://namenode:9000/user/test/output/cleanedData"

# Column names applied positionally to the headerless CSV (27 columns:
# event/game fields, three player groups, then score fields).
column_names = [
    # event / game
    "event_id", "event_num", "game_id", "home_description", "time", "period",
    # player 1
    "player1_id", "player1_name", "player1_team_abbr",
    "player1_team_city", "player1_team_id", "player1_team_name",
    # player 2
    "player2_id", "player2_name", "player2_team_abbr",
    "player2_team_city", "player2_team_id", "player2_team_name",
    # player 3
    "player3_id", "player3_name", "player3_team_abbr",
    "player3_team_city", "player3_team_id", "player3_team_name",
    # score
    "score", "score_margin", "visitor_description",
]

# Read the CSV with no header row and no schema inference, then name the columns.
df = spark.read.csv(hdfs_path, header=False, inferSchema=False).toDF(*column_names)

In [1]:
%pyspark
# Keep only rows whose 'score' column actually contains a score in 'X - Y' form.
# Rows with a null or non-matching score would otherwise produce null
# home_score / visitor_score (and a null label), and VectorAssembler's default
# handleInvalid="error" makes pipeline.fit() fail on the first such row.
# A null score evaluates to null in rlike() and is therefore filtered out too.
df = df.filter(col("score").rlike(r"\d+\s*-\s*\d+"))

# Extract home and visitor scores from 'score' column in 'X - Y' format
df = df.withColumn("home_score", regexp_extract(col("score"), r"(\d+)\s*-\s*\d+", 1).cast("int")) \
       .withColumn("visitor_score", regexp_extract(col("score"), r"\d+\s*-\s*(\d+)", 1).cast("int"))

# Regression label: the sum of both parsed scores.
# NOTE(review): despite its name this is the combined score of the two teams,
# not an individual player's points -- confirm this is the intended target.
df = df.withColumn("player_points", col("home_score") + col("visitor_score"))

# Feature Engineering: recompute the margin from the parsed scores
# (this replaces the string 'score_margin' column loaded from the file).
df = df.withColumn("score_margin", col("home_score") - col("visitor_score"))

# Assemble the features into a feature vector
# NOTE(review): the label is exactly home_score + visitor_score, both of which
# are features, and score_margin is a linear combination of the other two.
# A near-zero RMSE is therefore expected and does not indicate predictive skill.
assembler = VectorAssembler(
    inputCols=["home_score", "visitor_score", "score_margin"],  # Add any additional features here
    outputCol="features"
)

# Train-Test Split (80/20, fixed seed)
train_data, test_data = df.randomSplit([0.8, 0.2], seed=1234)

# Initialize a Linear Regression model on the assembled features
lr = LinearRegression(featuresCol="features", labelCol="player_points")

# Create a pipeline with feature assembler and regression model
pipeline = Pipeline(stages=[assembler, lr])

# Fit the model on the training data
model = pipeline.fit(train_data)

In [2]:
%pyspark
# Score the held-out split with the fitted pipeline
predictions = model.transform(test_data)

# Compute RMSE between the 'player_points' label and the 'prediction' column
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="player_points",
    predictionCol="prediction",
)
rmse = evaluator.evaluate(predictions)

print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")

In [3]:
%pyspark
# Preview 20 rows: player name next to the predicted and actual label values,
# without truncating long cell contents.
(
    predictions
    .select("player1_name", "prediction", "player_points")
    .show(n=20, truncate=False)
)