In [None]:
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.tree import DecisionTree, DecisionTreeModel

# Start (or reuse) a SparkSession and keep a handle on its SparkContext,
# which is what the RDD calls below (sc.textFile, model save/load) go through.
spark = (
    SparkSession.builder
    .appName("RDDDecisionTreeRegression")
    .getOrCreate()
)
sc = spark.sparkContext

# Read both CSV files as RDDs of raw text lines (header row still included).
train_rdd_raw = sc.textFile("/kaggle/input/vuiver/train.csv")
test_rdd_raw = sc.textFile("/kaggle/input/vuiver/test.csv")

# Zero-based column positions within a comma-split line of train.csv.
label_index = 10  # trip_duration
feature_indices = [
    1,  # vendor_id
    4,  # passenger_count
    5,  # pickup_longitude
    6,  # pickup_latitude
    7,  # dropoff_longitude
    8,  # dropoff_latitude
]

# Function to parse train.csv into LabeledPoint
def parse_train_line(line):
    """Convert one comma-separated train.csv line into a LabeledPoint.

    The columns listed in the module-level ``feature_indices`` become the
    feature vector and the column at ``label_index`` becomes the label; all
    of them are parsed with ``float``.

    Returns:
        A ``LabeledPoint`` on success, or ``None`` (after printing a message)
        when a required column is missing or is not a valid float.
    """
    columns = line.split(',')
    try:
        # Features are converted first, in index order, then the label, so
        # the first bad column encountered determines the reported error.
        feature_values = []
        for position in feature_indices:
            feature_values.append(float(columns[position]))
        target = float(columns[label_index])
        return LabeledPoint(target, feature_values)
    except (ValueError, IndexError) as err:
        print(f"Skipping invalid train row: {line} due to {err}")
        return None

# Function to parse test.csv into features only (no label)
def parse_test_line(line):
    """Convert one comma-separated test.csv line into a list of float features.

    The columns at positions 1, 3, 4, 5, 6 and 7 are parsed with ``float``
    (vendor_id, passenger_count, pickup_longitude, pickup_latitude,
    dropoff_longitude, dropoff_latitude). No label is produced because
    test.csv rows carry none.

    Returns:
        The list of six floats, or ``None`` (after printing a message) when
        a required column is missing or is not a valid float.
    """
    columns = line.split(',')
    wanted_positions = (1, 3, 4, 5, 6, 7)
    parsed = []
    try:
        # Convert in index order so the first bad column decides the error.
        for position in wanted_positions:
            parsed.append(float(columns[position]))
    except (ValueError, IndexError) as err:
        print(f"Skipping invalid test row: {line} due to {err}")
        return None
    return parsed

# Drop each file's header row, parse the remaining lines, and discard the rows
# the parsers rejected (they return None for malformed input).
header_train = train_rdd_raw.first()
header_test = test_rdd_raw.first()
train_rdd = (
    train_rdd_raw.filter(lambda line: line != header_train)
    .map(parse_train_line)
    .filter(lambda x: x is not None)
)
test_rdd = (
    test_rdd_raw.filter(lambda line: line != header_test)
    .map(parse_test_line)
    .filter(lambda x: x is not None)
)

# Split train_rdd into training and validation sets (since test.csv has no labels).
# NOTE: no seed is passed, so the split -- and therefore the metrics printed
# below -- will differ from run to run.
(trainingData, validationData) = train_rdd.randomSplit([0.7, 0.3])

# The validation split is traversed several times below (features for
# prediction, labels for the zip, and the count); cache it so train.csv is not
# re-read and re-parsed for each of those passes.
validationData.cache()

# Train a DecisionTree regression model
model = DecisionTree.trainRegressor(
    trainingData,
    categoricalFeaturesInfo={},  # Assume all features are continuous
    impurity='variance',
    maxDepth=20,
    maxBins=32
)

# Evaluate model on the validation set (from the train.csv split).
# The variables keep their original "test" names, but the values are
# validation metrics, as the printed labels say.
predictions = model.predict(validationData.map(lambda x: x.features))
labelsAndPredictions = validationData.map(lambda lp: lp.label).zip(predictions)
testMSE = labelsAndPredictions.map(lambda lp: (lp[0] - lp[1]) * (lp[0] - lp[1])).sum() / float(validationData.count())
testRMSE = testMSE ** 0.5
print('Validation Mean Squared Error = ' + str(testMSE))
print('Validation Root Mean Squared Error = ' + str(testRMSE))

# Predict on test.csv (no labels, just predictions).
# take(5) is called exactly once: every call launches a separate Spark job, and
# the previous extra call threw its result away.
test_predictions = model.predict(test_rdd)
print("First 5 predictions for test.csv:")
for pred in test_predictions.take(5):
    print(pred)

# Print the learned model
print('Learned regression tree model:')
print(model.toDebugString())

# Save the model and load it back.
# NOTE(review): per the MLlib docs, save() raises if the target directory
# already exists, so re-running this cell needs the path removed first --
# confirm for the Spark version in use.
model.save(sc, "target/tmp/myDecisionTreeRegressionModel")
sameModel = DecisionTreeModel.load(sc, "target/tmp/myDecisionTreeRegressionModel")

# Stop Spark session
spark.stop()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/04/10 17:23:05 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/04/10 17:23:40 WARN DAGScheduler: Broadcasting large task binary with size 1475.1 KiB            
25/04/10 17:23:42 WARN DAGScheduler: Broadcasting large task binary with size 2.3 MiB               
25/04/10 17:23:44 WARN DAGScheduler: Broadcasting large task binary with size 3.7 MiB               
25/04/10 17:23:46 WARN DAGScheduler: Broadcasting large task binary with size 5.6 MiB               
25/04/10 17:23:48 WARN DAGScheduler: Broadcasting large task binary with size 1108.3 KiB(5 + 1) / 6]
25/04/10 17:23:50 WARN DAGScheduler: Broadcasting large task binary with size 8.2 MiB               
25/04/10 17:23:53 WARN DAGScheduler: Broadcasting large task binary with size 1472.7 KiB(5 + 1) / 6]
25/04/10 17:23:55 WA

Validation Mean Squared Error = 26930410.005425915
Validation Root Mean Squared Error = 5189.451802013958


25/04/10 17:24:39 WARN DAGScheduler: Broadcasting large task binary with size 18.3 MiB
                                                                                                    

First 5 predictions for test.csv:


25/04/10 17:24:44 WARN DAGScheduler: Broadcasting large task binary with size 18.3 MiB
                                                                                                    

831.7209302325581
656.0
565.1875
1120.7688524590164
388.34133653461384
Learned regression tree model:
