In [26]:
# Setup Environment

import os
import logging
import xgboost
from pyspark.sql.functions import (col, count, row_number, abs, unix_timestamp, mean, 
                                   when, lit, min as spark_min, max as spark_max , 
                                   row_number, mean, countDistinct, last, first)
import configparser
from pyspark.sql import SparkSession
from src.data_preprocessing.data_prep1.sql_queries import sql_queries
from src.data_preprocessing.data_prep1.data_loader import load_data_from_postgresql, reload_parquet_files
from pyspark.sql.window import Window
from pyspark.sql import DataFrame, Window
from src.data_preprocessing.data_prep1.data_utils import (save_parquet, gather_statistics, 
                initialize_environment, load_config, initialize_logging, initialize_spark, 
                drop_duplicates_with_tolerance, identify_and_impute_outliers, 
                identify_and_remove_outliers, identify_missing_and_outliers)
# Set global references to None
spark = None
master_results_df = None
sectional_results = None
results = None

In [27]:
spark, jdbc_url, jdbc_properties, queries, parquet_dir, log_file = initialize_environment()

2024-12-16 14:02:47,852 - INFO - Environment setup initialized.
2024-12-16 14:02:47,856 - INFO - Spark session created successfully.


In [28]:
results = spark.read.parquet(os.path.join(parquet_dir, "results.parquet"))

In [29]:
results.printSchema()

root
 |-- course_cd: string (nullable = true)
 |-- race_date: date (nullable = true)
 |-- race_number: integer (nullable = true)
 |-- saddle_cloth_number: string (nullable = true)
 |-- official_fin: integer (nullable = true)
 |-- purse: integer (nullable = true)
 |-- wps_pool: decimal(10,2) (nullable = true)
 |-- weight: decimal(10,2) (nullable = true)
 |-- date_of_birth: date (nullable = true)
 |-- sex: string (nullable = true)
 |-- start_position: long (nullable = true)
 |-- equip: string (nullable = true)
 |-- claimprice: double (nullable = true)
 |-- surface: string (nullable = true)
 |-- surface_type_description: string (nullable = true)
 |-- trk_cond: string (nullable = true)
 |-- trk_cond_desc: string (nullable = true)
 |-- weather: string (nullable = true)
 |-- distance: decimal(10,2) (nullable = true)
 |-- dist_unit: string (nullable = true)
 |-- power: decimal(10,2) (nullable = true)
 |-- med: string (nullable = true)
 |-- morn_odds: decimal(10,2) (nullable = true)
 |-- avgsp

In [30]:
# results = results.drop("course_cd")

# Data Analysis

Before beginning here, go to /src/data_preprocessing/data_prep2/main_data_prep2.py and run data_check and cardinality reports. 

### Convert decimal columns to float

In [64]:
#distinct_distances = results.select("distance", "dist_unit").distinct()
#distinct_distances.show(1000)

In [66]:
results = results.withColumn("distance", col("distance").cast("double"))
results = results.withColumn("wps_pool", col("wps_pool").cast("double"))
results = results.withColumn("weight", col("weight").cast("double"))
results = results.withColumn("power", col("power").cast("double"))
results = results.withColumn("morn_odds", col("morn_odds").cast("double"))
results = results.withColumn("all_earnings", col("all_earnings").cast("double"))
results = results.withColumn("cond_earnings", col("cond_earnings").cast("double"))


### date_of_birth: Impute with Global median

In [31]:
from pyspark.sql.functions import col, lit, count, expr
from pyspark.sql.window import Window

# Convert date_of_birth to a numeric timestamp for median calculation
results = results.withColumn("date_of_birth_ts", col("date_of_birth").cast("timestamp").cast("long"))

# Calculate the median of date_of_birth
median_window = Window.orderBy("date_of_birth_ts")
row_count = results.filter(col("date_of_birth_ts").isNotNull()).count()

if row_count % 2 == 0:  # Even number of rows
    median_row_1 = row_count // 2
    median_row_2 = median_row_1 + 1
    median_ts = results.filter(col("date_of_birth_ts").isNotNull()) \
        .select("date_of_birth_ts") \
        .withColumn("row_num", expr("row_number() over (ORDER BY date_of_birth_ts)")) \
        .filter((col("row_num") == median_row_1) | (col("row_num") == median_row_2)) \
        .groupBy().agg(expr("avg(date_of_birth_ts)").alias("median_ts")) \
        .collect()[0]["median_ts"]
else:  # Odd number of rows
    median_row = (row_count + 1) // 2
    median_ts = results.filter(col("date_of_birth_ts").isNotNull()) \
        .select("date_of_birth_ts") \
        .withColumn("row_num", expr("row_number() over (ORDER BY date_of_birth_ts)")) \
        .filter(col("row_num") == median_row) \
        .collect()[0]["date_of_birth_ts"]

# Convert median timestamp back to date
median_date = lit(expr(f"CAST(FROM_UNIXTIME({median_ts}) AS DATE)"))

# Fill missing values with the global median date
results = results.withColumn(
    "date_of_birth",
    when(col("date_of_birth").isNull(), median_date).otherwise(col("date_of_birth"))
).drop("date_of_birth_ts")

In [32]:
# Show the updated DataFrame
results.filter(col("date_of_birth").isNull()).count()

0

## Convert DOB to AGE_AT_RACE_DAY

In [33]:
from pyspark.sql.functions import col, datediff, expr

# Ensure both date_of_birth and race_date are in date format
results = results.withColumn("date_of_birth", col("date_of_birth").cast("date"))
results = results.withColumn("race_date", col("race_date").cast("date"))

# Calculate age in days, then convert to years
results = results.withColumn(
    "age_at_race_day",
    datediff(col("race_date"), col("date_of_birth")) / 365.25  # Convert days to years
)

# Show the updated DataFrame
results.select("date_of_birth", "race_date", "age_at_race_day").show(5)

+-------------+----------+------------------+
|date_of_birth| race_date|   age_at_race_day|
+-------------+----------+------------------+
|   2019-05-03|2022-04-23| 2.973305954825462|
|   2019-05-06|2022-04-23| 2.965092402464066|
|   2019-05-06|2022-04-23| 2.965092402464066|
|   2019-04-27|2022-04-23|2.9897330595482545|
|   2019-04-11|2022-04-23| 3.033538672142368|
+-------------+----------+------------------+
only showing top 5 rows



### Encoding Weather

In [34]:
# Remember to pay attention to case
results = results.fillna({"weather": "Clear"})
results.filter(col("weather").isNull()).count()

0

In [35]:
results.select("weather").distinct().count()
# Count the occurrences of each distinct value in the "weather" column
distinct_value_counts = results.groupBy("weather").count()

# Show the result
distinct_value_counts.show()

+-------+------+
|weather| count|
+-------+------+
|  Foggy|  1318|
| Cloudy|288456|
|Showery| 24189|
|  Clear|428636|
|  Rainy| 17382|
|Snowing|  1768|
|   Hazy|  9083|
+-------+------+



### wps_pool: Imputing with mean

In [36]:
from pyspark.sql.functions import col, mean, when

# Calculate the mean of the 'wps_pool' column, excluding nulls
mean_value = results.select(mean(col("wps_pool")).alias("mean_wps_pool")).collect()[0]["mean_wps_pool"]

# Replace null values in 'wps_pool' with the calculated mean
results = results.withColumn(
    "wps_pool",
    when(col("wps_pool").isNull(), mean_value).otherwise(col("wps_pool"))
)

# Show the updated DataFrame
results.filter(col("wps_pool").isNull()).count()

0

### equip: Conversion and Imputation

In [37]:
results = results.fillna({"equip": "No_Equip"})
results.filter(col("equip").isNull()).count()

0

### trk_cond: Impute

In [38]:
cols = ["trk_cond", "trk_cond_desc"] 
results.select(cols).distinct().count()

# Count the occurrences of each distinct value in the "weather" column
distinct_value_counts = results.groupBy(cols).count()

# Show the result
distinct_value_counts.show()


+--------+-------------+------+
|trk_cond|trk_cond_desc| count|
+--------+-------------+------+
|      SL|         slow|   120|
|      SY|       sloppy| 36615|
|    null|         null|    76|
|      FM|         firm|114254|
|      SF|         soft|   823|
|      YL|     yielding|  1382|
|      GD|         good| 44761|
|      FT|         fast|542039|
|      FZ|       frozen|    77|
|      MY|        muddy| 26580|
|      WF|     wet fast|  4008|
|      HY|        heavy|    97|
+--------+-------------+------+



In [39]:
cols = ["trk_cond", "trk_cond_desc"]

# Fill missing values with "MISSING" for the specified columns
results = results.fillna({col: "MISSING" for col in cols})

# Verify no null values remain
missing_count = results.filter(
    (col("trk_cond").isNull()) | (col("trk_cond_desc").isNull())
).count()

print(f"Number of missing values: {missing_count}")

Number of missing values: 0


In [42]:
results = results.drop("trk_cond_desc", "saddle_cloth_number")

### Encoding Sex

Horse Sex 	Code	Description
	C	Colt
	F	Filly
	G	Gelding
	H	Horse
	M	Mare
	R	Ridgling
	B	Spayed Mare

In [43]:
# "equip", "surface", "surface_type_description", "trk_cond", "trk_cond_desc", "weather", "dist_unit", "race_type"]

cols = ["sex"] 

distinct_values = results.select(*cols).distinct()
distinct_values.show(50)


+---+
|sex|
+---+
|  F|
|  B|
|  M|
|  C|
|  R|
|  G|
|  H|
+---+



In [44]:
from pyspark.sql.functions import trim, col
results = results.withColumn("dist_unit", trim(col("dist_unit")))

In [45]:
results = results.replace("", "MISSING", subset=["med", "turf_mud_mark"])

In [46]:
results.filter(col("sex").isNull()).count()

0

In [56]:
results = results.drop("surface_type_description", "avgcls")
results = results.drop("trk_cond_desc")

In [48]:
results = results.drop("horse_name")

In [49]:
# results = results.drop("saddle_cloth_number")

In [50]:
cols = ["race_type"] 

distinct_values = results.select(*cols).distinct()
distinct_values.show(50)


+--------------------+
|           race_type|
+--------------------+
|     Maiden Claiming|
|Waiver Maiden Cla...|
|Allowance Optiona...|
|            Claiming|
|Optional Claiming...|
|      Starter Stakes|
|    Starter Handicap|
|   Optional Claiming|
|Starter Optional ...|
|   Starter Allowance|
|           Allowance|
|              Stakes|
|            Handicap|
|Maiden Special We...|
|Maiden Optional C...|
|     Waiver Claiming|
|       Maiden Stakes|
|               Trial|
|     Claiming Stakes|
+--------------------+



In [25]:
results.printSchema()

root
 |-- course_cd: string (nullable = true)
 |-- race_date: date (nullable = true)
 |-- race_number: integer (nullable = true)
 |-- saddle_cloth_number: string (nullable = true)
 |-- official_fin: integer (nullable = true)
 |-- purse: integer (nullable = true)
 |-- wps_pool: decimal(14,6) (nullable = true)
 |-- weight: decimal(10,2) (nullable = true)
 |-- date_of_birth: date (nullable = true)
 |-- sex: string (nullable = true)
 |-- start_position: long (nullable = true)
 |-- equip: string (nullable = false)
 |-- claimprice: double (nullable = true)
 |-- surface: string (nullable = true)
 |-- trk_cond: string (nullable = false)
 |-- weather: string (nullable = false)
 |-- distance: decimal(10,2) (nullable = true)
 |-- dist_unit: string (nullable = true)
 |-- power: decimal(10,2) (nullable = true)
 |-- med: string (nullable = true)
 |-- morn_odds: decimal(10,2) (nullable = true)
 |-- avgspd: double (nullable = true)
 |-- avgcls: double (nullable = true)
 |-- jock_key: string (nullable 

# OHE and Prep for XGBoost Trainining in Spark

In [54]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler, StandardScaler
from pyspark.ml import Pipeline
import datetime


### Create Label from Official_fin

In [52]:
# Convert official_fin to binary label for top-4 finish
results = results.withColumn("label", when(col("official_fin") <= 4, 1).otherwise(0))

In [55]:
# Took out course_cd to see if it would help identify other predictive features.
# Categorical columns equip, surface, trk_cond, weather, dist_unit, race_type 
categorical_cols = ["equip", "surface", "trk_cond", "weather", "dist_unit", "race_type", "sex" , "med", "stk_clm_md", "turf_mud_mark"]
indexers = [StringIndexer(inputCol=c, outputCol=c+"_index", handleInvalid="keep") for c in categorical_cols]
encoders = [OneHotEncoder(inputCols=[c+"_index"], outputCols=[c+"_ohe"]) for c in categorical_cols]


In [None]:
#for c in categorical_cols:
#    distinct_values = results.select(c).distinct().collect()
#    print(c, [row[c] for row in distinct_values])

In [57]:

results = results.drop("date_of_birth", "official_fin")

In [58]:
results.printSchema()

root
 |-- course_cd: string (nullable = true)
 |-- race_date: date (nullable = true)
 |-- race_number: integer (nullable = true)
 |-- purse: integer (nullable = true)
 |-- wps_pool: decimal(14,6) (nullable = true)
 |-- weight: decimal(10,2) (nullable = true)
 |-- sex: string (nullable = true)
 |-- start_position: long (nullable = true)
 |-- equip: string (nullable = false)
 |-- claimprice: double (nullable = true)
 |-- surface: string (nullable = true)
 |-- trk_cond: string (nullable = false)
 |-- weather: string (nullable = false)
 |-- distance: decimal(10,2) (nullable = true)
 |-- dist_unit: string (nullable = true)
 |-- power: decimal(10,2) (nullable = true)
 |-- med: string (nullable = true)
 |-- morn_odds: decimal(10,2) (nullable = true)
 |-- avgspd: double (nullable = true)
 |-- jock_key: string (nullable = true)
 |-- train_key: string (nullable = true)
 |-- race_type: string (nullable = true)
 |-- class_rating: integer (nullable = true)
 |-- net_sentiment: integer (nullable = tr

In [59]:
jock_indexer = StringIndexer(inputCol="jock_key", outputCol="jock_key_index", handleInvalid="keep")
train_indexer = StringIndexer(inputCol="train_key", outputCol="train_key_index", handleInvalid="keep")

In [73]:
# Numeric columns
# Removing "race_number" 
numeric_cols = ["morn_odds", "age_at_race_day",  "purse", "weight", "start_position", 
                "claimprice", "power", "avgspd", "class_rating", "net_sentiment","weight", 
                "distance", "power", "all_earnings", "cond_earnings", "avg_spd_sd", 
                "ave_cl_sd", "hi_spd_sd", "pstyerl", "all_starts", 
               "all_win", "all_place", "all_show", "all_fourth", "cond_starts", 
                "cond_win", "cond_place", "cond_show", "cond_fourth"]
# Add later to numeric cols after normalization: "jock_key_index", "train_key_index", 

### Spark Pipeline

In [74]:
# Create a pipeline to transform data
preprocessing_stages = [jock_indexer, train_indexer] + indexers + encoders
pipeline = Pipeline(stages=preprocessing_stages)
model = pipeline.fit(results)
df_transformed = model.transform(results)

In [75]:
ohe_cols = [c+"_ohe" for c in categorical_cols]

In [76]:
assembler = VectorAssembler(inputCols=numeric_cols + ohe_cols, outputCol="raw_features")
df_assembled = assembler.transform(df_transformed)


### Normalize Numeric Values

1.	Assemble Numeric Features Into a Vector:
First, use a VectorAssembler to combine all numeric columns into a single feature vector:

In [77]:
from pyspark.ml.feature import VectorAssembler, StandardScaler

# numeric_cols defined as above
numeric_assembler = VectorAssembler(
    inputCols=numeric_cols,
    outputCol="numeric_vector"
)

df_with_numeric_vector = numeric_assembler.transform(df_assembled)  # df_assembled is your DataFrame with numeric_cols


	2.	Apply StandardScaler:
Using StandardScaler with withMean=True and withStd=True ensures zero mean and unit variance scaling.

In [78]:
scaler = StandardScaler(
    inputCol="numeric_vector",
    outputCol="numeric_scaled",
    withMean=True,  # center the data with mean
    withStd=True    # scale to unit variance
)

scaler_model = scaler.fit(df_with_numeric_vector)
df_scaled = scaler_model.transform(df_with_numeric_vector)

                                                                                

	3.	Replace Original Numeric Features with Scaled Vector:
Now df_scaled has a new column numeric_scaled that contains the scaled versions of your numeric features. You can drop the original numeric columns if you no longer need them, or keep them for reference.
When building your final features vector for the model, include numeric_scaled vector instead of individual numeric columns. For example:

In [79]:
# Suppose you have categorical OHE columns in ohe_cols
# Combine numeric_scaled with ohe_cols
final_assembler = VectorAssembler(
    inputCols=["numeric_scaled"] + ohe_cols,
    outputCol="features"
)
df_final = final_assembler.transform(df_scaled)

# Now df_final contains 'features' that has normalized numeric features plus OHE columns.

In [80]:
scaler = StandardScaler(inputCol="raw_features", outputCol="features", withMean=True, withStd=True)
scaler_model = scaler.fit(df_assembled)
df_final = scaler_model.transform(df_assembled)

                                                                                

In [85]:
df_final.printSchema()

root
 |-- race_date: date (nullable = true)
 |-- label: integer (nullable = false)
 |-- equip_ohe: vector (nullable = true)
 |-- surface_ohe: vector (nullable = true)
 |-- trk_cond_ohe: vector (nullable = true)
 |-- weather_ohe: vector (nullable = true)
 |-- dist_unit_ohe: vector (nullable = true)
 |-- race_type_ohe: vector (nullable = true)
 |-- sex_ohe: vector (nullable = true)
 |-- med_ohe: vector (nullable = true)
 |-- stk_clm_md_ohe: vector (nullable = true)
 |-- turf_mud_mark_ohe: vector (nullable = true)
 |-- features: vector (nullable = true)



In [84]:
drop_cols = [
    "wps_pool", "distance", "course_cd", "equip", "surface", "trk_cond", "weather", "dist_unit", "race_type", "sex", "med",
    "stk_clm_md", "turf_mud_mark",
    "course_cd_index", "equip_index", "surface_index", "trk_cond_index", "weather_index",
    "dist_unit_index", "race_type_index", "sex_index", "med_index", "stk_clm_md_index", "turf_mud_mark_index",
    "jock_key", "train_key",
    "date_of_birth", "raw_features",
    # Numeric columns now included in features:
    "age_at_race_day", "race_number", "purse", "weight", "start_position", "claimprice", "power",
    "morn_odds", "avgspd", "jock_key_index", "train_key_index", "class_rating", "net_sentiment",
    "avg_spd_sd", "ave_cl_sd", "hi_spd_sd", "pstyerl", "all_starts",
    "all_win", "all_place", "all_show", "all_fourth", "all_earnings", 
    "cond_starts", "cond_win", "cond_place", 
    "cond_show", "cond_fourth", "cond_earnings"]
df_final = df_final.drop(*drop_cols)

In [86]:
# 2. Save as Parquet
processed_data = save_parquet(spark, df_final, "processed_data", parquet_dir)

2024-12-16 14:57:51,688 - INFO - Saving processed_data DataFrame to Parquet at /home/exx/myCode/horse-racing/FoxRiverAIRacing/data/parquet/processed_data.parquet...
2024-12-16 14:57:51,689 - INFO - Schema of processed_data DataFrame:
2024-12-16 14:58:00,263 - INFO - processed_data DataFrame saved successfully.   


In [87]:
processed_data = spark.read.parquet(os.path.join(parquet_dir, "processed_data.parquet"))

In [88]:
processed_data.printSchema()

root
 |-- race_date: date (nullable = true)
 |-- label: integer (nullable = true)
 |-- equip_ohe: vector (nullable = true)
 |-- surface_ohe: vector (nullable = true)
 |-- trk_cond_ohe: vector (nullable = true)
 |-- weather_ohe: vector (nullable = true)
 |-- dist_unit_ohe: vector (nullable = true)
 |-- race_type_ohe: vector (nullable = true)
 |-- sex_ohe: vector (nullable = true)
 |-- med_ohe: vector (nullable = true)
 |-- stk_clm_md_ohe: vector (nullable = true)
 |-- turf_mud_mark_ohe: vector (nullable = true)
 |-- features: vector (nullable = true)



In [89]:
processed_data.count()

770832

#  Modify Data After original run processed as above

It can definitely be instructive to simplify your feature set and see what happens. If the model’s top features heavily revolve around track identity (course_cd_* OHE features), that suggests the model is relying significantly on track-specific patterns. Removing those features might help you understand how robust the model is when it can’t rely on track-based signals.

Why Try Removing course_cd?
	1.	Reduce Overfitting to Specific Tracks:
If course_cd is a high-impact feature, the model might be “memorizing” track-specific patterns that don’t generalize well. Removing it forces the model to rely more on intrinsic horse-level and race-level features (morning odds, net_sentiment, equip, surface, etc.), potentially giving you a model that’s more stable across different tracks.
	2.	Discovering New Important Features:
With course_cd features removed, the model can no longer lean on those easy signals. You’ll see which other features emerge as top contributors. For example, maybe morn_odds, net_sentiment, or equip features increase in relative importance.
	3.	Improve Interpretability:
Without track identity dominating the importance chart, it might be clearer how much impact your newly added features (like ALL_RACES stats or other cumulative metrics) have on predictions.

Approach to Test This:
	•	Remove course_cd and all derived OHE columns from the feature set. This means dropping course_cd, course_cd_index, and all course_cd_ohe_* columns from your vector assembler.
	•	Rerun the model training and compare:
	•	AUC and Accuracy before and after removing course_cd.
	•	Feature importance rankings in the new run.

If performance drastically drops, it means track-based signals were genuinely valuable. If performance remains stable or only slightly worse—but the model’s top features become more horse-performance oriented—then you’ve gained a more track-agnostic model, which might be beneficial in certain scenarios.

Conclusion:

Yes, a better approach (or at least a valuable experiment) would be to re-run the model without the course_cd features and see what happens. This helps you understand the model’s true dependencies and might lead to a more generalizable and insightful set of features.

In [90]:
# train_df, test_df = processed_data.randomSplit([0.8, 0.2], seed=42)

# cutoff_date = "2024-01-01"
# train_df = processed_data.filter(col("race_date") < cutoff_date)
# test_df = processed_data.filter(col("race_date") >= cutoff_date)

In [93]:
from xgboost.spark import SparkXGBClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.sql.functions import to_date, lit, add_months

# Time-based cutoff dates
dates = ["2023-06-30", "2023-09-30", "2023-12-31", "2024-03-31"]  # cutoff dates

fold_metrics = []

# Positive class is label=1
total = processed_data.count()
positives = processed_data.filter(col("label") == 1).count()
negatives = total - positives

# Compute the ratio
ratio = negatives / positives if positives > 0 else 1.0

print("Total examples:", total)
print("Positives (label=1):", positives)
print("Negatives (label=0):", negatives)
print("scale_pos_weight ratio:", ratio)

# Fixed parameters (from your best params scenario)
xgb_params = {
    "max_depth": 6,
    "eta": 0.01,
    "gamma": 0,
    "subsample": 0.6,
    "colsample_bytree": 0.7,
    "min_child_weight": 5,
    "reg_lambda": 1,
    "reg_alpha": 1,
    "num_boost_round": 500, # Adjust if needed
    "verbosity": 2,
    "eval_metric": "auc",
    "scale_pos_weight": ratio
}

for cutoff_str in dates:
    cutoff_date_expr = to_date(lit(cutoff_str), "yyyy-MM-dd")

    # Training data: race_date <= cutoff
    train_df = processed_data.filter(processed_data.race_date <= cutoff_str)

    # Validation: 3 months after cutoff
    validation_window_end_expr = add_months(cutoff_date_expr, 3)
    validation_df = processed_data.filter(
        (processed_data.race_date > cutoff_str) & 
        (processed_data.race_date <= validation_window_end_expr)
    )

    # If no validation data, assign a neutral metric and continue
    if validation_df.count() == 0:
        fold_metrics.append(0.5)
        continue

    # Create the classifier with the chosen parameters
    xgb_model = SparkXGBClassifier(
        features_col="features",
        label_col="label",
        num_workers=16,
        prediction_col="prediction",
        probability_col="probability",
        raw_prediction_col="rawPrediction",
        max_depth=xgb_params["max_depth"],
        eta=xgb_params["eta"],
        gamma=xgb_params["gamma"],
        subsample=xgb_params["subsample"],
        colsample_bytree=xgb_params["colsample_bytree"],
        min_child_weight=xgb_params["min_child_weight"],
        reg_lambda=xgb_params["reg_lambda"],
        reg_alpha=xgb_params["reg_alpha"],
        num_boost_round=xgb_params["num_boost_round"],
        verbosity=xgb_params["verbosity"],
        eval_metric=xgb_params["eval_metric"],
        scale_pos_weight=xgb_params["scale_pos_weight"]
    )

    # Train the model
    model = xgb.fit(train_df)

    # Predict on validation set
    predictions = model.transform(validation_df)

    # Evaluate AUC
    evaluator = BinaryClassificationEvaluator(
        labelCol="label",
        rawPredictionCol="rawPrediction",
        metricName="areaUnderROC"
    )
    auc = evaluator.evaluate(predictions)
    fold_metrics.append(auc)

# Compute average AUC across all folds
avg_auc = sum(fold_metrics) / len(fold_metrics)
print("Average AUC across all folds:", avg_auc)

Total examples: 770832
Positives (label=1): 419073
Negatives (label=0): 351759
scale_pos_weight ratio: 0.839374047003744


[15:11:17] task 5 got new rank 0                                  (0 + 16) / 16]
[15:11:17] task 3 got new rank 1
[15:11:17] task 14 got new rank 2
[15:11:17] task 11 got new rank 3
[15:11:17] task 8 got new rank 4
[15:11:17] task 0 got new rank 5
[15:11:17] task 4 got new rank 6
[15:11:17] task 7 got new rank 7
[15:11:17] task 12 got new rank 8
[15:11:18] task 2 got new rank 9
[15:11:18] task 15 got new rank 10
[15:11:18] task 9 got new rank 11
[15:11:18] task 6 got new rank 12
[15:11:18] task 13 got new rank 13
[15:11:18] task 1 got new rank 14
[15:11:18] task 10 got new rank 15
[15:11:18] INFO: ../src/gbm/gbtree.cc:140: Tree method is automatically selected to be 'approx' for distributed training.[15:11:18] INFO: ../src/gbm/gbtree.cc:140: Tree method is automatically selected to be 'approx' for distributed training.[15:11:18] INFO: ../src/gbm/gbtree.cc:140: Tree method is automatically selected to be 'approx' for distributed training.[15:11:18] INFO: ../src/gbm/gbtree.cc:140: Tree m

[15:11:19] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:11:19] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:11:19] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:11:19] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:11:19] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:11:19] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:11:19] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:11:19] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:11:19] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:11:19] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.








[15:11:19] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:11:19] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.

[15:1

[15:11:21] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:11:21] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:11:21] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.

[15:11:21] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:11:21] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.

[15:11:21] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:11:21] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.

[15:11:21] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:11:21] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:11:21] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:11:21] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:11:21] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:11:21

[15:11:22] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:11:22] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:11:22] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:11:22] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:11:22] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.

[15:11:22] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:11:22] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:11:22] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:11:22] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.

[15:11:22] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:11:22] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:11:22] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.


[15:

[15:11:23] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:11:23] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:11:23] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:11:23] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:11:23] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:11:23] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:11:23] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:11:23] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.






[15:11:23] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:11:23] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.

[15:11:23] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:11:23] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.

[15:1

[15:11:24] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:11:24] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:11:24] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:11:24] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.

[15:11:24] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:11:24] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:11:24] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:11:24] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:11:24] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.

[15:11:24] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:11:24] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:11:24] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:11:

[15:11:25] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:11:26] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:11:26] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:11:26] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:11:26] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.

[15:11:26] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:11:26] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:11:26] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.

[15:11:26] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:11:26] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:11:26] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:11:26] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:11

[15:11:27] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:11:27] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:11:27] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:11:27] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:11:27] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:11:27] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.




[15:11:27] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:11:27] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.

[15:11:27] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:11:27] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.

[15:11:27] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:11:27] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.

[15:1

[15:11:28] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:11:28] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:11:28] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:11:28] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.

[15:11:28] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:11:28] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.

[15:11:28] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:11:28] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.

[15:11:28] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.

[15:11:28] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:11:28] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.

[15:11:28] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:1

[15:11:29] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:11:29] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:11:29] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:11:29] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.

[15:11:29] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:11:29] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.

[15:11:29] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:11:29] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.


[15:11:29] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:11:29] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:11:29] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:11:29] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.

[15:1

[15:11:30] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:11:30] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:11:30] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.

[15:11:30] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:11:30] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:11:30] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.

[15:11:30] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:11:30] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.


[15:11:30] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:11:30] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:11:30] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:11:30] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:11

[15:11:32] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:11:32] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:11:32] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:11:32] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:11:32] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:11:32] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:11:32] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:11:32] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:11:32] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.

[15:11:32] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:11:32] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.

[15:11:32] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:1

[15:11:33] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:11:33] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:11:33] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:11:33] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:11:33] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:11:33] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.




[15:11:33] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:11:33] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.

[15:11:33] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:11:33] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:11:33] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:11:33] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.



[15:1

[15:11:34] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:11:34] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:11:34] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:11:34] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:11:34] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:11:34] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:11:34] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.

[15:11:34] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:11:34] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.

[15:11:34] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:11:34] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.

[15:11:34] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:

[15:11:43] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:11:43] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:11:43] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:11:43] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:11:43] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.




[15:11:43] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:11:43] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:11:43] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:11:43] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.

[15:11:43] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:11:43] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:11:43] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.


[15:

[15:11:44] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:11:44] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:11:44] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:11:44] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:11:44] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.


[15:11:44] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:11:44] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:11:44] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.

[15:11:44] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:11:44] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:11:44] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:11:44] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.

[15:

[15:11:45] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:11:45] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.

[15:11:45] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:11:45] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.

[15:11:45] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:11:45] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:11:45] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:11:45] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.

[15:11:45] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:11:45] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:11:45] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:11:45] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.


[15:

[15:11:46] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:11:46] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.

[15:11:46] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:11:46] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.

[15:11:46] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:11:46] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.

[15:11:46] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:11:46] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:11:46] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:11:46] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.



[15:11:46] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:11:46] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:11

[15:11:48] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:11:48] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:11:48] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:11:48] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:11:48] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.



[15:11:48] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:11:48] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:11:48] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:11:48] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.

[15:11:48] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:11:48] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.

[15:11:48] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:1

[15:11:49] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:11:49] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.

[15:11:49] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:11:49] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:11:49] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.

[15:11:49] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:11:49] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.

[15:11:49] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:11:49] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.

[15:11:49] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:11:49] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:11:49] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:1

[15:11:50] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:11:50] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:11:50] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:11:50] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:11:50] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:11:50] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:11:50] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:11:50] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:11:50] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:11:50] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:11:50] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:11:50] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.









[15:11

[15:11:52] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:11:52] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:11:52] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:11:52] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.



[15:11:52] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:11:52] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:11:52] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:11:52] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.


[15:11:52] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:11:52] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.

[15:11:52] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:11:52] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:11

[15:11:53] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:11:53] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:11:53] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:11:53] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:11:53] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:11:53] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.

[15:11:53] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:11:53] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.

[15:11:53] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:11:53] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.

[15:11:53] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:11:53] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.

[15:

[15:11:54] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:11:54] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:11:54] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:11:54] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:11:54] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.

[15:11:54] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:11:54] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.


[15:11:54] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:11:54] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:11:54] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:11:54] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:11:54] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.


[15:1

[15:11:56] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:11:56] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:11:56] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:11:56] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:11:56] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.

[15:11:56] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:11:56] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:11:56] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:11:56] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.

[15:11:56] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:11:56] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:11:56] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:1

[15:11:57] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:11:57] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:11:57] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:11:57] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:11:57] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.




[15:11:57] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:11:57] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:11:57] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.

[15:11:57] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:11:57] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.

[15:11:57] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:11:57] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:1

[15:11:59] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:11:59] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:11:59] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:11:59] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:11:59] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:11:59] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:11:59] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:11:59] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:11:59] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:11:59] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:11:59] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:11:59] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.










[15:1

[15:12:06] INFO: ../rabit/src/allreduce_base.cc:279: task 3 connected to the tracker
[15:12:06] INFO: ../rabit/src/allreduce_base.cc:279: task 13 connected to the tracker
[15:12:06] INFO: ../rabit/src/allreduce_base.cc:279: task 6 connected to the tracker
[15:12:06] INFO: ../rabit/src/allreduce_base.cc:279: task 9 connected to the tracker
[15:12:06] INFO: ../rabit/src/allreduce_base.cc:279: task 14 connected to the tracker
[15:12:06] INFO: ../rabit/src/allreduce_base.cc:279: task 7 connected to the tracker
[15:12:06] INFO: ../rabit/src/allreduce_base.cc:279: task 10 connected to the tracker
[15:12:06] INFO: ../rabit/src/allreduce_base.cc:279: task 4 connected to the tracker
[15:12:06] INFO: ../rabit/src/allreduce_base.cc:279: task 0 connected to the tracker
[15:12:06] INFO: ../rabit/src/allreduce_base.cc:279: task 2 connected to the tracker
[15:12:06] INFO: ../rabit/src/allreduce_base.cc:279: task 5 connected to the tracker
[15:12:06] INFO: ../rabit/src/allreduce_base.cc:279: task 1 co

[15:12:09] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:12:09] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:12:09] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:12:09] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:12:09] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.

[15:12:09] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:12:09] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.

[15:12:09] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:12:09] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.

[15:12:09] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:12:09] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:12:09] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.


[15:

[15:12:10] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:12:10] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.

[15:12:10] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:12:10] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:12:10] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.

[15:12:10] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.

[15:12:10] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:12:10] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:12:10] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.

[15:12:10] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.

[15:12:10] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:12:10] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:12

[15:12:11] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:12:11] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.

[15:12:11] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:12:11] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:12:11] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:12:11] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:12:11] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:12:11] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:12:11] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:12:11] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.


[15:12:11] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:12:11] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:

[15:12:12] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:12:12] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.

[15:12:12] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:12:12] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:12:12] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:12:13] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:12:13] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:12:13] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:12:13] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:12:13] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:12:13] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:12:13] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.


[15:

[15:12:14] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:12:14] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:12:14] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.

[15:12:14] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:12:14] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.


[15:12:14] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:12:14] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:12:14] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:12:14] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:12:14] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:12:14] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:12:14] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.



[15:12:

[15:12:15] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:12:15] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:12:15] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:12:15] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.

[15:12:15] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:12:15] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:12:15] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:12:15] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:12:15] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.



[15:12:15] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.

[15:12:15] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:12:15] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:12

[15:12:16] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:12:16] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:12:16] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:12:16] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:12:17] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:12:17] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:12:17] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.


[15:12:17] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:12:17] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:12:17] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:12:17] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.


[15:12:17] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:

[15:12:18] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:12:18] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:12:18] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:12:18] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:12:18] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:12:18] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:12:18] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:12:18] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:12:18] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:12:18] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:12:18] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:12:18] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:12:18] INFO

[15:12:19] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:12:19] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:12:19] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:12:19] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.

[15:12:19] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:12:19] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:12:19] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.

[15:12:19] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:12:19] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:12:19] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:12:19] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.



[15:12:19] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:1

[15:12:20] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:12:20] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:12:20] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:12:20] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.

[15:12:20] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:12:20] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:12:20] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:12:20] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:12:20] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:12:20] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:12:20] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:12:20] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:12:2

[15:12:22] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:12:22] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:12:22] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:12:22] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:12:22] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.

[15:12:22] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.

[15:12:22] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:12:22] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:12:22] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:12:22] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:12:22] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:12:22] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:1

[15:12:23] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:12:23] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.

[15:12:23] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:12:23] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:12:23] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.

[15:12:23] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:12:23] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:12:23] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:12:23] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.

[15:12:23] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:12:23] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.

[15:12:23] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:1

[15:12:24] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:12:24] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.

[15:12:24] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:12:24] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:12:24] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.

[15:12:24] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:12:24] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.

[15:12:24] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:12:24] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:12:24] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:12:24] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.


[15:12:24] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:1

[15:12:26] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:12:26] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.

[15:12:26] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:12:26] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:12:26] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.

[15:12:26] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:12:26] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:12:26] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:12:26] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:12:26] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:12:26] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.

[15:12:26] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:

[15:12:34] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:12:34] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:12:34] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.

[15:12:34] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:12:34] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.


[15:12:34] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:12:34] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:12:34] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.


[15:12:34] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:12:34] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:12:34] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:12:34] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.


[15:

[15:12:36] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:12:36] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:12:36] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:12:36] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:12:36] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.


[15:12:36] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:12:36] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.

[15:12:36] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:12:36] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:12:36] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:12:36] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:12:36] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:12

[15:12:37] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:12:37] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.

[15:12:37] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:12:37] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:12:37] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:12:37] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:12:37] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:12:37] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:12:37] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.


[15:12:37] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:12:37] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.


[15:12:37] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:1

[15:12:38] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:12:38] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:12:38] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.


[15:12:38] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:12:38] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:12:38] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:12:38] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.



[15:12:38] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:12:38] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.

[15:12:38] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:12:38] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:12:38] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.


[15:

[15:12:40] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:12:40] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:12:40] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:12:40] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.


[15:12:40] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:12:40] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.

[15:12:40] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:12:40] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.

[15:12:40] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:12:40] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.

[15:12:40] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:12:40] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.

[15:1

[15:12:41] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:12:41] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:12:41] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:12:41] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:12:41] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:12:41] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.


[15:12:41] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:12:41] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:12:41] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:12:41] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.



[15:12:41] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:12:41] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.

[15:

[15:12:42] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:12:42] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:12:42] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:12:42] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:12:42] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.


[15:12:42] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:12:42] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:12:42] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:12:42] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:12:42] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.

[15:12:42] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.


[15:12:42] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:12

[15:12:43] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:12:43] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:12:43] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.

[15:12:43] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:12:43] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.

[15:12:43] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:12:43] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.

[15:12:43] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:12:43] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:12:43] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.

[15:12:43] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:12:43] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.

[15:

[15:12:45] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:12:45] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.

[15:12:45] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:12:45] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.

[15:12:45] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:12:45] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:12:45] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:12:45] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:12:45] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:12:45] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:12:45] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:12:45] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.

[15:

[15:12:46] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:12:46] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:12:46] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.


[15:12:46] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:12:46] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.

[15:12:46] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:12:46] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:12:46] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.

[15:12:46] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:12:46] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:12:46] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:12:46] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:1

[15:12:47] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:12:47] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:12:47] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:12:47] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:12:47] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:12:47] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:12:47] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:12:47] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:12:47] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.

[15:12:47] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:12:47] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:12:47] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:

[15:12:49] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:12:49] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:12:49] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.

[15:12:49] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:12:49] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:12:49] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.

[15:12:49] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:12:49] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:12:49] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:12:49] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.


[15:12:49] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:12:49] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.

[15:

[15:12:50] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:12:50] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.

[15:12:50] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:12:50] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.

[15:12:50] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:12:50] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:12:50] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:12:50] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:12:50] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:12:50] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.



[15:12:50] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:12:50] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.

[15:1

[15:12:51] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:12:51] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:12:51] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:12:51] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.

[15:12:51] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:12:51] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.

[15:12:51] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:12:51] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:12:51] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:12:51] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:12:51] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:12:51] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:1

[15:12:52] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:12:53] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:12:53] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:12:53] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.

[15:12:53] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[15:12:53] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:12:53] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.

[15:12:53] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:12:53] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.

[15:12:53] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:12:53] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[15:12:53] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.


[15:

Average AUC across all folds: 0.7339511832794069


                                                                                

Best Params: {'max_depth': 6, 'eta': 0.01, 'gamma': 0, 'subsample': 0.6, 'colsample_bytree': 0.7, 'min_child_weight': 1, 'reg_lambda': 0, 'reg_alpha': 0}
Best AUC: 0.7327572203895388


Best Params: {'max_depth': 6, 'eta': 0.01, 'gamma': 0, 'subsample': 0.6, 'colsample_bytree': 0.7, 'min_child_weight': 1, 'reg_lambda': 0, 'reg_alpha': 0}
Best AUC: 0.7327572203895388

Best Params: {'max_depth': 6, 'eta': 0.01, 'gamma': 0, 'subsample': 0.6, 'colsample_bytree': 0.7, 'min_child_weight': 1, 'reg_lambda': 0, 'reg_alpha': 0}
Best AUC: 0.7327584604076798

Average AUC across all folds: 0.7327587069455473

Average AUC across all folds: 0.7339438971137404

Average AUC across all folds: 0.7339511832794069

In [None]:
# xgb_model = xgb.fit(train_df)

In [92]:
predictions = xgb_model.transform(validation_df)

NameError: name 'xgb_model' is not defined

In [None]:
predictions.printSchema()
predictions.show(10, truncate=False)

In [None]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

evaluator = BinaryClassificationEvaluator(labelCol="label", rawPredictionCol="rawPrediction", metricName="areaUnderROC")
auc = evaluator.evaluate(predictions)
print(f"AUC: {auc}")

In [None]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print(f"Accuracy: {accuracy}")

In [None]:
from pyspark.sql.functions import col, mean

# Calculate the proportion of horses finishing in top-4
positive_proportion = validation_df.select(mean(col("label")).alias("pos_rate")).collect()[0]["pos_rate"]
print("Proportion of top-4 finishers in test set:", positive_proportion)

# Naive baseline metrics:
# If we always predict 'not top-4' (label=0), the accuracy = (1 - positive_proportion).
# If we always predict 'top-4' (label=1), the accuracy = positive_proportion.
# Choose the majority class baseline:
majority_class_accuracy = max(positive_proportion, 1 - positive_proportion)
print("Naive majority class baseline accuracy:", majority_class_accuracy)

In [None]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, BinaryClassificationEvaluator

predictions = xgb_model.transform(validation_df)

# AUC (Binary)
binary_evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction", labelCol="label", metricName="areaUnderROC")
auc = binary_evaluator.evaluate(predictions)
print("AUC:", auc)

# Accuracy
accuracy_evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = accuracy_evaluator.evaluate(predictions)
print("Accuracy:", accuracy)

# Precision
precision_evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="precisionByLabel")
precision = precision_evaluator.evaluate(predictions)
print("Precision (Top-4 class):", precision)

# Recall
recall_evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="recallByLabel")
recall = recall_evaluator.evaluate(predictions)
print("Recall (Top-4 class):", recall)

# F1 Score
f1_evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="f1")
f1 = f1_evaluator.evaluate(predictions)
print("F1 Score:", f1)

## Feature Importance with get_booster().get_score()

In [None]:
feature_metadata = df_assembled.schema["raw_features"].metadata
attrs = feature_metadata["ml_attr"]["attrs"]

def get_feature_names(attrs):
    names = []
    for attr_type in ["numeric", "binary", "nominal"]:
        if attr_type in attrs:
            for a in attrs[attr_type]:
                names.append((a["idx"], a["name"]))
    return [name for idx, name in sorted(names, key=lambda x: x[0])]

feature_names = get_feature_names(attrs)

In [None]:
booster = xgb_model.get_booster()
feature_importances = booster.get_score(importance_type='gain')

mapped_importances = []
for f, importance in feature_importances.items():
    index = int(f[1:])  # remove the 'f'
    if index < len(feature_names):
        feature_name = feature_names[index]
    else:
        feature_name = f"Unknown_{index}"
    mapped_importances.append((feature_name, importance))

mapped_importances.sort(key=lambda x: x[1], reverse=True)

print("Mapped Feature Importances:")
for name, imp in mapped_importances:
    print(name, imp)

## Error Analysis: Incorrect Predictions


In [None]:
from pyspark.sql.functions import col

# Predicted top-4 (prediction=1), actually not top-4 (label=0)
false_positives = predictions.filter((col("prediction") == 1) & (col("label") == 0))
false_positives.show(10)

# Predicted not top-4 (prediction=0), actually top-4 (label=1)
false_negatives = predictions.filter((col("prediction") == 0) & (col("label") == 1))
false_negatives.show(10)

## Inspect these subsets to see if there’s a pattern. For example, check if they occur at certain tracks, or with certain distances:

In [None]:
false_positives.groupBy("course_cd_ohe").count().orderBy(col("count").desc()).show(10,False)
false_negatives.groupBy("course_cd_ohe").count().orderBy(col("count").desc()).show(10,False)