In [103]:
# Setup Environment

import os
import logging
import xgboost
from pyspark.sql.functions import (col, count, row_number, abs, unix_timestamp, mean, 
                                   when, lit, min as spark_min, max as spark_max , 
                                   row_number, mean, countDistinct, last, first)
import configparser
from pyspark.sql import SparkSession
from src.data_preprocessing.data_prep1.sql_queries import sql_queries
from src.data_preprocessing.data_prep1.data_loader import load_data_from_postgresql, reload_parquet_files
from pyspark.sql.window import Window
from pyspark.sql import DataFrame, Window
from src.data_preprocessing.data_prep1.data_utils import (save_parquet, gather_statistics, 
                initialize_environment, load_config, initialize_logging, initialize_spark, 
                drop_duplicates_with_tolerance, identify_and_impute_outliers, 
                identify_and_remove_outliers, identify_missing_and_outliers)
# Set global references to None
spark = None
master_results_df = None

In [104]:
spark, jdbc_url, jdbc_properties, queries, parquet_dir, log_file = initialize_environment()

2024-12-14 00:52:33,955 - INFO - Environment setup initialized.
2024-12-14 00:52:33,958 - INFO - Spark session created successfully.


In [105]:
results = spark.read.parquet(os.path.join(parquet_dir, "results.parquet"))

In [106]:
results.printSchema()

root
 |-- horse_id: integer (nullable = true)
 |-- course_cd: string (nullable = true)
 |-- race_date: date (nullable = true)
 |-- race_number: integer (nullable = true)
 |-- saddle_cloth_number: string (nullable = true)
 |-- official_fin: integer (nullable = true)
 |-- split_num: integer (nullable = true)
 |-- earnings: integer (nullable = true)
 |-- purse: integer (nullable = true)
 |-- wps_pool: decimal(10,2) (nullable = true)
 |-- dollar_odds: double (nullable = true)
 |-- weight: integer (nullable = true)
 |-- date_of_birth: date (nullable = true)
 |-- sex: string (nullable = true)
 |-- horse_name: string (nullable = true)
 |-- start_position: integer (nullable = true)
 |-- equip: string (nullable = true)
 |-- claim_price: double (nullable = true)
 |-- surface: string (nullable = true)
 |-- surface_type_description: string (nullable = true)
 |-- trk_cond: string (nullable = true)
 |-- trk_cond_desc: string (nullable = true)
 |-- weather: string (nullable = true)
 |-- distance: dec

In [107]:
results = results.drop("split_num")

# Data Analysis

Before beginning here, go to /src/data_preprocessing/data_prep2/main_data_prep2.py and run data_check and cardinality reports. 

### total_strides: Impute and Interpolation

In [108]:
from pyspark.sql.functions import col, when, avg

# List of columns with missing values
columns_with_missing = ["avg_stride_length", "total_race_time", "total_strides"]

# Step 1: Compute group-wise averages
group_means = results.groupBy("official_fin").agg(
    avg("avg_stride_length").alias("avg_stride_length_mean"),
    avg("total_race_time").alias("total_race_time_mean"),
    avg("total_strides").alias("total_strides_mean")
)

# Step 2: Join the group means back to the original DataFrame
results_with_means = results.join(group_means, on="official_fin", how="left")

# Step 3: Replace missing values with group means
for column in columns_with_missing:
    mean_column = f"{column}_mean"  # Name of the group mean column
    results_with_means = results_with_means.withColumn(
        column,
        when(col(column).isNull(), col(mean_column)).otherwise(col(column))
    )

# Drop the temporary mean columns if needed
results = results_with_means.drop("avg_stride_length_mean", "total_race_time_mean", "total_strides_mean")


In [109]:
# total_strides
# Show the updated DataFrame
results.filter(col("total_strides").isNull()).count()

0

In [110]:
# total_race_time
results.filter(col("total_race_time").isNull()).count()

0

In [111]:
# avg_stride_length
results.filter(col("avg_stride_length").isNull()).count()

0

### distance: Impute 

This one will need some sort of comparison to see what the distance is. In other words, finishing time, or something like that. 

#### Step-by-Step Solution:

	1.	Calculate the Average total_race_time for Each Distance:
	•	Group by distance and compute the mean of total_race_time.
	2.	Join the Averages Back to the Data:
	•	Use the grouped averages as a reference to align each row with its average total_race_time.
	3.	Estimate Missing distance Values:
	•	For rows where distance is missing, identify the closest match in total_race_time within the ±5-second tolerance.
	4.	Assign the Imputed distance:
	•	Populate the missing distance values based on the above matching.


In [112]:
from pyspark.sql.functions import col, abs, avg, when, row_number
from pyspark.sql.window import Window

# Step 1: Calculate the average total_race_time for each distance
distance_time_avg = results.groupBy("distance").agg(
    avg("total_race_time").alias("avg_race_time")
)

# Step 2: Rename `distance` in the `potential_matches` DataFrame to avoid ambiguity
potential_matches = distance_time_avg.withColumnRenamed("distance", "imputed_distance")

# Step 3: Identify rows with missing distance
missing_distance_rows = results.filter(col("distance").isNull())
non_missing_rows = results.filter(col("distance").isNotNull())

# Step 4: Perform a cross join between rows with missing distance and potential matches
crossed = missing_distance_rows.crossJoin(potential_matches)

# Step 5: Add a column for the absolute difference between total_race_time and avg_race_time
crossed = crossed.withColumn(
    "time_diff", abs(col("total_race_time") - col("avg_race_time"))
)

# Step 6: Filter to keep only rows within the ±5-second tolerance
crossed_filtered = crossed.filter(col("time_diff") <= 5)

# Step 7: Rank matches by time_diff and keep the closest match
# Create a window to rank matches by time_diff within each composite key
window = Window.partitionBy(
    "course_cd", "race_date", "race_number", "saddle_cloth_number"
).orderBy(col("time_diff"))

crossed_ranked = crossed_filtered.withColumn("rank", row_number().over(window))

# Keep only the closest match for each missing row
best_matches = crossed_ranked.filter(col("rank") == 1).select(
    "course_cd", "race_date", "race_number", "saddle_cloth_number", col("imputed_distance").alias("imputed_distance")
)

# Step 8: Update the original DataFrame with the imputed distances
results = results.join(
    best_matches,
    on=["course_cd", "race_date", "race_number", "saddle_cloth_number"],
    how="left"
).withColumn(
    "distance",
    when(col("distance").isNull(), col("imputed_distance")).otherwise(col("distance"))
).drop("imputed_distance", "avg_race_time")


In [113]:
# distance
results.filter(col("distance").isNull()).count()

0

### derived_favorite: Imputing with the mean

In [114]:
from pyspark.sql.functions import col, mean, when

# Calculate the mean of the 'wps_pool' column, excluding nulls
mean_value = results.select(mean(col("derived_favorite")).alias("mean_derived_favorite")).collect()[0]["mean_derived_favorite"]

# Replace null values in 'wps_pool' with the calculated mean
results = results.withColumn(
    "derived_favorite",
    when(col("derived_favorite").isNull(), mean_value).otherwise(col("derived_favorite"))
)

# Show the updated DataFrame
results.filter(col("derived_favorite").isNull()).count()

0

### date_of_birth: Impute with Global median

In [116]:
from pyspark.sql.functions import col, lit, count, expr
from pyspark.sql.window import Window

# Convert date_of_birth to a numeric timestamp for median calculation
results = results.withColumn("date_of_birth_ts", col("date_of_birth").cast("timestamp").cast("long"))

# Calculate the median of date_of_birth
median_window = Window.orderBy("date_of_birth_ts")
row_count = results.filter(col("date_of_birth_ts").isNotNull()).count()

if row_count % 2 == 0:  # Even number of rows
    median_row_1 = row_count // 2
    median_row_2 = median_row_1 + 1
    median_ts = results.filter(col("date_of_birth_ts").isNotNull()) \
        .select("date_of_birth_ts") \
        .withColumn("row_num", expr("row_number() over (ORDER BY date_of_birth_ts)")) \
        .filter((col("row_num") == median_row_1) | (col("row_num") == median_row_2)) \
        .groupBy().agg(expr("avg(date_of_birth_ts)").alias("median_ts")) \
        .collect()[0]["median_ts"]
else:  # Odd number of rows
    median_row = (row_count + 1) // 2
    median_ts = results.filter(col("date_of_birth_ts").isNotNull()) \
        .select("date_of_birth_ts") \
        .withColumn("row_num", expr("row_number() over (ORDER BY date_of_birth_ts)")) \
        .filter(col("row_num") == median_row) \
        .collect()[0]["date_of_birth_ts"]

# Convert median timestamp back to date
median_date = lit(expr(f"CAST(FROM_UNIXTIME({median_ts}) AS DATE)"))

# Fill missing values with the global median date
results = results.withColumn(
    "date_of_birth",
    when(col("date_of_birth").isNull(), median_date).otherwise(col("date_of_birth"))
).drop("date_of_birth_ts")

                                                                                

In [117]:
# Show the updated DataFrame
results.filter(col("date_of_birth").isNull()).count()

0

## Convert DOB to AGE_AT_RACE_DAY

In [118]:
from pyspark.sql.functions import col, datediff, expr

# Ensure both date_of_birth and race_date are in date format
results = results.withColumn("date_of_birth", col("date_of_birth").cast("date"))
results = results.withColumn("race_date", col("race_date").cast("date"))

# Calculate age in days, then convert to years
results = results.withColumn(
    "age_at_race_day",
    datediff(col("race_date"), col("date_of_birth")) / 365.25  # Convert days to years
)

# Show the updated DataFrame
results.select("date_of_birth", "race_date", "age_at_race_day").show(5)

+-------------+----------+-----------------+
|date_of_birth| race_date|  age_at_race_day|
+-------------+----------+-----------------+
|   2019-04-13|2023-05-18|4.095824777549623|
|   2021-04-13|2024-07-19|3.266255989048597|
|   2021-04-13|2024-07-19|3.266255989048597|
|   2019-05-12|2024-07-25|5.204654346338125|
|   2019-05-12|2024-07-25|5.204654346338125|
+-------------+----------+-----------------+
only showing top 5 rows



### Encoding Weather

In [119]:
# Remember to pay attention to case
results = results.fillna({"weather": "Clear"})
results.filter(col("weather").isNull()).count()

0

In [120]:
results.select("weather").distinct().count()
# Count the occurrences of each distinct value in the "weather" column
distinct_value_counts = results.groupBy("weather").count()

# Show the result
distinct_value_counts.show()

+-------+-------+
|weather|  count|
+-------+-------+
|  Foggy|  12709|
| Cloudy|1847024|
|Showery| 174941|
|  Clear|2676015|
|  Rainy| 101603|
|Snowing|  11041|
|   Hazy|  35118|
+-------+-------+





### wps_pool: Imputing with mean

In [121]:
from pyspark.sql.functions import col, mean, when

# Calculate the mean of the 'wps_pool' column, excluding nulls
mean_value = results.select(mean(col("wps_pool")).alias("mean_wps_pool")).collect()[0]["mean_wps_pool"]

# Replace null values in 'wps_pool' with the calculated mean
results = results.withColumn(
    "wps_pool",
    when(col("wps_pool").isNull(), mean_value).otherwise(col("wps_pool"))
)

# Show the updated DataFrame
results.filter(col("wps_pool").isNull()).count()

0

### Imputing Total Race Time

In [122]:
results.filter(col("total_race_time").isNull()).count()

0

### equip: Conversion and Imputation

In [123]:
results = results.fillna({"equip": "No_Equip"})
results.filter(col("equip").isNull()).count()

0

### earnings: Imputation

In [124]:
results = results.fillna({"earnings": 0})
results.filter(col("earnings").isNull()).count()

0

### trk_cond: Impute

In [125]:
cols = ["trk_cond", "trk_cond_desc"] 
results.select(cols).distinct().count()

# Count the occurrences of each distinct value in the "weather" column
distinct_value_counts = results.groupBy(cols).count()

# Show the result
distinct_value_counts.show()




+--------+-------------+-------+
|trk_cond|trk_cond_desc|  count|
+--------+-------------+-------+
|      SY|       sloppy| 163082|
|    null|         null|    274|
|      FM|         firm|1194446|
|      SF|         soft|   2202|
|      YL|     yielding|   8988|
|      GD|         good| 285172|
|      FT|         fast|3071977|
|      FZ|       frozen|     48|
|      MY|        muddy| 122254|
|      WF|     wet fast|  10008|
+--------+-------------+-------+



                                                                                

In [126]:
cols = ["trk_cond", "trk_cond_desc"]

# Fill missing values with "MISSING" for the specified columns
results = results.fillna({col: "MISSING" for col in cols})

# Verify no null values remain
missing_count = results.filter(
    (col("trk_cond").isNull()) | (col("trk_cond_desc").isNull())
).count()

print(f"Number of missing values: {missing_count}")

Number of missing values: 0


In [127]:
results = results.drop("trk_cond_desc")

### Encoding Sex

Horse Sex 	Code	Description
	C	Colt
	F	Filly
	G	Gelding
	H	Horse
	M	Mare
	R	Ridgling
	B	Spayed Mare

In [128]:
# "equip", "surface", "surface_type_description", "trk_cond", "trk_cond_desc", "weather", "dist_unit", "race_type"]

cols = ["sex"] 

distinct_values = results.select(*cols).distinct()
distinct_values.show(50)


+---+
|sex|
+---+
|  F|
|  B|
|  M|
|  C|
|  R|
|  G|
|  H|
+---+



In [129]:
results.filter(col("sex").isNull()).count()

0

In [130]:
results = results.drop("surface_type_description")
results = results.drop("trk_cond_desc")

In [131]:
results = results.drop("horse_name")

In [132]:
# results = results.drop("saddle_cloth_number")

In [133]:
cols = ["race_type"] 

distinct_values = results.select(*cols).distinct()
distinct_values.show(50)


+--------------------+
|           race_type|
+--------------------+
|     Maiden Claiming|
|               Final|
|Waiver Maiden Cla...|
|Allowance Optiona...|
|            Claiming|
|      Starter Stakes|
|    Starter Handicap|
|   Optional Claiming|
|Starter Optional ...|
|   Starter Allowance|
|           Allowance|
|              Stakes|
|            Handicap|
|Maiden Special We...|
|Maiden Optional C...|
|     Waiver Claiming|
|     Claiming Stakes|
+--------------------+



In [134]:
save_parquet(spark, results, "results_pan", parquet_dir)

2024-12-14 00:54:23,495 - INFO - Saving results_pan DataFrame to Parquet at /home/exx/myCode/horse-racing/FoxRiverAIRacing/data/parquet/results_pan.parquet...
2024-12-14 00:54:23,496 - INFO - Schema of results_pan DataFrame:
2024-12-14 00:54:30,568 - INFO - results_pan DataFrame saved successfully.      


# OHE and Prep for XGBoost Trainining in Spark

In [135]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler, StandardScaler
from pyspark.ml import Pipeline
import datetime


In [136]:
# Convert official_fin to binary label for top-4 finish
results = results.withColumn("label", when(col("official_fin") <= 4, 1).otherwise(0))

In [137]:
# Categorical columns equip, surface, trk_cond, weather, dist_unit, race_type 
categorical_cols = ["equip", "surface", "trk_cond", "weather", "dist_unit", "race_type", "sex"]
indexers = [StringIndexer(inputCol=c, outputCol=c+"_index", handleInvalid="keep") for c in categorical_cols]
encoders = [OneHotEncoder(inputCols=[c+"_index"], outputCols=[c+"_ohe"]) for c in categorical_cols]


In [138]:
# Numeric columns
numeric_cols = ["age_at_race_day", "race_number", "earnings", "purse", "dollar_odds", "weight", "start_position", "claim_price", "speed_rating", "class_rating", "total_race_time", "total_strides", "avg_stride_length"]


### Spark Pipeline

In [139]:
# Create a pipeline to transform data
preprocessing_stages = indexers + encoders
pipeline = Pipeline(stages=preprocessing_stages)
model = pipeline.fit(results)
df_transformed = model.transform(results)

                                                                                

In [140]:
ohe_cols = [c+"_ohe" for c in categorical_cols]

In [141]:
assembler = VectorAssembler(inputCols=numeric_cols + ohe_cols, outputCol="raw_features")
df_assembled = assembler.transform(df_transformed)


In [142]:
scaler = StandardScaler(inputCol="raw_features", outputCol="features", withMean=True, withStd=True)
scaler_model = scaler.fit(df_assembled)
df_final = scaler_model.transform(df_assembled)

                                                                                

In [143]:
df_final.printSchema()

root
 |-- course_cd: string (nullable = true)
 |-- race_date: date (nullable = true)
 |-- race_number: integer (nullable = true)
 |-- saddle_cloth_number: string (nullable = true)
 |-- official_fin: integer (nullable = true)
 |-- horse_id: integer (nullable = true)
 |-- earnings: integer (nullable = false)
 |-- purse: integer (nullable = true)
 |-- wps_pool: decimal(14,6) (nullable = true)
 |-- dollar_odds: double (nullable = true)
 |-- weight: integer (nullable = true)
 |-- date_of_birth: date (nullable = true)
 |-- sex: string (nullable = true)
 |-- start_position: integer (nullable = true)
 |-- equip: string (nullable = false)
 |-- claim_price: double (nullable = true)
 |-- surface: string (nullable = true)
 |-- trk_cond: string (nullable = false)
 |-- weather: string (nullable = false)
 |-- distance: decimal(10,2) (nullable = true)
 |-- dist_unit: string (nullable = true)
 |-- derived_favorite: decimal(14,6) (nullable = true)
 |-- race_type: string (nullable = true)
 |-- class_rati

In [144]:
drop_cols = ["equip", "surface", "trk_cond", "weather", "dist_unit", "race_type", "sex", "date_of_birth"]
df_final = df_final.drop(*drop_cols)

In [145]:
# Label replaces official_fin
df_final = df_final.drop("official_fin")

In [146]:
# Is this telling the model who worn? Remove! total_race_time
df_final = df_final.drop("total_race_time")

In [147]:
# These are included in the features column

numeric_cols = ["age_at_race_day", "race_number", "earnings", "purse", "dollar_odds", "weight", "start_position", "claim_price", "speed_rating", "class_rating", "total_strides", "avg_stride_length"]

df_final = df_final.drop(*numeric_cols)

In [148]:
# Drop indexes:
drop_index = ["equip_index", "surface_index", "trk_cond_index", "weather_index", "dist_unit_index", "race_type_index", "sex_index"]
df_final = df_final.drop(*drop_index)

In [149]:

# Drop unnecessary columns
columns_to_drop = ["raw_features"]
df_final = df_final.drop(*columns_to_drop)

# Drop original categorical columns and index columns if they exist
original_categorical_cols = ["equip", "surface", "trk_cond", "weather", "dist_unit", "race_type", "sex"]
index_columns = [f"{col}_index" for col in original_categorical_cols]

df_final = df_final.drop(*original_categorical_cols, *index_columns)

# Drop 'official_fin' if still present and replaced by 'label'
# Assuming 'official_fin' has already been converted to 'label', if not, drop it here
# df = df.drop("official_fin")

# Select relevant columns to keep
final_columns = ["course_cd", "race_date", "saddle_cloth_number",
    "horse_id", "wps_pool", "distance", "derived_favorite",
    "label",
    "equip_ohe", "surface_ohe", "trk_cond_ohe",
    "weather_ohe", "dist_unit_ohe", "race_type_ohe",
    "sex_ohe", "features"
]

df_final = df_final.select(*final_columns)

# Optional: Verify the schema
df_final.printSchema()

root
 |-- course_cd: string (nullable = true)
 |-- race_date: date (nullable = true)
 |-- saddle_cloth_number: string (nullable = true)
 |-- horse_id: integer (nullable = true)
 |-- wps_pool: decimal(14,6) (nullable = true)
 |-- distance: decimal(10,2) (nullable = true)
 |-- derived_favorite: decimal(14,6) (nullable = true)
 |-- label: integer (nullable = false)
 |-- equip_ohe: vector (nullable = true)
 |-- surface_ohe: vector (nullable = true)
 |-- trk_cond_ohe: vector (nullable = true)
 |-- weather_ohe: vector (nullable = true)
 |-- dist_unit_ohe: vector (nullable = true)
 |-- race_type_ohe: vector (nullable = true)
 |-- sex_ohe: vector (nullable = true)
 |-- features: vector (nullable = true)



In [150]:
# 2. Save as Parquet
save_parquet(spark, df_final, "processed_data", parquet_dir)

2024-12-14 00:55:27,807 - INFO - Saving processed_data DataFrame to Parquet at /home/exx/myCode/horse-racing/FoxRiverAIRacing/data/parquet/processed_data.parquet...
2024-12-14 00:55:27,808 - INFO - Schema of processed_data DataFrame:
2024-12-14 00:55:46,127 - INFO - processed_data DataFrame saved successfully.   


In [151]:
processed_data = spark.read.parquet(os.path.join(parquet_dir, "processed_data.parquet"))

In [152]:
processed_data = processed_data.drop("saddle_cloth_number")

In [155]:
processed_data = processed_data.drop("course_cd")

In [156]:
processed_data.printSchema()

root
 |-- race_date: date (nullable = true)
 |-- horse_id: integer (nullable = true)
 |-- wps_pool: decimal(14,6) (nullable = true)
 |-- distance: decimal(10,2) (nullable = true)
 |-- derived_favorite: decimal(14,6) (nullable = true)
 |-- label: integer (nullable = true)
 |-- equip_ohe: vector (nullable = true)
 |-- surface_ohe: vector (nullable = true)
 |-- trk_cond_ohe: vector (nullable = true)
 |-- weather_ohe: vector (nullable = true)
 |-- dist_unit_ohe: vector (nullable = true)
 |-- race_type_ohe: vector (nullable = true)
 |-- sex_ohe: vector (nullable = true)
 |-- features: vector (nullable = true)



In [157]:
processed_data.count()

4858451

In [158]:
processed_data = processed_data.drop("wps_pool", "distance", "derived_favorite")

In [159]:
processed_data.printSchema()

root
 |-- race_date: date (nullable = true)
 |-- horse_id: integer (nullable = true)
 |-- label: integer (nullable = true)
 |-- equip_ohe: vector (nullable = true)
 |-- surface_ohe: vector (nullable = true)
 |-- trk_cond_ohe: vector (nullable = true)
 |-- weather_ohe: vector (nullable = true)
 |-- dist_unit_ohe: vector (nullable = true)
 |-- race_type_ohe: vector (nullable = true)
 |-- sex_ohe: vector (nullable = true)
 |-- features: vector (nullable = true)



In [162]:
# train_df, test_df = processed_data.randomSplit([0.8, 0.2], seed=42)

cutoff_date = "2024-01-01"
train_df = processed_data.filter(col("race_date") < cutoff_date)
test_df = processed_data.filter(col("race_date") >= cutoff_date)

In [163]:
from xgboost.spark import SparkXGBClassifier

xgb = SparkXGBClassifier(
    features_col="features",
    label_col="label",
    num_workers=16,
    prediction_col="prediction",
    probability_col="probability",
    raw_prediction_col="rawPrediction",
    num_boost_round=100,
    verbosity=2,  # or 3 for more details
    eval_metric="auc",
    max_depth=6,
    eta=0.3,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8
)



[01:02:03] task 3 got new rank 0                                  (0 + 16) / 16]
[01:02:03] task 0 got new rank 1
[01:02:03] task 4 got new rank 2
[01:02:03] task 6 got new rank 3
[01:02:03] task 1 got new rank 4
[01:02:04] task 11 got new rank 5
[01:02:04] task 13 got new rank 6
[01:02:04] task 7 got new rank 7
[01:02:04] task 2 got new rank 8
[01:02:04] task 9 got new rank 9
[01:02:04] task 5 got new rank 10
[01:02:04] task 10 got new rank 11
[01:02:04] task 15 got new rank 12
[01:02:04] task 8 got new rank 13
[01:02:04] task 14 got new rank 14
[01:02:04] task 12 got new rank 15
[01:02:05] INFO: ../src/gbm/gbtree.cc:140: Tree method is automatically selected to be 'approx' for distributed training.[01:02:05] INFO: ../src/gbm/gbtree.cc:140: Tree method is automatically selected to be 'approx' for distributed training.

[01:02:05] INFO: ../src/gbm/gbtree.cc:140: Tree method is automatically selected to be 'approx' for distributed training.[01:02:05] INFO: ../src/gbm/gbtree.cc:140: Tree

[01:02:10] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[01:02:10] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.

[01:02:10] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[01:02:10] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[01:02:10] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[01:02:10] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[01:02:10] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[01:02:10] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[01:02:10] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[01:02:10] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[01:02:10] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[01:02:10] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.

[01:

[01:02:14] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[01:02:14] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[01:02:14] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[01:02:14] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[01:02:14] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[01:02:14] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[01:02:14] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[01:02:14] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[01:02:14] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.

[01:02:14] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[01:02:14] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[01:02:14] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[01:0

[01:02:19] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[01:02:19] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[01:02:19] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[01:02:19] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[01:02:19] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[01:02:19] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[01:02:19] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.

[01:02:19] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[01:02:19] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[01:02:19] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[01:02:19] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.


[01:02:19] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[01:

[01:02:24] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[01:02:24] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[01:02:24] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.

[01:02:24] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[01:02:24] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[01:02:24] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[01:02:24] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.

[01:02:24] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[01:02:24] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.

[01:02:24] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[01:02:24] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[01:02:24] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.

[01:

[01:02:28] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[01:02:28] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[01:02:28] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[01:02:28] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[01:02:28] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.

[01:02:28] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[01:02:28] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[01:02:28] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[01:02:28] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[01:02:28] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.

[01:02:28] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[01:02:28] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[01:02

[01:02:33] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[01:02:33] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[01:02:33] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[01:02:33] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[01:02:33] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[01:02:33] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[01:02:33] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[01:02:33] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[01:02:33] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.

[01:02:33] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[01:02:33] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[01:02:33] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[01:02

[01:02:38] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[01:02:38] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[01:02:38] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[01:02:38] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[01:02:38] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[01:02:38] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[01:02:38] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[01:02:38] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[01:02:38] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.

[01:02:38] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[01:02:38] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[01:02:38] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.


[01:

[01:02:42] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[01:02:42] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[01:02:42] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.


[01:02:42] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[01:02:42] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[01:02:42] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[01:02:42] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[01:02:42] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[01:02:42] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.

[01:02:42] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[01:02:42] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.

[01:02:42] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[01:

[01:02:47] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[01:02:47] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[01:02:47] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[01:02:47] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[01:02:47] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[01:02:47] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[01:02:47] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[01:02:47] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[01:02:47] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[01:02:47] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[01:02:47] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[01:02:47] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[01:0

[01:02:52] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[01:02:52] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[01:02:52] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.

[01:02:52] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[01:02:52] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[01:02:52] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.

[01:02:52] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[01:02:52] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.

[01:02:52] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[01:02:52] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[01:02:52] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[01:02:52] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[01:

[01:02:56] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[01:02:56] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.

[01:02:56] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[01:02:56] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.

[01:02:56] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[01:02:56] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[01:02:56] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[01:02:56] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[01:02:56] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.


[01:02:56] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[01:02:56] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[01:02:56] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[01:0

[01:03:01] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[01:03:01] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[01:03:01] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.

[01:03:01] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[01:03:01] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[01:03:01] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.

[01:03:01] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[01:03:01] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.


[01:03:01] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[01:03:01] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[01:03:01] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[01:03:01] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[01:03

[01:03:05] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[01:03:05] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[01:03:05] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[01:03:05] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[01:03:05] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.

[01:03:05] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[01:03:05] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.

[01:03:05] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[01:03:05] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.

[01:03:05] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[01:03:05] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.

[01:03:05] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[01:

[01:03:10] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[01:03:10] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.

[01:03:10] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[01:03:10] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[01:03:10] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.


[01:03:10] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[01:03:10] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[01:03:10] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.

[01:03:10] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[01:03:10] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.

[01:03:10] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[01:03:10] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[01:0

In [87]:
xgb_model = xgb.fit(train_df)

[00:32:06] INFO: ../rabit/src/allreduce_base.cc:279: task 2 connected to the tracker
[00:32:06] INFO: ../rabit/src/allreduce_base.cc:279: task 5 connected to the tracker
[00:32:06] INFO: ../rabit/src/allreduce_base.cc:279: task 10 connected to the tracker
[00:32:06] INFO: ../rabit/src/allreduce_base.cc:279: task 3 connected to the tracker
[00:32:06] INFO: ../rabit/src/allreduce_base.cc:279: task 13 connected to the tracker
[00:32:06] INFO: ../rabit/src/allreduce_base.cc:279: task 1 connected to the tracker
[00:32:06] INFO: ../rabit/src/allreduce_base.cc:279: task 14 connected to the tracker
[00:32:06] INFO: ../rabit/src/allreduce_base.cc:279: task 7 connected to the tracker
[00:32:06] INFO: ../rabit/src/allreduce_base.cc:279: task 9 connected to the tracker
[00:32:06] INFO: ../rabit/src/allreduce_base.cc:279: task 15 connected to the tracker
[00:32:06] INFO: ../rabit/src/allreduce_base.cc:279: task 6 connected to the tracker
[00:32:06] INFO: ../rabit/src/allreduce_base.cc:279: task 0 c

[00:32:12] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[00:32:12] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[00:32:12] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[00:32:12] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[00:32:12] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[00:32:12] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[00:32:12] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.

[00:32:12] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[00:32:12] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[00:32:12] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[00:32:12] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[00:32:12] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.

[00:

[00:32:18] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[00:32:18] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[00:32:18] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[00:32:18] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[00:32:18] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[00:32:18] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[00:32:18] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[00:32:18] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[00:32:18] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[00:32:18] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[00:32:18] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[00:32:18] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[00:

[00:32:23] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[00:32:23] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[00:32:23] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[00:32:23] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[00:32:23] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[00:32:23] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[00:32:23] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[00:32:23] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.

[00:32:23] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[00:32:23] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.

[00:32:23] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[00:32:23] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[00:

[00:32:29] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[00:32:29] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[00:32:29] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.


[00:32:29] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[00:32:29] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[00:32:29] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[00:32:29] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[00:32:29] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[00:32:29] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.

[00:32:29] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[00:32:29] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[00:32:29] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[00:

[00:32:34] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[00:32:34] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[00:32:34] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.


[00:32:34] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[00:32:34] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.

[00:32:34] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[00:32:34] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.

[00:32:34] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[00:32:34] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.

[00:32:34] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[00:32:34] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[00:32:34] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[00:

[00:32:40] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[00:32:40] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[00:32:40] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[00:32:40] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[00:32:40] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[00:32:40] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[00:32:40] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.

[00:32:40] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[00:32:40] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[00:32:40] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.

[00:32:40] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[00:32:40] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.

[00:

[00:32:45] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[00:32:45] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[00:32:45] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[00:32:45] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[00:32:45] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[00:32:45] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[00:32:45] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[00:32:45] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.

[00:32:45] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[00:32:45] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[00:32:45] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[00:32:45] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[00:

[00:32:50] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[00:32:50] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.

[00:32:50] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[00:32:50] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[00:32:50] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[00:32:50] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.

[00:32:50] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[00:32:50] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[00:32:51] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[00:32:51] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[00:32:51] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[00:32:51] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.

[00:

[00:32:56] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[00:32:56] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[00:32:56] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[00:32:56] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[00:32:56] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[00:32:56] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[00:32:56] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[00:32:56] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[00:32:56] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[00:32:56] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[00:32:56] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[00:32:56] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[00:

[00:33:01] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[00:33:01] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[00:33:01] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[00:33:01] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[00:33:01] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[00:33:01] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[00:33:01] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[00:33:01] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.

[00:33:01] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[00:33:01] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[00:33:01] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[00:33:01] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[00:

[00:33:07] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[00:33:07] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[00:33:07] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[00:33:07] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[00:33:07] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[00:33:07] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[00:33:07] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[00:33:07] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[00:33:07] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[00:33:07] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[00:33:07] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[00:33:07] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[00:

[00:33:12] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[00:33:12] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[00:33:12] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[00:33:12] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[00:33:12] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[00:33:12] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[00:33:12] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.

[00:33:12] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[00:33:12] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[00:33:12] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[00:33:12] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[00:33:12] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[00:

[00:33:18] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[00:33:18] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[00:33:18] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.


[00:33:18] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[00:33:18] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.

[00:33:18] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[00:33:18] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[00:33:18] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[00:33:18] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[00:33:18] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.

[00:33:18] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[00:33:18] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[00:

[00:33:23] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[00:33:23] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[00:33:23] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.

[00:33:23] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[00:33:23] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[00:33:23] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[00:33:23] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[00:33:23] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[00:33:23] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.[00:33:23] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.

[00:33:23] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[00:33:23] INFO: ../src/data/simple_dmatrix.cc:103: Generating new Gradient Index.
[00:

In [88]:
predictions = xgb_model.transform(test_df)

In [89]:
predictions.printSchema()
predictions.show(10, truncate=False)

root
 |-- horse_id: integer (nullable = true)
 |-- label: integer (nullable = true)
 |-- equip_ohe: vector (nullable = true)
 |-- surface_ohe: vector (nullable = true)
 |-- trk_cond_ohe: vector (nullable = true)
 |-- weather_ohe: vector (nullable = true)
 |-- dist_unit_ohe: vector (nullable = true)
 |-- race_type_ohe: vector (nullable = true)
 |-- sex_ohe: vector (nullable = true)
 |-- features: vector (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- prediction: double (nullable = true)
 |-- probability: vector (nullable = true)





+--------+-----+---------------+-------------+--------------+-------------+-------------+--------------+-------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

                                                                                

In [90]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

evaluator = BinaryClassificationEvaluator(labelCol="label", rawPredictionCol="rawPrediction", metricName="areaUnderROC")
auc = evaluator.evaluate(predictions)
print(f"AUC: {auc}")

                                                                                

AUC: 0.9999674354826319


In [91]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print(f"Accuracy: {accuracy}")



Accuracy: 0.9978842584294894


                                                                                

In [92]:
processed_data.columns

['horse_id',
 'label',
 'equip_ohe',
 'surface_ohe',
 'trk_cond_ohe',
 'weather_ohe',
 'dist_unit_ohe',
 'race_type_ohe',
 'sex_ohe',
 'features']

NameError: name 'assembler' is not defined