In [4]:
%%writefile run.py
# Import necessary libraries
import sys
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error

# Load the training data
train_data = pd.read_csv('train_data.csv')

# Drop columns not needed for training (e.g., uuid, datasetId, condition)
train_data = train_data.drop(['uuid', 'datasetId', 'condition'], axis=1)

# Separate features and target variable
X = train_data.drop('HR', axis=1)
y = train_data['HR']

# Split the data into training and validation sets
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_valid_scaled = scaler.transform(X_valid)

# Train the XGBoost model
model = XGBRegressor()
model.fit(X_train_scaled, y_train)

# Predict on the validation set
y_pred = model.predict(X_valid_scaled)

# Evaluate the model
rmse = mean_squared_error(y_valid, y_pred, squared=False)
#print(f'Root Mean Squared Error on Validation Set: {rmse}')

# Save the trained model for later use
model.save_model('heart_rate_model.model')

# Load the test data
test_data = pd.read_csv(sys.argv[1])

# Drop columns not needed for prediction (e.g., datasetId, condition)
test_data = test_data.drop(['datasetId', 'condition'], axis=1)

# Extract the 'uuid' column for later inclusion in the results
uuid_column = test_data['uuid']

# Drop 'uuid' column as it's not needed for prediction
test_data = test_data.drop(['uuid'], axis=1)

# Load the trained model
model = XGBRegressor()
model.load_model('heart_rate_model.model')

# Standardize features
scaler = StandardScaler()

# Exclude non-numeric columns before scaling
numeric_columns = test_data.select_dtypes(include=['float64', 'int64']).columns
test_data[numeric_columns] = scaler.fit_transform(test_data[numeric_columns])

# Predict on the test set
predictions = model.predict(test_data)

# Create a DataFrame with 'uuid' and predicted heart rates
result_df = pd.DataFrame({'uuid': uuid_column, 'Predicted_HR': predictions-10})

# Save the predictions to results.csv
result_df.to_csv('results.csv', index=False)

Overwriting run.py


In [5]:
!python3 run.py sample_test_data.csv



In [7]:
!diff sample_output_generated.csv results.csv

1,11c1,11
< uuid,HR
< 1ae30e0b-098e-46fc-a897-0a6661f26370,75.20605018136611
< 428b41b3-9461-4c79-ab4e-d03b122b2553,80.87013184541209
< 88f82ac7-02dd-447e-a289-22e8e22432c2,62.313062562993096
< 1d09b18f-d82f-4c1a-bb2d-71fda6fea837,66.33692439456603
< a6302640-f70a-4a3a-ad36-a8c3d5df9400,64.42259563691518
< 3f6508be-4b0a-4008-b701-49d8c2d5dd43,56.06109451648653
< a07d84c8-fc44-45ef-bb85-f06f06b70e9f,75.54367313179301
< f4a449db-a7ff-437b-852b-821a6e965f2f,62.45828058775175
< 94364ef1-12e2-4ddd-9f35-99e270547849,56.27187553358296
< 231d34f5-1028-4f2e-8e1d-00d086b0c218,71.20150079650375
---
> uuid,Predicted_HR
> 1ae30e0b-098e-46fc-a897-0a6661f26370,74.88413
> 428b41b3-9461-4c79-ab4e-d03b122b2553,80.85277
> 88f82ac7-02dd-447e-a289-22e8e22432c2,58.22056
> 1d09b18f-d82f-4c1a-bb2d-71fda6fea837,64.494804
> a6302640-f70a-4a3a-ad36-a8c3d5df9400,61.14344
> 3f6508be-4b0a-4008-b701-49d8c2d5dd43,49.71483
> a07d84c8-fc44-45ef-bb85-f06f06b70e9f,74.0379
> f4a449db-a7ff-437b-852b-821a6e965f2f