In [2]:
!pip install xgboost

Defaulting to user installation because normal site-packages is not writeable
Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Collecting xgboost
  Downloading xgboost-2.0.3-py3-none-win_amd64.whl.metadata (2.0 kB)
Downloading xgboost-2.0.3-py3-none-win_amd64.whl (99.8 MB)
   ---------------------------------------- 0.0/99.8 MB ? eta -:--:--
   ---------------------------------------- 0.0/99.8 MB ? eta -:--:--
   ---------------------------------------- 0.1/99.8 MB 544.7 kB/s eta 0:03:04
   ---------------------------------------- 0.1/99.8 MB 1.1 MB/s eta 0:01:34
   ---------------------------------------- 0.2/99.8 MB 1.5 MB/s eta 0:01:06
   ---------------------------------------- 0.4/99.8 MB 1.7 MB/s eta 0:00:58
   ---------------------------------------- 0.5/99.8 MB 2.0 MB/s eta 0:00:51
   ---------------------------------------- 0.7/99.8 MB 2.1 MB/s eta 0:00:48
   ---------------------------------------- 0.8/99.8 MB 2.3 MB/s eta 0:00:44
   ------------------

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import xgboost as xgb
import joblib

# Load the labelled training data
data = pd.read_csv('train.csv')

# Inspect the data
print("Training Data:")
print(data.head())

# Define features and target variable.
# `candidate_id` looks like a per-row identifier (CAN_1, CAN_2, ...) rather than
# a predictor. Left in X it is picked up as an object column below and one-hot
# encoded, which adds roughly one indicator column per training row and lets the
# trees memorise individual rows; for unseen ids every indicator is zero
# (handle_unknown='ignore'), so it cannot help at prediction time either.
# errors='ignore' keeps the cell working if the id column is absent.
X = (
    data
    .drop(columns=['triglyceride_lvl'])
    .drop(columns=['candidate_id'], errors='ignore')
)  # features
y = data['triglyceride_lvl']  # target variable

# Split data into training and testing sets (fixed seed for a reproducible split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define preprocessing for numeric and categorical features.
# Columns are referenced by name, so the fitted ColumnTransformer selects the
# same columns from any later DataFrame; columns it was not given (for example
# `candidate_id` in the prediction file) are dropped by the default
# remainder='drop'.
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object', 'category']).columns

numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())])

categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

# Define the model pipeline: preprocessing and regressor are fitted together, so
# cross-validation re-fits the scaler/encoder on each training fold only.
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', xgb.XGBRegressor(objective='reg:squarederror'))
])

# Define the parameter grid for XGBoost.
# The 'regressor__' prefix routes each parameter to the pipeline's 'regressor' step.
# 3 * 3 * 3 * 3 = 81 candidates, each fitted on 5 folds.
param_grid = {
    'regressor__n_estimators': [100, 200, 300],
    'regressor__max_depth': [3, 4, 5],
    'regressor__learning_rate': [0.01, 0.1, 0.2],
    'regressor__subsample': [0.7, 0.8, 0.9]
}

# Perform Grid Search with Cross-Validation.
# scoring is the negated MSE because GridSearchCV maximises its score.
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)

# Print the best parameters found by Grid Search
print(f"Best parameters: {grid_search.best_params_}")

# Use the best model (the pipeline re-fitted on all of X_train with the best parameters)
best_model = grid_search.best_estimator_

# Save the best model to a file (preprocessing + regressor are stored together)
joblib.dump(best_model, 'best_triglyceride_model.pkl')

# Predict on the test set using the best model
y_pred = best_model.predict(X_test)

# Evaluate the best model on the held-out 20% split
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Improved Mean Absolute Error: {mae}')
print(f'Improved Mean Squared Error: {mse}')
print(f'Improved R² Score: {r2}')

Training Data:
  candidate_id gender  age  height_in_cm  weight_in_lbs  left_eyesight_lvl  \
0        CAN_1   Male   35           170         165.35                1.0   
1        CAN_2   Male   30           180         176.37                0.9   
2        CAN_3   Male   40           165         165.35                1.2   
3        CAN_4   Male   50           175         176.37                1.5   
4        CAN_5   Male   50           165         132.28                1.0   

   right_eyesight_lvl   can_hear_left_ear  can_hear_right_ear  \
0                 1.0  Slightly Defective  Slightly Defective   
1                 1.2  Slightly Defective  Slightly Defective   
2                 1.5  Slightly Defective  Slightly Defective   
3                 1.2  Slightly Defective  Slightly Defective   
4                 1.2  Slightly Defective  Slightly Defective   

   blood_pressure_lvl1  ...  bad_cholestrol_lvl  hemoglobin_lvl  urea_lvl  \
0                120.0  ...               126.0 

In [4]:
# Read the rows that need predictions.
test_data = pd.read_csv(filepath_or_buffer='test.csv')

# Restore the fitted pipeline written by the training cell. This rebinds
# `model` from the unfitted pipeline definition to the fitted estimator.
# NOTE: joblib.load unpickles the file, so only load artifacts you produced
# or otherwise trust.
model = joblib.load(filename='best_triglyceride_model.pkl')

In [5]:
# Run the loaded pipeline over the new rows; it receives the full frame.
triglyceride_pred = model.predict(test_data)

# Assemble the output table: the id column from the input, with the
# predictions attached as a second column aligned row-for-row.
results = test_data[['candidate_id']].assign(triglyceride_lvl=triglyceride_pred)

In [6]:
# Write the predictions table to disk as CSV, without the DataFrame index column.
results.to_csv(path_or_buf='predictions_2.csv', index=False)