In [1]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
import numpy as np



In [2]:
# Load the interpolated dataset into a DataFrame, report its (rows, columns)
# shape, then show the first rows as the cell's rich output.
data_df = pd.read_csv(
    'data_interpolated.csv',
    encoding='utf-8-sig',
)
print(data_df.shape)
data_df.head()

(31377, 639)


Unnamed: 0,k_2_by,k_4_by,k_6_by,k_8_by,k_22_by,k_23_by,k_24_by,k_25_by,k_26_by,k_28_by,...,e_326,e_329,e_330,e_332,e_334,e_343,death_year,inferred_birth_year,person_id,birth_year
0,1484.0,1661.0,1714.0,1491.0,1364.5,1485.0,1404.5,1479.5,1479.5,1477.5,...,1776.5,1798.5,1814.0,1778.0,1793.0,1800.0,1824.0,1763.0,85175,1752.0
1,1381.0,1558.0,1611.0,1388.0,1261.5,1382.0,1301.5,1376.5,1376.5,1374.5,...,1673.5,1695.5,1711.0,1675.0,1690.0,1697.0,1721.0,1660.0,124329,1656.0
2,1156.0,1333.0,1386.0,1163.0,1036.5,1157.0,1076.5,1151.5,1151.5,1149.5,...,1448.5,1470.5,1486.0,1450.0,1465.0,1472.0,1496.0,1435.0,199193,1436.0
3,1154.0,1331.0,1384.0,1161.0,1034.5,1155.0,1074.5,1149.5,1149.5,1147.5,...,1446.5,1468.5,1484.0,1448.0,1463.0,1470.0,1494.0,1433.0,199009,1436.0
4,1416.0,1593.0,1646.0,1423.0,1296.5,1417.0,1336.5,1411.5,1411.5,1409.5,...,1708.5,1730.5,1746.0,1710.0,1725.0,1732.0,1761.0,1695.0,83307,1696.0


In [3]:
# Columns excluded from the feature matrix:
#   - 'person_id' and 'birth_year' ('birth_year' is the target `y` below);
#   - any 'Unnamed: N' column: the preview of the frame shows 'Unnamed: 0'
#     holding 0, 1, 2, ..., which looks like a row index saved into the CSV
#     rather than a real feature, so it should not be fed to the model.
# NOTE: dropping the index column changes the model's inputs; metrics and
# model files produced before this change were trained with one extra column.
excluded_cols = ['person_id', 'birth_year']
excluded_cols += [col for col in data_df.columns if str(col).startswith('Unnamed:')]

X = data_df.drop(columns=excluded_cols)
y = data_df['birth_year']

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

## Main model: effect of `n_estimators` on the test-set metrics

n_estimators = 200

RMSE: 35.287644, MAE: 10.521720, R2: 0.991019

Takes 50.1s


---

n_estimators = 300

Takes 1m 14.5s

RMSE: 35.260657, MAE: 10.487344, R2: 0.991033

---

n_estimators = 500

Takes 2m 9.5s

RMSE: 35.275970, MAE: 10.465011, R2: 0.991025

---

n_estimators = 1500

Takes 6m

RMSE: 35.380584, MAE: 10.561264, R2: 0.990972

In [4]:
# Train the gradient-boosted regressor on the training split.
# `reg_alpha` is the documented scikit-learn-API name of the L1 regularisation
# term; the previous `alpha=10` spelling only reached XGBoost through **kwargs,
# which XGBoost documents as not guaranteed to interact properly with
# scikit-learn utilities. `random_state` pins any randomness in
# training to the same seed used for the train/test split.
xg_reg = xgb.XGBRegressor(
    objective='reg:squarederror',
    colsample_bytree=1.0,
    learning_rate=0.1,
    max_depth=5,
    reg_alpha=10,
    n_estimators=200,
    random_state=123,
)
xg_reg.fit(X_train, y_train)

# Score the held-out split.
preds = xg_reg.predict(X_test)

rmse = np.sqrt(mean_squared_error(y_test, preds))
mae = mean_absolute_error(y_test, preds)
r2 = r2_score(y_test, preds)

# Same report format and order as before: RMSE, MAE, R2.
print("RMSE: %f" % (rmse))
print("MAE: %f" % (mae))
print("R2: %f" % (r2))


RMSE: 35.287644
MAE: 10.521720
R2: 0.991019


In [5]:
# Persist the fitted regressor to disk in XGBoost's JSON model format.
MODEL_PATH = 'index_year_model.json'
xg_reg.save_model(MODEL_PATH)

## Find the best hyperparameters using GridSearchCV

RMSE: 41.246865

MAE: 13.085641

R2: 0.987730

Best hyperparameters:

{'colsample_bytree': 0.8, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 200}

Best score (mean 5-fold cross-validated `neg_mean_squared_error`, i.e. negative MSE — closer to 0 is better):

-1822.656004540281

It takes 54m 45.8s

In [6]:

# NOTE: this whole cell is commented out; the notes above record a run time of
# 54m 45.8s for it, presumably the reason it is disabled.
# NOTE(review): if re-enabled, the `xg_reg = ...` line below rebinds `xg_reg`
# to a new estimator that is never fitted directly (GridSearchCV fits clones;
# the fitted result is `best_xg_reg`). Cells that use `xg_reg` afterwards
# (saving the model, validation predictions) would then see that unfitted
# estimator instead of the model trained earlier in the notebook.
# NOTE(review): the preprocessor is fitted on all of X_train before the
# cross-validation, so each CV fold sees imputation/scaling statistics that
# include its own validation rows.

# # Find the best hyperparameters using GridSearchCV
# from sklearn.impute import SimpleImputer
# from sklearn.pipeline import Pipeline
# from sklearn.preprocessing import StandardScaler
# from sklearn.model_selection import GridSearchCV

# # Define preprocessing pipeline
# preprocessor = Pipeline([
#     ('imputer', SimpleImputer(strategy='median')),
#     ('scaler', StandardScaler())
# ])

# # Apply preprocessing to X_train and X_test
# X_train_processed = preprocessor.fit_transform(X_train)
# X_test_processed = preprocessor.transform(X_test)

# # Define XGBoost regressor
# xg_reg = xgb.XGBRegressor(objective='reg:squarederror', random_state=123)

# # Define hyperparameters grid for GridSearchCV
# param_grid = {
#     'n_estimators': [50, 100, 200],
#     'learning_rate': [0.05, 0.1, 0.2],
#     'max_depth': [3, 5, 7],
#     'colsample_bytree': [0.3, 0.6, 0.8]
# }

# # Perform GridSearchCV for hyperparameter tuning
# grid_search = GridSearchCV(estimator=xg_reg, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error')
# grid_search.fit(X_train_processed, y_train)

# # Get best estimator from grid search
# best_xg_reg = grid_search.best_estimator_

# # Predictions and evaluation
# preds = best_xg_reg.predict(X_test_processed)
# rmse = np.sqrt(mean_squared_error(y_test, preds))
# mae = mean_absolute_error(y_test, preds)
# r2 = r2_score(y_test, preds)

# print("RMSE: %f" % (rmse))
# print("MAE: %f" % (mae))
# print("R2: %f" % (r2))

# # Print the best hyperparameters
# print("Best hyperparameters:")
# print(grid_search.best_params_)
# print("Best score:")
# print(grid_search.best_score_)


In [7]:
# Read validation data
# (Disabled. The following cell reads `validation_df`, so uncomment both
# cells together.)
# validation_df = pd.read_csv('validation_set.csv', encoding='utf-8-sig')
# print(validation_df.shape)
# validation_df.head()

In [8]:
# Calculate RMSE, MAE and R2 for the validation set.
# (Disabled. Needs `validation_df` from the cell above and a fitted `xg_reg`.)
# NOTE(review): X_validation must contain exactly the feature columns that
# `xg_reg` was trained on — confirm against the training cell before enabling.
# X_validation = validation_df.drop(['person_id', 'birth_year'], axis=1)
# y_validation = validation_df['birth_year']

# preds_validation = xg_reg.predict(X_validation)

# rmse_validation = np.sqrt(mean_squared_error(y_validation, preds_validation))

# print("RMSE: %f" % (rmse_validation))

# mae_validation = mean_absolute_error(y_validation, preds_validation)

# print("MAE: %f" % (mae_validation))

# r2_validation = r2_score(y_validation, preds_validation)

# print("R2: %f" % (r2_validation))