In [5]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
import numpy as np

In [6]:
# Load the interpolated training table. The 'utf-8-sig' codec strips a leading
# byte-order mark if the file has one.
# NOTE(review): the preview below shows an 'Unnamed: 0' column, presumably a row
# index written out by an earlier to_csv call -- confirm whether it should be
# kept as a column.
data_df = pd.read_csv(
    'data_interpolated.csv',
    encoding='utf-8-sig',
)
print(data_df.shape)
data_df.head()

(31377, 638)


Unnamed: 0,k_2_by,k_4_by,k_6_by,k_8_by,k_22_by,k_23_by,k_24_by,k_25_by,k_26_by,k_28_by,...,e_323,e_326,e_329,e_330,e_332,e_334,e_343,person_id,birth_year,death_year
0,1480.0,1650.0,1703.0,1480.0,1353.5,1474.0,1393.5,1468.5,1468.5,1466.5,...,1773.5,1765.5,1787.5,1803.0,1767.0,1782.0,1787.0,85175,1752.0,1824.0
1,1384.0,1554.0,1607.0,1384.0,1257.5,1378.0,1297.5,1372.5,1372.5,1370.5,...,1677.5,1669.5,1691.5,1707.0,1671.0,1686.0,1691.0,124329,1656.0,1717.0
2,1164.0,1334.0,1387.0,1164.0,1037.5,1158.0,1077.5,1152.5,1152.5,1150.5,...,1457.5,1449.5,1471.5,1487.0,1451.0,1466.0,1471.0,199193,1436.0,1497.0
3,1164.0,1334.0,1387.0,1164.0,1037.5,1158.0,1077.5,1152.5,1152.5,1150.5,...,1457.5,1449.5,1471.5,1487.0,1451.0,1466.0,1471.0,199009,1436.0,1497.0
4,1424.0,1594.0,1647.0,1424.0,1297.5,1418.0,1337.5,1412.5,1412.5,1410.5,...,1717.5,1709.5,1731.5,1747.0,1711.0,1726.0,1731.0,83307,1696.0,1761.0


In [7]:
# Fit an XGBoost regressor that predicts 'birth_year' and score it on a 20%
# hold-out split of the training file.
#
# The target is 'birth_year'; the features are every other column except the
# 'person_id' identifier.
# NOTE(review): 'Unnamed: 0' and 'death_year' are still part of X. The former
# looks like a saved row index and the latter may leak the target -- confirm
# both are intended as features before trusting the scores below.
y = data_df['birth_year']
X = data_df.drop(columns=['person_id', 'birth_year'])

# Fixed random_state keeps the split identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=123,
)

# Hyper-parameters gathered in one place so they are easy to scan and tweak.
xgb_params = {
    'objective': 'reg:squarederror',
    'colsample_bytree': 0.8,
    'learning_rate': 0.2,
    'max_depth': 7,
    'alpha': 10,
    'n_estimators': 200,
}
xg_reg = xgb.XGBRegressor(**xgb_params)
xg_reg.fit(X_train, y_train)

preds = xg_reg.predict(X_test)

# Compute all three hold-out metrics first, then report them in the usual order.
rmse = np.sqrt(mean_squared_error(y_test, preds))
mae = mean_absolute_error(y_test, preds)
r2 = r2_score(y_test, preds)

print("RMSE: %f" % rmse)
print("MAE: %f" % mae)
print("R2: %f" % r2)


RMSE: 1.148062
MAE: 0.461774
R2: 0.999990


In [8]:
# Load the separate validation table, using the same BOM-tolerant encoding as
# the training file, then show its shape and first rows.
validation_df = pd.read_csv(
    'validation_set.csv',
    encoding='utf-8-sig',
)
print(validation_df.shape)
validation_df.head()

(1652, 638)


Unnamed: 0,k_2_by,k_4_by,k_6_by,k_8_by,k_22_by,k_23_by,k_24_by,k_25_by,k_26_by,k_28_by,...,e_323,e_326,e_329,e_330,e_332,e_334,e_343,person_id,birth_year,death_year
0,,,,,,,,,,,...,,,,,,,,62632,1732.0,1784.0
1,,,,,,,,,,,...,,,,,,,,34345,1824.0,1890.0
2,,,,,,,,,,,...,,,,,,,,206202,1542.0,
3,,,,,,,,,,,...,,,,,,,,142346,635.0,698.0
4,,,,,,,,,,,...,,,,,,,,131203,1528.0,1591.0


In [9]:
# Calculate RMSE, MAE and R2 of the fitted model on the validation set.
#
# Features are selected by the exact column list the model was fitted on
# (X_train.columns) rather than by re-dropping columns from validation_df.
# This keeps the column set and order aligned with training, and raises a
# KeyError that names the missing column if the validation file lacks one.
# NOTE(review): the preview of validation_df shows NaN in every displayed
# feature column for the first rows. If that holds for the whole file, the
# model is scoring rows with almost no usable inputs -- confirm the validation
# file went through the same interpolation step as the training file.
X_validation = validation_df[list(X_train.columns)]
y_validation = validation_df['birth_year']

preds_validation = xg_reg.predict(X_validation)

rmse_validation = np.sqrt(mean_squared_error(y_validation, preds_validation))

print("RMSE: %f" % (rmse_validation))

mae_validation = mean_absolute_error(y_validation, preds_validation)

print("MAE: %f" % (mae_validation))

r2_validation = r2_score(y_validation, preds_validation)

print("R2: %f" % (r2_validation))

RMSE: 1787.060249
MAE: 1740.104950
R2: -21.801090
