In [16]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from autogluon.tabular import TabularPredictor
import joblib

In [17]:
# Columns that are not useful for this analysis. Names absent from the file
# are silently skipped (errors='ignore').
to_drop = ['MeetID', 'Division', 'Place', 'Squat4Kg', 'Bench4Kg', 'Deadlift4Kg']

# Load the raw export and clean it in a single chain:
#   1. drop the unneeded columns,
#   2. encode sex numerically (F -> 0, M -> 1; any other value becomes NaN),
#   3. remove exact duplicate rows (the original row index is kept).
df = (
    pd.read_csv('openpowerlifting.csv')
    .drop(columns=to_drop, errors='ignore')
    .assign(Sex=lambda frame: frame['Sex'].map({'F': 0, 'M': 1}))
    .drop_duplicates()
)

In [18]:
# Preview the first five rows of the cleaned frame.
df.head(n=5)

Unnamed: 0,Name,Sex,Equipment,Age,BodyweightKg,WeightClassKg,BestSquatKg,BestBenchKg,BestDeadliftKg,TotalKg,Wilks
0,Angie Belk Terry,0,Wraps,47.0,59.6,60.0,47.63,20.41,70.31,138.35,155.05
1,Dawn Bogart,0,Single-ply,42.0,58.51,60.0,142.88,95.25,163.29,401.42,456.38
3,Dawn Bogart,0,Raw,42.0,58.51,60.0,,95.25,,95.25,108.29
4,Destiny Dula,0,Raw,18.0,63.68,67.5,,31.75,90.72,122.47,130.47
5,Courtney Norris,0,Wraps,28.0,62.41,67.5,170.1,77.11,145.15,392.36,424.4


In [19]:
# Impute missing ages with a linear regression on sex, bodyweight and lifts.
features_age = ['Sex', 'BodyweightKg', 'BestSquatKg', 'BestBenchKg', 'BestDeadliftKg', 'TotalKg']

# Only rows with every predictor present are usable, both for fitting
# (age known) and for prediction (age missing).
df_known_age = df[df['Age'].notnull()].dropna(subset=features_age)
df_missing_age = df[df['Age'].isnull()].dropna(subset=features_age)

X_train_age = df_known_age[features_age]
y_train_age = df_known_age['Age']

reg_age = LinearRegression()
reg_age.fit(X_train_age, y_train_age)

# Start from the recorded age; only the rows selected for imputation are
# overwritten. Rows with a missing age AND an incomplete predictor set keep NaN.
df['ImputedAge'] = df['Age']

# scikit-learn raises ValueError when asked to predict on zero samples, so
# skip the prediction step when no row needs (or qualifies for) imputation.
if df_missing_age.empty:
    predicted_ages = np.array([])
else:
    predicted_ages = reg_age.predict(df_missing_age[features_age])
    df.loc[df_missing_age.index, 'ImputedAge'] = predicted_ages

# Persist the fitted imputer so it can be reused outside this notebook.
joblib.dump(reg_age, 'age_imputer_model.pkl')

['age_imputer_model.pkl']

In [20]:
# Bin edges for age groups. With right=False every bin is left-closed and
# right-open, e.g. [12, 16) -> '12-15y'; the last bin is [85, 100).
bins = [0, 12, 16, 18, 22, 25, 30, 35, 40, 45, 50, 60, 70, 85, 100]
labels = [
    '0-11y', '12-15y', '16-17y', '18-21y', '22-24y', '25-29y', '30-34y',
    '35-39y', '40-44y', '45-49y', '50-59y', '60-69y', '70-84y', '85-100y',
]

# Ages that are missing or fall outside the bin range come back as NaN.
age_group = pd.cut(df['ImputedAge'], bins=bins, labels=labels, right=False)
has_age_group = age_group.notna()

# Unbinned rows are labelled 'Unknown' in the stored column.
df['AgeGroup'] = age_group.cat.add_categories('Unknown').fillna('Unknown')

# Drop the rows whose age is still unknown.
df = df[has_age_group]


In [21]:
target = 'Wilks'
features_wilks = ['Sex', 'BodyweightKg', 'BestSquatKg', 'BestBenchKg', 'BestDeadliftKg', 'TotalKg']

# Data preparation: keep only rows with a known target, restricted to the
# model inputs plus the target column.
df_clean = df.loc[df[target].notna(), features_wilks + [target]]

# Split: a seeded 80% random sample for training, the remaining rows for testing.
train_data = df_clean.sample(frac=0.8, random_state=42)
in_train = df_clean.index.isin(train_data.index)
test_data = df_clean.loc[~in_train]

# Training (fit() returns the predictor itself).
predictor = TabularPredictor(label=target)
predictor = predictor.fit(train_data)

No path specified. Models will be saved in: "AutogluonModels/ag-20260118_124416"
Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.5.0
Python Version:     3.10.11
Operating System:   Darwin
Platform Machine:   arm64
Platform Version:   Darwin Kernel Version 25.2.0: Tue Nov 18 21:09:55 PST 2025; root:xnu-12377.61.12~1/RELEASE_ARM64_T8103
CPU Count:          8
Pytorch Version:    2.9.1
CUDA Version:       CUDA is not available
Memory Avail:       1.57 GB / 8.00 GB (19.6%)
Disk Space Avail:   19.53 GB / 228.27 GB (8.6%)
No presets specified! To achieve strong results with AutoGluon, it is recommended to use the available presets. Defaulting to `'medium'`...
	Recommended Presets (For more details refer to https://auto.gluon.ai/stable/tutorials/tabular/tabular-essentials.html#presets):
	presets='extreme'  : New in v1.5: The state-of-the-art for tabular data. Massively better than 'best' on datasets <100000 samples by using new Tabular Foundation Models (TFMs) meta-learned on https://tab

[1000]	valid_set's rmse: 2.87634
[2000]	valid_set's rmse: 2.48418
[3000]	valid_set's rmse: 2.31666
[4000]	valid_set's rmse: 2.20185
[5000]	valid_set's rmse: 2.13222
[6000]	valid_set's rmse: 2.0692
[7000]	valid_set's rmse: 2.02892
[8000]	valid_set's rmse: 1.99394
[9000]	valid_set's rmse: 1.96614
[10000]	valid_set's rmse: 1.93991


	-1.9399	 = Validation score   (-root_mean_squared_error)
	65.58s	 = Training   runtime
	0.73s	 = Validation runtime
Fitting model: LightGBM ...
	Fitting with cpus=8, gpus=0, mem=0.1/2.2 GB


[1000]	valid_set's rmse: 2.08864
[2000]	valid_set's rmse: 1.86368
[3000]	valid_set's rmse: 1.77042
[4000]	valid_set's rmse: 1.71034
[5000]	valid_set's rmse: 1.67836
[6000]	valid_set's rmse: 1.66208
[7000]	valid_set's rmse: 1.65458
[8000]	valid_set's rmse: 1.63891
[9000]	valid_set's rmse: 1.62913
[10000]	valid_set's rmse: 1.63962


	-1.6283	 = Validation score   (-root_mean_squared_error)
	51.89s	 = Training   runtime
	0.38s	 = Validation runtime
Fitting model: RandomForestMSE ...
		To set the same value for all models, do the following when calling predictor.fit: `predictor.fit(..., ag_args_fit={"ag.max_memory_usage_ratio": VALUE})`
		Setting "ag.max_memory_usage_ratio" to values above 1 may result in out-of-memory errors. You may consider using a machine with more memory as a safer alternative.
	Fitting with cpus=8, gpus=0, mem=1.1/2.3 GB
	-1.3365	 = Validation score   (-root_mean_squared_error)
	15.87s	 = Training   runtime
	0.04s	 = Validation runtime
Fitting model: CatBoost ...
	Fitting with cpus=8, gpus=0
	-1.577	 = Validation score   (-root_mean_squared_error)
	44.94s	 = Training   runtime
	0.01s	 = Validation runtime
Fitting model: ExtraTreesMSE ...
	To force training the model, specify the model hyperparameter "ag.max_memory_usage_ratio" to a larger value (currently 1.0, set to >=1.09 to avoid the error)

[1000]	valid_set's rmse: 1.76956
[2000]	valid_set's rmse: 1.62816
[3000]	valid_set's rmse: 1.59344
[4000]	valid_set's rmse: 1.56341
[5000]	valid_set's rmse: 1.54229
[6000]	valid_set's rmse: 1.52434
[7000]	valid_set's rmse: 1.51321
[8000]	valid_set's rmse: 1.50805
[9000]	valid_set's rmse: 1.50528
[10000]	valid_set's rmse: 1.50134


	-1.5013	 = Validation score   (-root_mean_squared_error)
	176.01s	 = Training   runtime
	0.99s	 = Validation runtime
Fitting model: WeightedEnsemble_L2 ...
	Fitting 1 model on all data | Fitting with cpus=8, gpus=0, mem=0.0/2.0 GB
	Ensemble Weights: {'RandomForestMSE': 0.5, 'CatBoost': 0.273, 'LightGBMLarge': 0.227}
	-0.9841	 = Validation score   (-root_mean_squared_error)
	0.01s	 = Training   runtime
	0.0s	 = Validation runtime
AutoGluon training complete, total runtime = 395.22s ... Best model: WeightedEnsemble_L2 | Estimated inference throughput: 2417.1 rows/s (2498 batch size)
TabularPredictor saved. To load, use: predictor = TabularPredictor.load("/Users/dominikpiwowarczyk/Desktop/SUMLProjekt/SUML2025-6/AutogluonModels/ag-20260118_124416")


In [24]:
# Score the trained predictor on the held-out rows.
performance = predictor.evaluate(test_data)
print(performance)

# Test on new data: a single hand-written lifter record.
new_lifter = {
    'Sex': 1,
    'BodyweightKg': 72.0,
    'BestSquatKg': 154.0,
    'BestBenchKg': 116.0,
    'BestDeadliftKg': 184.0,
    'TotalKg': 454.0,
}
new_data = pd.DataFrame([new_lifter])

predicted_wilks = predictor.predict(new_data)
# Positional access: the prediction for the first (and only) input row.
print(f"Przewidywany Wilks: {predicted_wilks.iloc[0]:.2f}")

{'root_mean_squared_error': np.float64(-1.3731866006821103), 'mean_squared_error': -1.8856414402928894, 'mean_absolute_error': -0.4092736794747087, 'r2': 0.9998113342295244, 'pearsonr': 0.9999057412094321, 'median_absolute_error': -0.23370666503905113}
Przewidywany Wilks: 333.38
