In [1]:
# Run configuration consumed by the next cell: tuning parameters ("params"),
# output artifact paths ("artifacts"), plus the working and data directories.
# NOTE(review): presumably injected by the pipeline runner for each run — confirm.
# The absolute paths below are machine-specific.
PIPELINE_CONTEXT = {"_runtime_context_name": "PIPELINE_CONTEXT", "params": {"n_clusters": "auto", "n_clusters_min": 6, "n_clusters_max": 50, "n_clusters_criterion": "silhouette", "pca_components": "auto", "pca_variance": 0.9, "pca_max_components": 50, "random_state": 42}, "artifacts": {"dir": "/Users/savvasneofytou/Desktop/basketball/artifacts/run_20250828_021400", "processed_data": "/Users/savvasneofytou/Desktop/basketball/artifacts/run_20250828_021400/processed.parquet", "model_file": "/Users/savvasneofytou/Desktop/basketball/artifacts/run_20250828_021400/kmeans_model.joblib", "scaler_file": "/Users/savvasneofytou/Desktop/basketball/artifacts/run_20250828_021400/scaler.joblib", "pca_file": "/Users/savvasneofytou/Desktop/basketball/artifacts/run_20250828_021400/pca.joblib", "cluster_summary": "/Users/savvasneofytou/Desktop/basketball/artifacts/run_20250828_021400/cluster_summary.json", "elbow_plot": "/Users/savvasneofytou/Desktop/basketball/artifacts/run_20250828_021400/elbow_plot.png", "silhouette_plot": "/Users/savvasneofytou/Desktop/basketball/artifacts/run_20250828_021400/silhouette_plot.png", "db_plot": "/Users/savvasneofytou/Desktop/basketball/artifacts/run_20250828_021400/db_plot.png", "ch_plot": "/Users/savvasneofytou/Desktop/basketball/artifacts/run_20250828_021400/ch_plot.png", "selection": "/Users/savvasneofytou/Desktop/basketball/artifacts/run_20250828_021400/selection.json"}, "cwd": "/Users/savvasneofytou/Desktop/basketball/pipeline", "data_dir": "/Users/savvasneofytou/Desktop/basketball/data/output_by_college_clean"}

**Pipeline Context**

In [2]:
# Resolve the run context: use PIPELINE_CONTEXT when it has been defined,
# otherwise fall back to relative defaults so the notebook also runs standalone.
try:
    ctx = PIPELINE_CONTEXT
except NameError:
    # The fallback mirrors the artifact keys of the pipeline-provided context
    # (including db_plot / ch_plot), so PATHS[...] lookups behave the same in
    # both modes instead of raising KeyError only when run standalone.
    ctx = {
        "params": {},
        "artifacts": {
            "dir": "artifacts",
            "processed_data": "artifacts/processed.parquet",
            "model_file": "artifacts/kmeans_model.joblib",
            "scaler_file": "artifacts/scaler.joblib",
            "pca_file": "artifacts/pca.joblib",
            "cluster_summary": "artifacts/cluster_summary.json",
            "elbow_plot": "artifacts/elbow_plot.png",
            "silhouette_plot": "artifacts/silhouette_plot.png",
            "db_plot": "artifacts/db_plot.png",
            "ch_plot": "artifacts/ch_plot.png",
            "selection": "artifacts/selection.json",
        },
        "data_dir": "data",
    }

from pathlib import Path
import json

# Unpack the run context into the module-level handles used by later cells:
# P = tuning parameters, PATHS = artifact locations, DATA_DIR = input data folder.
P, PATHS, DATA_DIR = (
    ctx.get("params", {}),
    ctx.get("artifacts", {}),
    ctx.get("data_dir", "data"),
)

# Create the artifact directory up front so later cells can write into it.
_artifact_dir = Path(PATHS["dir"])
_artifact_dir.mkdir(parents=True, exist_ok=True)

def save_json(obj, path):
    """Write ``obj`` to ``path`` as 2-space-indented JSON.

    Missing parent directories are created first. Values the JSON encoder
    cannot serialise natively are converted with ``float``.
    """
    target = Path(path)
    target.parent.mkdir(parents=True, exist_ok=True)
    with target.open("w") as handle:
        json.dump(obj, handle, indent=2, default=float)

def load_json(path, default=None):
    """Return the parsed JSON stored at ``path``.

    When the file does not exist, ``default`` is returned instead; a ``None``
    default is replaced by a fresh empty dict.
    """
    source = Path(path)
    if not source.exists():
        return default if default is not None else {}
    return json.loads(source.read_text())


**Imports**

In [3]:
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import StandardScaler
import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns


**Data Importing**

In [4]:
# Load the dataset
# NOTE(review): the path is resolved against the kernel's working directory and
# does not use DATA_DIR — confirm this is the intended location.
data = pd.read_csv('player_features_cleaned.csv')

# Basic checks
print("Shape:", data.shape)
print("\nColumns:\n", data.columns.tolist())
print("\nData types & non‐null counts:")
# DataFrame.info() writes its report itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
data.info()

# Quick summary statistics
print("\nSummary statistics:")
print(data.describe().T)

# Check for missing values
missing = data.isnull().sum()
print("\nMissing values per column:")
if missing.sum() == 0:
    print("No missing values found.")
else:
    # Only list the columns that actually contain gaps
    print(missing[missing > 0])


Shape: (2390, 81)

Columns:
 ['college', 'season', 'player_number_ind', 'player_ind', 'gp_ind', 'gs_ind', 'minutes_tot_ind', 'scoring_pts_ind', 'rebounds_tot_ind', 'ast_ind', 'stl_ind', 'blk_ind', 'to_ind', 'pts_per40', 'reb_per40', 'ast_per40', 'stl_per40', 'blk_per40', 'to_per40', 'eFG_pct', 'TS_pct', 'USG_pct', 'ORB_pct', 'DRB_pct', 'AST_pct', 'AST_per_TO', '3pt_3pt_pct_ind', 'three_per40', 'threeA_per40', 'three_per100', 'threeA_rate', 'DRCR', 'STL_TO_ratio', 'def_stops_per100', 'DPMR', 'TUSG_pct', 'Gravity', 'PPT', 'Spacing', 'Assist_to_Usage', 'APC', 'PEF', 'OEFF', 'TOV_pct', 'SEM', 'PEI', 'BoxCreation', 'OLI', 'IPM', 'threeA_per100', '2pt_pct', 'FTr', 'PPP', 'possessions', 'scoring_pts_per100', 'ast_per100', 'rebounds_tot_per100', 'stl_per100', 'blk_per100', 'to_per100', 'mins_per_game', 'pts_per_game', 'ast_per_game', 'reb_per_game', 'stl_per_game', 'blk_per_game', 'to_per_game', 'scoring_pts_share', 'ast_share', 'rebounds_tot_share', 'stl_share', 'blk_share', 'to_share', 'team

**Histograms, Density Plots and Box Plots**

In [5]:
# Distribution check on the raw data: one subplot row per selected feature,
# histogram on the left and kernel-density estimate on the right.
features = [
    'pts_per40', 'eFG_pct', 'AST_per_TO', 'reb_per40',
    'stl_per40', 'blk_per40', 'FTr', 'PPP',
    'threeA_rate', 'threeA_per40', 'three_per40', 'three_per100',
]

n_feat = len(features)
fig, axes = plt.subplots(n_feat, 2, figsize=(12, 4 * n_feat))

for row, name in enumerate(features):
    hist_ax, kde_ax = axes[row]

    sns.histplot(data[name], bins=30, ax=hist_ax, kde=False)
    hist_ax.set_title(f"{name} – Histogram")

    sns.kdeplot(data[name], fill=True, ax=kde_ax)
    kde_ax.set_title(f"{name} – Density")

plt.tight_layout()


In [6]:
# Outlier check on the raw data: one horizontal boxplot per selected feature,
# laid out side by side on a shared y-axis.
n_panels = len(features)
fig, axes = plt.subplots(1, n_panels, figsize=(4 * n_panels, 4), sharey=True)

for idx, name in enumerate(features):
    panel = axes[idx]
    sns.boxplot(x=data[name], ax=panel)
    panel.set_title(f"{name} – Boxplot")

plt.tight_layout()


**Data Smoothing and Manipulation**

In [7]:
# Clean on an independent (deep) copy so the raw `data` frame stays untouched
df_clean = data.copy(deep=True)
print("Starting shape:", df_clean.shape)


Starting shape: (2390, 81)


In [8]:
# Filter out very low-minute players.
# Season rows with fewer than MIN_TOTAL_MINUTES total minutes are dropped
# (250 is a common threshold used by basketball analysts).
MIN_TOTAL_MINUTES = 250

# Coerce rather than astype(int): a missing or non-numeric minutes value becomes
# NaN and is filtered out, instead of aborting the cell with a cast error.
minutes_played = pd.to_numeric(df_clean['minutes_tot_ind'], errors='coerce')
df_clean = df_clean[minutes_played >= MIN_TOTAL_MINUTES].reset_index(drop=True)
print("After minutes filter:", df_clean.shape)


After minutes filter: (1406, 81)


In [9]:
# Compress the right tail of the skewed features with log(1 + x).
# NOTE: this overwrites df_clean in place, so re-running the cell applies the
# transform a second time.
skew_cols = [
    'pts_per40', 'reb_per40', 'stl_per40', 'blk_per40', 'FTr', 'PPP',
    'threeA_rate', 'threeA_per40', 'three_per40', 'three_per100',
]
df_clean[skew_cols] = np.log1p(df_clean[skew_cols])
print("Applied log1p to:", skew_cols)


Applied log1p to: ['pts_per40', 'reb_per40', 'stl_per40', 'blk_per40', 'FTr', 'PPP', 'threeA_rate', 'threeA_per40', 'three_per40', 'three_per100']


In [10]:
# Rate stats where an exact zero is read as "no attempts" rather than a real
# value: blank those zeros out, then back-fill every gap with the median of the
# remaining observations in that column.
rate_cols = ['eFG_pct', 'TS_pct', 'AST_per_TO']
for c in rate_cols:
    observed = df_clean[c].mask(df_clean[c] == 0)
    median = observed.median()
    df_clean[c] = observed.fillna(median)
print("Imputed zeros->NaN, then filled medians for:", rate_cols)


Imputed zeros->NaN, then filled medians for: ['eFG_pct', 'TS_pct', 'AST_per_TO']


In [11]:
# Standardise every numeric column except the identifier columns
# (player_number_ind and player_id are left unscaled).
id_like = {'player_number_ind', 'player_id'}
all_numeric = df_clean.select_dtypes(include=[np.number]).columns
numeric_cols = [c for c in all_numeric if c not in id_like]

# Fit the scaler on exactly those columns and write the z-scores back in place
scaler = StandardScaler()
scaled_values = scaler.fit_transform(df_clean[numeric_cols])
df_clean[numeric_cols] = scaled_values

# Persist the fitted scaler alongside the other run artifacts
scaler_path = PATHS["scaler_file"]
joblib.dump(scaler, scaler_path)
print("Saved Scaler model to:", scaler_path)

print("Scaled numeric columns (player_number_ind kept intact):")
print(numeric_cols)
df_clean.head()


Saved Scaler model to: /Users/savvasneofytou/Desktop/basketball/artifacts/run_20250828_021400/scaler.joblib
Scaled numeric columns (player_number_ind kept intact):
['gp_ind', 'gs_ind', 'minutes_tot_ind', 'scoring_pts_ind', 'rebounds_tot_ind', 'ast_ind', 'stl_ind', 'blk_ind', 'to_ind', 'pts_per40', 'reb_per40', 'ast_per40', 'stl_per40', 'blk_per40', 'to_per40', 'eFG_pct', 'TS_pct', 'USG_pct', 'ORB_pct', 'DRB_pct', 'AST_pct', 'AST_per_TO', '3pt_3pt_pct_ind', 'three_per40', 'threeA_per40', 'three_per100', 'threeA_rate', 'DRCR', 'STL_TO_ratio', 'def_stops_per100', 'DPMR', 'TUSG_pct', 'Gravity', 'PPT', 'Spacing', 'Assist_to_Usage', 'APC', 'PEF', 'OEFF', 'TOV_pct', 'SEM', 'PEI', 'BoxCreation', 'OLI', 'IPM', 'threeA_per100', '2pt_pct', 'FTr', 'PPP', 'possessions', 'scoring_pts_per100', 'ast_per100', 'rebounds_tot_per100', 'stl_per100', 'blk_per100', 'to_per100', 'mins_per_game', 'pts_per_game', 'ast_per_game', 'reb_per_game', 'stl_per_game', 'blk_per_game', 'to_per_game', 'scoring_pts_share',

Unnamed: 0,college,season,player_number_ind,player_ind,gp_ind,gs_ind,minutes_tot_ind,scoring_pts_ind,rebounds_tot_ind,ast_ind,...,blk_share,to_share,team_TS_pct,TS_diff,ast_per_fgm,tov_rate,game_score,game_score_per40,min_share,player_id
0,many_seasons,2017-18,35,Fournier Toby,1.08619,-1.411959,0.110858,1.93227,1.26096,-0.939509,...,1.221714,0.208486,1.059275,0.996571,-0.928354,-1.018024,1.957189,2.805084,0.065659,0
1,many_seasons,2017-18,3,Jackson Ashlon,1.469808,1.526041,1.623081,1.916469,-0.34221,0.919878,...,-0.534653,-0.174507,1.059275,-0.069276,-0.313545,-1.451692,1.253363,0.465753,1.626308,1
2,many_seasons,2017-18,5,Okananwa Oluchi,1.469808,-1.411959,0.616302,1.237025,1.424548,0.54219,...,-0.42488,0.323384,1.059275,0.646524,-0.366501,-0.61625,1.683116,1.746039,0.685163,2
3,many_seasons,2017-18,24,Richardson Reigan,1.469808,1.526041,1.343648,1.110617,-0.096827,1.23946,...,-0.369994,0.897875,1.059275,-0.937572,-0.127927,-0.495772,0.334195,-0.269382,1.365092,3
4,many_seasons,2017-18,12,Thomas Delaney,1.469808,1.526041,0.488913,0.360069,1.048294,-0.35845,...,-0.42488,-0.557501,1.059275,0.86409,-0.582595,-0.718581,0.945731,0.976259,0.56608,4


In [12]:
# Sanity check: report the frame's dimensions once all cleaning steps have run
final_shape = df_clean.shape
print("Final shape after all cleaning steps:", final_shape)

Final shape after all cleaning steps: (1406, 81)


Visualise Density and Histogram plots again

In [13]:
# Same distribution check as before, now on the cleaned frame: per feature, a
# histogram (left column) and a kernel-density estimate (right column).
features = [
    'pts_per40', 'eFG_pct', 'AST_per_TO', 'reb_per40',
    'stl_per40', 'blk_per40', 'FTr', 'PPP',
    'threeA_rate', 'threeA_per40', 'three_per40', 'three_per100',
]

fig, axes = plt.subplots(len(features), 2, figsize=(12, 4 * len(features)))

for (hist_ax, kde_ax), name in zip(axes, features):
    series = df_clean[name]

    sns.histplot(series, bins=30, ax=hist_ax, kde=False)
    hist_ax.set_title(f"{name} – Histogram")

    sns.kdeplot(series, fill=True, ax=kde_ax)
    kde_ax.set_title(f"{name} – Density")

plt.tight_layout()


In [14]:
# Outlier check on the cleaned frame: one boxplot per selected feature,
# arranged in a single row with a shared y-axis.
panel_count = len(features)
fig, axes = plt.subplots(1, panel_count, figsize=(4 * panel_count, 4), sharey=True)

for panel, name in zip(axes, features):
    sns.boxplot(x=df_clean[name], ax=panel)
    panel.set_title(f"{name} – Boxplot")

plt.tight_layout()


**Export Cleaned Dataset**


In [15]:
# Persist the cleaned frame as CSV (row index omitted) and echo a short preview
clean_csv_name = 'df_clean.csv'
df_clean.to_csv(clean_csv_name, index=False)
print("df_clean.csv saved, shape:", df_clean.shape)
print("df_clean head:", df_clean.head(), sep="\n")

df_clean.csv saved, shape: (1406, 81)
df_clean head:
        college   season  player_number_ind         player_ind    gp_ind  \
0  many_seasons  2017-18                 35      Fournier Toby  1.086190   
1  many_seasons  2017-18                  3     Jackson Ashlon  1.469808   
2  many_seasons  2017-18                  5    Okananwa Oluchi  1.469808   
3  many_seasons  2017-18                 24  Richardson Reigan  1.469808   
4  many_seasons  2017-18                 12     Thomas Delaney  1.469808   

     gs_ind  minutes_tot_ind  scoring_pts_ind  rebounds_tot_ind   ast_ind  \
0 -1.411959         0.110858         1.932270          1.260960 -0.939509   
1  1.526041         1.623081         1.916469         -0.342210  0.919878   
2 -1.411959         0.616302         1.237025          1.424548  0.542190   
3  1.526041         1.343648         1.110617         -0.096827  1.239460   
4  1.526041         0.488913         0.360069          1.048294 -0.358450   

   ...  blk_share  to_share

**Correlation Screening**

In [16]:
# Correlation heatmap of every numeric column, before any pruning
corr_full = df_clean.select_dtypes(include=[np.number]).corr()

# Hide the upper triangle (diagonal included): it mirrors the lower one
mask = np.triu(np.ones(corr_full.shape, dtype=bool))

plt.figure(figsize=(14,12))
heatmap_style = dict(cmap='vlag', center=0, annot=False, linewidths=0.3)
sns.heatmap(corr_full, mask=mask, **heatmap_style)
plt.title("Full Feature Correlation Matrix")
plt.tight_layout()


In [17]:
# Greedy Correlation Drop
# Walk the features in column order; every feature that is still kept removes
# all later features whose absolute correlation with it exceeds `thresh`.

# threshold
thresh = 0.90

# Identifier columns are numeric but are not features. Keep them out of the
# screen so an ID can neither be dropped (df_corr is later indexed by these
# columns) nor cause a genuine feature to be dropped.
screen_exclude = {'player_number_ind', 'player_id'}

# compute abs corr matrix on the scaled df_clean (features only)
numeric = df_clean.select_dtypes(include=[np.number])
numeric = numeric.drop(columns=[c for c in numeric.columns if c in screen_exclude])
corr    = numeric.corr().abs()

# ordered list of columns, plus a plain array for fast positional lookups
cols = corr.columns.tolist()
corr_values = corr.to_numpy()

# columns marked for removal
to_drop = set()

# greedy loop: for each (i,j) with j>i, if corr>thresh and j not yet marked, drop j
for i, col_i in enumerate(cols):
    if col_i in to_drop:
        continue
    for j in range(i + 1, len(cols)):
        col_j = cols[j]
        # NaN correlations (e.g. from a constant column) compare False => kept
        if col_j not in to_drop and corr_values[i, j] > thresh:
            to_drop.add(col_j)

print(f"Dropping {len(to_drop)} columns:\n", sorted(to_drop))

# now drop them; a sorted list keeps the call deterministic and avoids handing
# an unordered set to DataFrame.drop
df_corr = df_clean.drop(columns=sorted(to_drop)).copy()
print("Shape after greedy drop:", df_corr.shape)


Dropping 26 columns:
 ['APC', 'AST_per_TO', 'BoxCreation', 'Gravity', 'PEF', 'Spacing', 'TS_diff', 'TS_pct', 'ast_per_game', 'ast_share', 'blk_per100', 'blk_per_game', 'blk_share', 'min_share', 'possessions', 'pts_per_game', 'reb_per_game', 'rebounds_tot_share', 'scoring_pts_share', 'stl_per_game', 'threeA_per100', 'threeA_per40', 'three_per100', 'to_per_game', 'to_share', 'tov_rate']
Shape after greedy drop: (1406, 55)


In [18]:
# Correlation heatmap of the numeric columns that survived the greedy drop
corr_reduced = df_corr.select_dtypes(include=[np.number]).corr()

# Mask the redundant upper triangle together with the diagonal
mask = np.triu(np.ones(corr_reduced.shape, dtype=bool))

plt.figure(figsize=(14,12))
sns.heatmap(
    corr_reduced, mask=mask,
    cmap='vlag', center=0,
    annot=False, linewidths=0.3,
)
plt.title("Reduced Feature Correlation Matrix")
plt.tight_layout()


**Principal Component Analysis (PCA)**

In [19]:
# Fit a full PCA on the filtered & scaled features, inspect the variance
# explained, and decide how many components (n_pc) the later cells keep.

# identify any identifier columns to exclude from PCA
id_cols = ['player_number_ind', 'player_ind', 'player_id']

# pick numeric feature columns (exclude identifiers)
feat_cols = [
    c for c in df_corr.select_dtypes(include=[np.number]).columns
    if c not in id_cols
]

# fit PCA with as many components as the data allows
pca_full = PCA().fit(df_corr[feat_cols])

# cumulative explained variance
cumvar = np.cumsum(pca_full.explained_variance_ratio_)

# configured variance target and hard cap on the number of components
target = float(P.get("pca_variance", 0.90))
maxc   = int(P.get("pca_max_components", 25))

# Smallest number of PCs whose cumulative variance reaches the configured target
# (previously a hard-coded 0.90 ignored `target`). If the target is never
# reached, keep every component instead of letting argmax silently return 0.
reached = cumvar >= target
n_auto = int(np.argmax(reached)) + 1 if reached.any() else len(cumvar)
n_auto = max(1, min(n_auto, maxc, len(feat_cols)))

selected = load_json(PATHS["selection"], default={})

pca_param = P.get("pca_components", "auto")
use_override = isinstance(pca_param, int) and not isinstance(pca_param, bool)
if use_override:
    # Fixed override: it must also drive n_pc, otherwise selection.json and the
    # PCA actually fitted downstream disagree. Clip to the available components.
    n_pc = max(1, min(int(pca_param), len(cumvar)))
else:
    # automatic choice from the variance target
    n_pc = n_auto

selected["n_pca"] = int(n_pc)
save_json(selected, PATHS["selection"])

print("Saved n_pca to:", PATHS["selection"], "->", selected["n_pca"])

if use_override:
    print(f"Number of components (fixed pca_components override): {n_pc}")
else:
    print(f"Number of components for ≥{target * 100:g}% variance: {n_pc}")

Saved n_pca to: /Users/savvasneofytou/Desktop/basketball/artifacts/run_20250828_021400/selection.json -> 10
Number of components for ≥90% variance: 10


In [20]:
# Cumulative explained-variance curve (the "elbow" for PCA)
component_counts = np.arange(1, len(cumvar) + 1)

plt.figure(figsize=(8,5))
plt.plot(component_counts, cumvar, marker='o', linestyle='-')

# Reference line at 0.90.
# NOTE(review): hard-coded; consider tying it to the configured pca_variance.
plt.axhline(0.90, color='red', linestyle='--')

plt.xlabel("Number of Principal Components")
plt.ylabel("Cumulative Explained Variance")
plt.title("PCA Explained Variance Curve")
plt.tight_layout()
plt.show()


  plt.show()


In [21]:
# Transform the data into the top n_pc principal components

# Re-fit PCA with the chosen number of components. The configured seed is
# passed because, depending on the scikit-learn version and the data shape,
# svd_solver='auto' may select the randomized solver, which is otherwise not
# reproducible between runs. A missing seed resolves to None (library default).
pca = PCA(n_components=n_pc, random_state=P.get("random_state"))
pcs = pca.fit_transform(df_corr[feat_cols])

# Save the PCA model
joblib.dump(pca, PATHS["pca_file"])
print("Saved PCA model to:", PATHS["pca_file"])

# build a DataFrame of those PCs, sharing df_corr's index
pc_cols = [f"PC{i+1}" for i in range(n_pc)]
df_pca = pd.DataFrame(pcs, columns=pc_cols, index=df_corr.index)

# Bring along the identifier columns one by one. Assigning
# df_corr[id_cols].values would go through a single mixed-type array and turn
# every identifier column into object dtype; per-column assignment keeps each
# column's own dtype (the indexes are identical, so alignment is a no-op).
for id_col in id_cols:
    df_pca[id_col] = df_corr[id_col]

print("df_pca ready with shape:", df_pca.shape)
df_pca.head()


Saved PCA model to: /Users/savvasneofytou/Desktop/basketball/artifacts/run_20250828_021400/pca.joblib
df_pca ready with shape: (1406, 13)


Unnamed: 0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10,player_number_ind,player_ind,player_id
0,5.414892,-5.735299,-1.074703,1.883578,1.785524,-1.485886,0.764743,1.777591,0.642698,0.863351,35,Fournier Toby,0
1,2.37927,3.482239,-3.359357,1.986833,-1.066208,0.266259,-0.227233,1.973263,-0.571327,-0.069808,3,Jackson Ashlon,1
2,4.451006,0.051315,1.218953,2.355106,2.050503,1.721564,-0.179605,1.175182,0.589499,1.687186,5,Okananwa Oluchi,2
3,1.908036,2.230455,-2.139915,-1.170716,-0.551957,-0.660786,-0.255022,2.953049,-0.618413,-0.418577,24,Richardson Reigan,3
4,3.408698,-1.691492,3.494235,2.132962,-0.364642,2.141278,-2.995035,1.520047,-0.719133,-0.098029,12,Thomas Delaney,4


PCA Validation and Checks

In [22]:
# Per-component and running-total explained-variance ratios of the full PCA
evr = pca_full.explained_variance_ratio_
cumvar = evr.cumsum()

pc_labels = [f'PC{k}' for k in range(1, len(evr) + 1)]
var_df = pd.DataFrame({
    'PC': pc_labels,
    'ExplainedVar': evr,
    'CumulativeVar': cumvar,
})

print("Explained‐variance ratios:")
three_decimals = {'ExplainedVar': '{:.3f}', 'CumulativeVar': '{:.3f}'}
display(var_df.style.format(three_decimals))


Explained‐variance ratios:


Unnamed: 0,PC,ExplainedVar,CumulativeVar
0,PC1,0.273,0.273
1,PC2,0.173,0.446
2,PC3,0.126,0.573
3,PC4,0.112,0.685
4,PC5,0.062,0.747
5,PC6,0.053,0.8
6,PC7,0.037,0.837
7,PC8,0.032,0.869
8,PC9,0.021,0.89
9,PC10,0.017,0.907


In [23]:
# The retained components should be mutually uncorrelated, so their correlation
# matrix (rounded to 3 decimals) is expected to look like the identity.
pc_scores = df_pca.loc[:, pc_cols]
pc_corr = pc_scores.corr().round(3)
print("Correlation matrix of PCs (should be ≈identity):")
display(pc_corr)


Correlation matrix of PCs (should be ≈identity):


Unnamed: 0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10
PC1,1.0,-0.0,0.0,0.0,-0.0,-0.0,-0.0,0.0,0.0,-0.0
PC2,-0.0,1.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,0.0,-0.0
PC3,0.0,-0.0,1.0,-0.0,0.0,0.0,0.0,-0.0,-0.0,0.0
PC4,0.0,-0.0,-0.0,1.0,-0.0,-0.0,0.0,-0.0,-0.0,0.0
PC5,-0.0,-0.0,0.0,-0.0,1.0,-0.0,-0.0,-0.0,-0.0,0.0
PC6,-0.0,-0.0,0.0,-0.0,-0.0,1.0,0.0,-0.0,0.0,0.0
PC7,-0.0,-0.0,0.0,0.0,-0.0,0.0,1.0,-0.0,-0.0,0.0
PC8,0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,1.0,0.0,0.0
PC9,0.0,0.0,-0.0,-0.0,-0.0,0.0,-0.0,0.0,1.0,0.0
PC10,-0.0,-0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [24]:
# Loadings table: rows are input features, columns are the retained components
loadings = pd.DataFrame(pca.components_.T, index=feat_cols, columns=pc_cols)

# For the first two components, list the 8 features with the largest |loading|
for pc in pc_cols[:2]:
    print(f"Top 8 absolute loadings for {pc}:")
    strongest = loadings[pc].abs().sort_values(ascending=False).head(8)
    strongest_table = strongest.rename_axis("feature").reset_index(name="abs_loading")
    display(strongest_table)


Top 8 absolute loadings for PC1:


Unnamed: 0,feature,abs_loading
0,game_score,0.248539
1,scoring_pts_ind,0.237727
2,def_stops_per100,0.231123
3,rebounds_tot_ind,0.229132
4,minutes_tot_ind,0.215624
5,TUSG_pct,0.21443
6,game_score_per40,0.204833
7,OEFF,0.199891


Top 8 absolute loadings for PC2:


Unnamed: 0,feature,abs_loading
0,OLI,0.278762
1,AST_pct,0.273843
2,ast_ind,0.261386
3,ast_per40,0.252724
4,ast_per100,0.245345
5,reb_per40,0.217191
6,Assist_to_Usage,0.209234
7,ast_per_fgm,0.206045


In [25]:
# Sanity-check scatter of the first two principal components
plt.figure(figsize=(6,5))
scatter_style = dict(s=30, alpha=0.6)
sns.scatterplot(data=df_pca, x='PC1', y='PC2', **scatter_style)
plt.title("PC1 vs PC2 (sanity check)")
plt.xlabel("PC1")
plt.ylabel("PC2")
plt.tight_layout()
plt.show()


  plt.show()


**Export PCA Dataset to CSV**

In [26]:
# Persist the PCA frame as CSV (row index included) and echo a short preview
pca_csv_name = 'df_pca.csv'
df_pca.to_csv(pca_csv_name, index=True)
print("df_pca.csv saved, shape:", df_pca.shape)
print("df_pca head:", df_pca.head(), sep="\n")

df_pca.csv saved, shape: (1406, 13)
df_pca head:
        PC1       PC2       PC3       PC4       PC5       PC6       PC7  \
0  5.414892 -5.735299 -1.074703  1.883578  1.785524 -1.485886  0.764743   
1  2.379270  3.482239 -3.359357  1.986833 -1.066208  0.266259 -0.227233   
2  4.451006  0.051315  1.218953  2.355106  2.050503  1.721564 -0.179605   
3  1.908036  2.230455 -2.139915 -1.170716 -0.551957 -0.660786 -0.255022   
4  3.408698 -1.691492  3.494235  2.132962 -0.364642  2.141278 -2.995035   

        PC8       PC9      PC10 player_number_ind         player_ind player_id  
0  1.777591  0.642698  0.863351                35      Fournier Toby         0  
1  1.973263 -0.571327 -0.069808                 3     Jackson Ashlon         1  
2  1.175182  0.589499  1.687186                 5    Okananwa Oluchi         2  
3  2.953049 -0.618413 -0.418577                24  Richardson Reigan         3  
4  1.520047 -0.719133 -0.098029                12     Thomas Delaney         4  
