In [3]:
import pandas as pd
import numpy as np

# -------------------------------
# Load preprocessed data
# -------------------------------
df = pd.read_csv("preprocessed_data.csv")


def round_to_range(values, upper, lower=0):
    """Round a Series to whole numbers, then clamp it into [lower, upper]."""
    return values.round().clip(lower=lower, upper=upper)


# -------------------------------
# Goals conceded
# -------------------------------
# Any row with a positive clean-sheet value gets zero goals conceded.
# NOTE(review): this test uses clean_sheets *before* it is rounded/capped
# below, so a value in (0, 0.5) zeroes goals_conceded even though the final
# clean_sheets becomes 0 -- confirm this ordering is intended.
df.loc[df['clean_sheets'] > 0, 'goals_conceded'] = 0

# Whole numbers in [0, 6], additionally capped at 10 per unit of nb_on_pitch.
# np.minimum (not Series.clip) is kept so NaN in either operand propagates.
df['goals_conceded'] = np.minimum(
    round_to_range(df['goals_conceded'], upper=6),
    df['nb_on_pitch'] * 10,
)

# -------------------------------
# Clean sheets
# -------------------------------
# Whole numbers in [0, 2], never more than nb_on_pitch.
df['clean_sheets'] = np.minimum(
    round_to_range(df['clean_sheets'], upper=2),
    df['nb_on_pitch'],
)

# -------------------------------
# Days missed / games missed
# -------------------------------
# Both are only rounded and clamped; the upper bounds are 200 and 5.
for column, ceiling in (('days_missed', 200), ('games_missed', 5)):
    df[column] = round_to_range(df[column], upper=ceiling)

# -------------------------------
# Save formatted dataset
# -------------------------------
df.to_csv("formatted_data.csv", index=False)

In [4]:
import pandas as pd
import numpy as np

# -------------------------
# Load dataset
# -------------------------
df = pd.read_csv("formatted_data.csv")

# -------------------------
# Columns to log-transform (will overwrite original columns)
# -------------------------
log_cols = [
    'injury_reason_freq',
    'competition_name_freq',
    'team_name_freq',
    'citizenship_freq',
    'contract_remaining_days'
]

# -------------------------
# Apply log transformation directly to original columns
# -------------------------
# Columns listed above but absent from the file are skipped; their names are
# collected so the skip is reported instead of passing silently.
missing_log_cols = []
for col in log_cols:
    if col not in df.columns:
        missing_log_cols.append(col)
        continue
    # Negative values are floored at 0 first so log1p is always defined;
    # log1p(x) = log(1 + x), hence 0 maps to 0.
    df[col] = np.log1p(df[col].clip(lower=0))

if missing_log_cols:
    print(f"Warning: columns not found, left untransformed: {missing_log_cols}")

# -------------------------
# Save new dataset
# -------------------------
df.to_csv("log_transformed_data.csv", index=False)

print("Log transformation completed (original columns overwritten) and saved as 'log_transformed_data.csv'")


Log transformation completed (original columns overwritten) and saved as 'log_transformed_data.csv'


In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import joblib  # for saving scaler objects

# -------------------------
# 1. Load dataset
# -------------------------
# NOTE(review): this reads "formatted_data.csv", not the
# "log_transformed_data.csv" written by the log-transform cell -- confirm
# which file is meant to feed the scaling step.
df = pd.read_csv("formatted_data.csv")

# -------------------------
# 2. Define columns
# -------------------------
target_col = 'value'

id_cols = ['player_id']               # add more IDs if needed
name_cols = ['player_name']           # name/text columns

# Binary / one-hot columns: exactly two distinct values, all of them 0 or 1
# (missing values are ignored by the 0/1 check).
binary_ohe_cols = [
    col for col in df.columns
    if df[col].nunique() == 2 and df[col].dropna().isin([0, 1]).all()
]

# -------------------------
# 3. Separate X and y
# -------------------------
X = df.drop(columns=[target_col])
y = df[target_col]

# -------------------------
# 4. Select numeric columns to scale
# (exclude IDs, names, OHE, binary)
# -------------------------
num_cols = X.select_dtypes(include=['int64', 'float64']).columns

scale_cols = [
    col for col in num_cols
    if col not in id_cols
    and col not in name_cols
    and col not in binary_ohe_cols
]

# -------------------------
# 5. Train-test split
# -------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Take explicit copies before writing scaled columns: depending on the
# pandas / scikit-learn versions, the split frames can be flagged as slices
# of X, and assigning into them then raises SettingWithCopyWarning.
X_train = X_train.copy()
X_test = X_test.copy()

# -------------------------
# 6. Scale FEATURES
# -------------------------
# The scaler is fitted on the training rows only and reused for the test rows.
X_scaler = StandardScaler()
X_train[scale_cols] = X_scaler.fit_transform(X_train[scale_cols])
X_test[scale_cols] = X_scaler.transform(X_test[scale_cols])

# -------------------------
# 7. Scale TARGET
# -------------------------
# StandardScaler expects 2-D input, hence the reshape to a single column.
y_scaler = StandardScaler()
y_train_scaled = y_scaler.fit_transform(y_train.values.reshape(-1, 1))
y_test_scaled = y_scaler.transform(y_test.values.reshape(-1, 1))

# Convert back to Series (these get a fresh 0..n-1 index, not y_train's index)
y_train_scaled = pd.Series(y_train_scaled.flatten(), name=target_col)
y_test_scaled = pd.Series(y_test_scaled.flatten(), name=target_col)

# -------------------------
# 8. Save scaled datasets
# -------------------------
X_train.to_csv("X_train_scaled.csv", index=False)
X_test.to_csv("X_test_scaled.csv", index=False)
y_train_scaled.to_csv("y_train_scaled.csv", index=False)
y_test_scaled.to_csv("y_test_scaled.csv", index=False)

# -------------------------
# 9. Save scaler objects
# -------------------------
joblib.dump(X_scaler, "X_scaler.pkl")
joblib.dump(y_scaler, "y_scaler.pkl")

print("✅ Features and target scaled correctly (IDs, OHE, binary untouched)")
print("✅ Scaler objects saved as 'X_scaler.pkl' and 'y_scaler.pkl'")


✅ Features and target scaled correctly (IDs, OHE, binary untouched)
✅ Scaler objects saved as 'X_scaler.pkl' and 'y_scaler.pkl'


In [12]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# -------------------------
# 1. Load dataset
# -------------------------
df = pd.read_csv("formatted_data.csv")

# -------------------------
# 2. Drop ID and name columns
# -------------------------
# errors="ignore" drops only the listed columns that actually exist.
cols_to_drop = ["player_id", "player_name"]
df = df.drop(columns=cols_to_drop, errors="ignore")

# -------------------------
# 3. Define target and features
# -------------------------
# Features are every remaining numeric column except the target.
target_col = "value"
y = df[target_col]
X = df.drop(columns=[target_col])
X = X.select_dtypes(include=['number'])

# -------------------------
# 4. Train-test split
# -------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# -------------------------
# 5. Scale numeric features
# -------------------------
# Fit the scaler on the training rows only, then apply it to both splits.
# The rebuilt frames keep the column names but get a fresh 0..n-1 index.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = pd.DataFrame(scaler.transform(X_train), columns=X_train.columns)
X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)

# -------------------------
# 6. Train Linear Regression
# -------------------------
# The target is left unscaled in this cell.
model = LinearRegression(n_jobs=-1).fit(X_train_scaled, y_train)

# -------------------------
# 7. Predict on test set
# -------------------------
y_pred = model.predict(X_test_scaled)

# -------------------------
# 8. Evaluate model
# -------------------------
# Because the target is unscaled, RMSE is in the units of `value`.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

for label, score in (("MSE:", mse), ("RMSE:", rmse), ("R²:", r2)):
    print(label, score)


MSE: 8.164485976856335
RMSE: 2.857356466536217
R²: 0.8318692343035163
