In [31]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and echo the full path of every file found,
# so the exact dataset paths are visible in the cell output.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(folder, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/mydata/merged_f.xlsx


In [3]:
import numpy as np
import pandas as pd
import lightgbm as lgb
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error, r2_score

# ==========================================
# 1. LOAD DATA
# ==========================================
DATA_PATH = '/kaggle/input/mydata/merged_f.xlsx'
TARGET = 'latest_market_value'

df = pd.read_excel(DATA_PATH)

# ==========================================
# 2. FEATURE ENGINEERING (Step 1)
# ==========================================
# Custom Metrics
# Goals + assists, shared by the two output-based ratios below.
career_output = df['total_career_goals'] + df['total_career_assists']

# Output per year of age above 15; the clip keeps the denominator >= 1.
df['performance_age_ratio'] = career_output / (df['age'] - 15).clip(lower=1)
# Days at the current club per transfer; +1 keeps the denominator non-zero.
df['loyalty_index'] = df['days_since_joined'] / (df['total_transfers'] + 1)
# Sum of the two polarity columns, scaled by the number of distinct teammates.
df['market_visibility'] = (df['vader_polarity'] + df['tb_polarity']) * df['num_unique_teammates']
# Latest transfer fee per year of age above 17 (denominator clipped to >= 1).
df['career_momentum'] = df['most_recent_transfer_fee'] / (df['age'] - 17).clip(lower=1)
# Measures efficiency rather than just total volume
df['efficiency_index'] = career_output / (df['total_career_minutes_played'] + 1)

# Target Encoding for Club
# NOTE(review): this averages the label over ALL rows, so each row's feature
# contains its own target (and, after a split, targets of held-out rows).
# It is only leak-free if it is re-fit on the training rows after splitting.
club_prestige = df.groupby('current_club_name')[TARGET].mean()
df['club_prestige'] = df['current_club_name'].map(club_prestige)

# cleaning
ids_to_drop = ['player_id', 'player_name', 'player_slug', 'current_club_id', 'current_club_name']

# Rows without a label cannot be used for supervised training; drop them
# instead of letting the median imputation below invent a target for them.
n_unlabelled = int(df[TARGET].isna().sum())
if n_unlabelled:
    print(f"Dropping {n_unlabelled} rows with missing {TARGET}")

df_clean = (
    df
    .dropna(subset=[TARGET])
    .drop(columns=ids_to_drop, errors='ignore')  # errors='ignore' already skips absent columns
)
# Impute remaining gaps in numeric columns with the column median.
df_clean = df_clean.fillna(df_clean.median(numeric_only=True))

# Target Transformation
# log1p compresses the long right tail; predictions are mapped back with expm1.
y = np.log1p(df_clean[TARGET])
X = df_clean.drop(columns=[TARGET])

# ==========================================
# 3. FEATURE SELECTION (Matching DT & RF)
# ==========================================
# NOTE: both importance models are fit on every row of X (no hold-out), so the
# rankings are exploratory diagnostics, not an unbiased selection procedure.

# A. Decision Tree Importance
dt = DecisionTreeRegressor(max_depth=10, random_state=42).fit(X, y)
dt_importances = pd.Series(dt.feature_importances_, index=X.columns).sort_values(ascending=False)

# B. Random Forest Importance
rf = RandomForestRegressor(n_estimators=100, max_depth=12, random_state=42, n_jobs=-1)
rf.fit(X, y)
rf_importances = pd.Series(rf.feature_importances_, index=X.columns).sort_values(ascending=False)

# C. Select the top 20 by random-forest importance, then make sure the
#    engineered features we always want are present. A must-keep name that is
#    not a column of X is skipped rather than raising.
TOP_N_FEATURES = 20
must_keep_features = ['experience_score', 'career_momentum']

selected_features = rf_importances.head(TOP_N_FEATURES).index.tolist()
selected_features += [
    name for name in must_keep_features
    if name in X.columns and name not in selected_features
]
print("\nTop Selected Features:", selected_features)

# ==========================================
# 4. FINAL LIGHTGBM TRAINING
# ==========================================
# 1. Define the High-Signal "Gold" Features
gold_features = [
    'club_prestige', 'total_value_at_transfer', 'most_recent_transfer_fee',
    'career_momentum', 'remaining_contract_duration', 'days_since_last_transfer',
    'total_career_matches', 'performance_age_ratio', 'total_career_minutes_played',
    'total_transfer_fees',
    'market_visibility', 'loyalty_index',
    'citizenship_freq_encoded', 'efficiency_index',
]

# Fail with a readable message (same exception type as X[...] would raise).
missing_gold = [name for name in gold_features if name not in X.columns]
if missing_gold:
    raise KeyError(f"gold_features not found in X: {missing_gold}")

# 2. Filter data
X_final = X[gold_features]

# 3. 60/20/20 Split
#    20% is held out as test; 25% of the remaining 80% (= 20% overall) is validation.
X_temp, X_test, y_temp, y_test = train_test_split(X_final, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.25, random_state=42)

# 4. Re-fit the club target encoding on TRAINING rows only.
#    The 'club_prestige' column arriving in X was averaged from the label over
#    every row, so validation/test rows carried their own target into the
#    features and the reported score was optimistic. Here the per-club mean is
#    recomputed from training labels and applied to all splits; clubs with no
#    usable training label fall back to the overall training mean.
#    Training rows still see their own label inside their club mean; switch to
#    out-of-fold encoding if the model leans too hard on this feature.
if 'club_prestige' in gold_features:
    if not X_final.index.is_unique:
        raise ValueError("row index must be unique to align club names with the split frames")

    club_of_row = df['current_club_name']
    train_value = df.loc[X_train.index, 'latest_market_value']
    prestige_from_train = train_value.groupby(club_of_row.loc[X_train.index]).mean()
    prestige_fallback = train_value.mean()

    def _with_train_prestige(frame):
        """Return a copy of `frame` whose club_prestige comes from training labels only."""
        encoded = club_of_row.loc[frame.index].map(prestige_from_train).fillna(prestige_fallback)
        return frame.assign(club_prestige=encoded)

    X_final, X_temp, X_train, X_val, X_test = (
        _with_train_prestige(part) for part in (X_final, X_temp, X_train, X_val, X_test)
    )

# 4. LightGBM with higher "Learning Depth"
#    Many small steps (low learning rate, large round budget); the validation
#    set decides when to stop.
EARLY_STOPPING_ROUNDS = 200

final_model = lgb.LGBMRegressor(
    n_estimators=5000,
    learning_rate=0.005,
    num_leaves=50,
    max_depth=10,
    reg_alpha=0.3,   # L1 penalty on leaf weights
    reg_lambda=0.3,  # L2 penalty on leaf weights
    random_state=42
)

final_model.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    callbacks=[lgb.early_stopping(EARLY_STOPPING_ROUNDS), lgb.log_evaluation(period=0)]
)

# If the best round sits within the patience window of the round budget, early
# stopping never triggered and the model may still have been improving.
best_round = getattr(final_model, 'best_iteration_', None)
if best_round and final_model.n_estimators - best_round < EARLY_STOPPING_ROUNDS:
    print(
        f"Note: early stopping did not trigger (best iteration {best_round} of "
        f"{final_model.n_estimators}); consider more estimators or a higher learning rate."
    )

# 5. Output
#    The model is trained on log1p(target); map back with expm1 before scoring
#    in the target's original units.
y_test_raw = np.expm1(y_test)
y_pred = np.expm1(final_model.predict(X_test))
print(f"Final Optimized R2: {r2_score(y_test_raw, y_pred):.4f}")
# Raw-scale R2 is dominated by the largest values, so also report an absolute
# error and the fit on the scale the model was actually optimised on.
print(f"Test MAE (original units): {mean_absolute_error(y_test_raw, y_pred):,.0f}")
print(f"Test R2 (log scale): {r2_score(y_test, np.log1p(y_pred)):.4f}")





# ==========================================



Top Selected Features: ['club_prestige', 'most_recent_transfer_fee', 'total_value_at_transfer', 'total_career_matches', 'birth_year', 'days_since_last_transfer', 'age', 'loyalty_index', 'num_unique_teammates', 'total_transfers', 'career_momentum', 'remaining_contract_duration', 'total_transfer_fees', 'avg_ppg_with_teammates', 'total_career_yellow_cards', 'total_minutes_played_with_teammates', 'performance_age_ratio', 'total_career_minutes_played', 'days_since_joined', 'citizenship_freq_encoded']
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004469 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3130
[LightGBM] [Info] Number of data points in the train set: 55602, number of used features: 14
[LightGBM] [Info] Start training from score 12.506874
Training until validation scores don't improve for 200 rounds
Did not meet early stopping. 