In [None]:
import pandas as pd
import os

# Location of the filtered feature table. The default is the original
# machine-specific path, so behaviour is unchanged when nothing is configured;
# set the FILTERED_FEATURES_CSV environment variable to run the notebook on
# another machine without editing this cell.
DATA_PATH = os.environ.get(
    "FILTERED_FEATURES_CSV",
    r"C:\Users\Rasulbek907\Desktop\Project_MP\Data\Feature_Selection\Filtered_Features.csv",
)

df = pd.read_csv(DATA_PATH)
# Print column names, dtypes and non-null counts as a quick sanity check.
df.info()

# OVERFITTING

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib

# === Step 1: prepare the data ===
# Name of the label column. 'target' is the placeholder used by the original
# template ("write your own") -- replace it with the real label column.
TARGET_COLUMN = 'target'
if TARGET_COLUMN not in df.columns:
    # Fail up front with an explicit message. Previously drop(errors='ignore')
    # silently accepted a missing column and the lookup on the next line
    # raised a bare KeyError.
    raise KeyError(
        f"Target column {TARGET_COLUMN!r} not found in df; "
        f"available columns: {list(df.columns)}"
    )
X = df.drop(columns=[TARGET_COLUMN])
y = df[TARGET_COLUMN]

# === Step 2: identify numeric and categorical columns ===
# Only int64/float64 columns are scaled and only object columns are encoded.
# Columns of any other dtype are not listed in either group and are therefore
# left out by the ColumnTransformer below (its default is remainder='drop').
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object']).columns

# === Step 3: preprocessing (scaling + one-hot encoding) ===
# handle_unknown='ignore' lets categories unseen during fit pass through
# transform as all-zero rows instead of raising.
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), numeric_features),
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
])

# === Step 4: model pipeline ===
# max_depth=None places no limit on tree depth; this cell sits under the
# notebook's "OVERFITTING" heading, so the unconstrained tree looks deliberate.
model = Pipeline([
    ('preprocess', preprocessor),
    ('classifier', DecisionTreeClassifier(max_depth=None, random_state=42))
])

# === Step 5: train/test split ===
# 70/30 split, stratified on y so both parts keep the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# === Step 6: train the model ===
model.fit(X_train, y_train)

# === Step 7: evaluate ===
# Predict on both splits so train and test accuracy can be compared directly.
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

print("🔥 CLASSIFICATION MODEL RESULTS")
print("-------------------------------")
print(f"✅ Train Accuracy: {accuracy_score(y_train, y_train_pred):.3f}")
print(f"✅ Test  Accuracy: {accuracy_score(y_test, y_test_pred):.3f}")
print("\n🧾 Classification Report:")
print(classification_report(y_test, y_test_pred))

print("\n📊 Confusion Matrix:")
print(confusion_matrix(y_test, y_test_pred))




In [None]:
# === Step 8: save the model and load it back ===
# Write and read through one path so the reloaded object is always the
# pipeline that was just saved. The original cell dumped to
# "decision_tree_classification.joblib" but loaded "overfitting_model.joblib",
# a file this notebook never writes; on a fresh machine that raises
# FileNotFoundError, and otherwise it silently loads a stale model.
overfit_model_path = "decision_tree_classification.joblib"
joblib.dump(model, overfit_model_path)
loaded_overfit_model = joblib.load(overfit_model_path)

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error

# --- Offline unseen player (all features including id, player_name) ---
# Single-row frame built by hand; each key must match a column name the
# loaded pipeline was fitted on.
new_player = pd.DataFrame({
    'id': [999],
    'player_name': ['Khusanov'],
    'games': [25],
    'time': [2100],
    'goals': [8],
    'assists': [5],
    'xA': [1.9],
    'shots': [45],
    'key_passes': [20],
    'yellow_cards': [2],
    'red_cards': [0],
    'position': ['FW'],
    'team_title': ['Manchester City'],
    'npg': [7],
    'npxG': [6.8],
    'xGChain': [7.5],
    'xGBuildup': [3.2]
})

# --- Predict for offline player ---
# predict() returns one value per input row; take the only one.
pred_xG = loaded_overfit_model.predict(new_player)[0]

# --- Compute RMSE from original test set manually ---
# NOTE(review): y_test and y_test_pred are not defined in this cell; they come
# from whichever training cell ran last. Confirm they belong to the same model
# as loaded_overfit_model, otherwise the error band below is not its error.
rmse_test = np.sqrt(mean_squared_error(y_test, y_test_pred))

print("\n--- Offline (Unseen) Player Test ---")
# The separator is a real plus-minus sign (the original text was garbled).
print(f"Predicted xG for {new_player['player_name'][0]}: {pred_xG:.2f} ± {rmse_test:.2f} (expected error based on test set)")


# UNDERFITTING

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score, mean_squared_error
import joblib

# np.sqrt is used below but this cell's own import list omits numpy; import it
# here so the cell does not rely on an earlier cell having been run.
import numpy as np

# --- Features and target ---
X = df.drop(columns=['xG'], errors='ignore')  # Keep id and player_name
y = df['xG']

# --- Identify numeric/categorical ---
# Only int64/float64 columns are scaled and only object columns are encoded.
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object']).columns

# --- Preprocessing ---
# handle_unknown='ignore' lets categories unseen during fit pass through
# transform as all-zero rows instead of raising.
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), numeric_features),
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
])

# --- Underfitting model ---
# max_depth=1 restricts the tree to a single split, which is what makes this
# the underfitting example. The variable is still called `overfit_model`
# because the save/reload cell refers to it by that name.
overfit_model = Pipeline([
    ('preprocess', preprocessor),
    ('regressor', DecisionTreeRegressor(max_depth=1, random_state=42))
])

# --- Split train/test ---
# 70/30 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# --- Train ---
overfit_model.fit(X_train, y_train)

# --- Evaluate ---
# Predict on both splits so train and test R² can be compared directly.
y_train_pred = overfit_model.predict(X_train)
y_test_pred = overfit_model.predict(X_test)
rmse_test = np.sqrt(mean_squared_error(y_test, y_test_pred))

print("🔥 Underfitting MODEL RESULTS")
print("----------------------------")
print(f"Train R²: {r2_score(y_train, y_train_pred):.3f}")
print(f"Test  R²: {r2_score(y_test, y_test_pred):.3f}")
print(f"Test RMSE: {rmse_test:.3f}")

In [None]:
# --- Persist the fitted pipeline, then read it back from the same file ---
# Note: this rebinds `loaded_overfit_model`, so from here on the name refers
# to the pipeline stored in `overfit_model`.
underfit_artifact = "underfitting_model.joblib"
joblib.dump(overfit_model, underfit_artifact)
loaded_overfit_model = joblib.load(underfit_artifact)

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error

# --- Offline unseen player (all features including id, player_name) ---
# Single-row frame built by hand; each key must match a column name the
# loaded pipeline was fitted on.
new_player = pd.DataFrame({
    'id': [999],
    'player_name': ['Khusanov'],
    'games': [25],
    'time': [2100],
    'goals': [8],
    'assists': [5],
    'xA': [1.9],
    'shots': [45],
    'key_passes': [20],
    'yellow_cards': [2],
    'red_cards': [0],
    'position': ['FW'],
    'team_title': ['Manchester City'],
    'npg': [7],
    'npxG': [6.8],
    'xGChain': [7.5],
    'xGBuildup': [3.2]
})

# --- Predict for offline player ---
# predict() returns one value per input row; take the only one.
pred_xG = loaded_overfit_model.predict(new_player)[0]

# --- Compute RMSE from original test set manually ---
# NOTE(review): y_test and y_test_pred are not defined in this cell; they come
# from whichever training cell ran last. Confirm they belong to the same model
# as loaded_overfit_model, otherwise the error band below is not its error.
rmse_test = np.sqrt(mean_squared_error(y_test, y_test_pred))

print("\n--- Offline (Unseen) Player Test ---")
# The separator is a real plus-minus sign (the original text was garbled).
print(f"Predicted xG for {new_player['player_name'][0]}: {pred_xG:.2f} ± {rmse_test:.2f} (expected error based on test set)")
