In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler

# Example DataFrame with expected data
data = {
    'minutes': [90, 90, 90, 90, 90, 90, 90, 90, 90, 90],
    'xG': [0.6, 0.1, 0.0, 1.1, 0.1, 0.5, 0.2, 1.0, 0.9, 0.1],
    'xA': [0.6, 0.1, 0.0, 0.3, 0.1, 0.5, 0.2, 0.2, 0.1, 0.1],
    'clean_sheets': [0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
    'opponent_strength': [1, 3, 5, 2, 2, 4, 5, 3, 1, 1],
    'home_away': [0, 1, 0, 1, 0, 0, 1, 0, 1, 1],
    'fpl_points': [2, 4, 5, 11, 2, 8, 9, 15, 18, 4]
}

df = pd.DataFrame(data)

# Define features and target
X = df[['minutes', 'xG', 'xA', 'clean_sheets', 'opponent_strength', 'home_away']]
y = df['fpl_points']

# Normalize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Initialize the Random Forest Regressor
model = RandomForestRegressor(n_estimators=100, random_state=42)

# Train the model
model.fit(X_train, y_train)

# Predict using the test set
predictions = model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, predictions)
print(f'Mean Squared Error: {mse}')
print(f'Predictions: {predictions}')
print(f'Actual Points: {y_test.values}')


Mean Squared Error: 39.13325
Predictions: [9.32 5.71]
Actual Points: [18  4]


In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler

# Example DataFrame with expected data
data = {
    'minutes': [90, 90, 90, 90, 90, 90, 90, 90, 90, 90],
    'xG': [0.6, 0.1, 0.0, 1.1, 0.1, 0.5, 0.2, 1.0, 0.9, 0.1],
    'xA': [0.6, 0.1, 0.0, 0.3, 0.1, 0.5, 0.2, 0.2, 0.1, 0.1],
    'clean_sheets': [0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
    'opponent_strength': [1, 3, 5, 2, 2, 4, 5, 3, 1, 1],
    'home_away': [0, 1, 0, 1, 0, 0, 1, 0, 1, 1],
    'fpl_points': [2, 4, 5, 11, 2, 8, 9, 15, 18, 4]
}

df = pd.DataFrame(data)

# Define features and target
X = df[['minutes', 'xG', 'xA', 'clean_sheets', 'opponent_strength', 'home_away']]
y = df['fpl_points']

# Normalize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Initialize the Random Forest Regressor
model = RandomForestRegressor(n_estimators=100, random_state=42)

# Train the model
model.fit(X_train, y_train)

# Predict using the entire scaled dataset
all_predictions = model.predict(X_scaled)

# Evaluate the model
mse = mean_squared_error(y, all_predictions)
print(f'Total Mean Squared Error: {mse}')

# Display predictions and actual points for all data
results_df = pd.DataFrame({'Predicted Points': all_predictions, 'Actual Points': y})
print(results_df)


Total Mean Squared Error: 9.940159999999999
   Predicted Points  Actual Points
0              4.34              2
1              5.71              4
2              5.65              5
3             10.64             11
4              3.56              2
5              7.69              8
6              8.04              9
7             11.60             15
8              9.32             18
9              4.31              4


In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor

# Example DataFrame with expected data
data = {
    'minutes': [90, 90, 90, 90, 90, 90, 90, 90, 90, 90],
    'xG': [0.6, 0.1, 0.0, 1.1, 0.1, 0.5, 0.2, 1.0, 0.9, 0.1],
    'xA': [0.6, 0.1, 0.0, 0.3, 0.1, 0.5, 0.2, 0.2, 0.1, 0.1],
    'clean_sheets': [0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
    'opponent_strength': [1, 3, 5, 2, 2, 4, 5, 3, 1, 1],
    'home_away': [0, 1, 0, 1, 0, 0, 1, 0, 1, 1],
    'fpl_points': [2, 4, 5, 11, 2, 8, 9, 15, 18, 4]
}

df = pd.DataFrame(data)

# Feature engineering
df['xG_per_minute'] = df['xG'] * df['minutes'] / 90  # Normalized by minutes in a full match
df['interaction'] = df['xG'] * df['opponent_strength']

# Define features and target
X = df[['minutes', 'xG', 'xA', 'clean_sheets', 'opponent_strength', 'home_away', 'xG_per_minute', 'interaction']]
y = df['fpl_points']

# Normalize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Model setup with XGBoost
model = XGBRegressor(objective ='reg:squarederror', n_estimators=100, learning_rate=0.05, max_depth=5)

# Grid Search for Hyperparameter Tuning
param_grid = {
    'n_estimators': [50, 100, 150],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 5, 7]
}
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)

# Best model
print("Best parameters:", grid_search.best_params_)
best_model = grid_search.best_estimator_

# Predictions and Evaluation
predictions = best_model.predict(X_test)
mse = mean_squared_error(y_test, predictions)
print(f'Optimized MSE: {mse}')
print(f'Predictions: {predictions}')
print(f'Actual Points: {y_test.values}')

# Cross-Validation
cv_scores = cross_val_score(best_model, X_scaled, y, cv=5, scoring='neg_mean_squared_error')
print(f'CV Average MSE: {-np.mean(cv_scores)}')


Best parameters: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 100}
Optimized MSE: 89.22657702452216
Predictions: [4.6575575 4.6575575]
Actual Points: [18  4]
CV Average MSE: 27.071315726859986


In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler

# Example DataFrame with expected data
data = {
    'minutes': [90, 90, 90, 90, 90, 90, 90, 90, 90, 90],
    'xG': [0.6, 0.1, 0.0, 1.1, 0.1, 0.5, 0.2, 1.0, 0.9, 0.1],
    'xA': [0.6, 0.1, 0.0, 0.3, 0.1, 0.5, 0.2, 0.2, 0.1, 0.1],
    'clean_sheets': [0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
    'opponent_strength': [1, 3, 5, 2, 2, 4, 5, 3, 1, 1],
    'home_away': [0, 1, 0, 1, 0, 0, 1, 0, 1, 1],
    'fpl_points': [2, 4, 5, 11, 2, 8, 9, 15, 18, 4]
}

df = pd.DataFrame(data)

# Feature engineering
df['xG_per_minute'] = df['xG'] * df['minutes'] / 90  # Normalized by minutes in a full match
df['interaction'] = df['xG'] * df['opponent_strength']

# Define features and target
X = df[['minutes', 'xG', 'xA', 'clean_sheets', 'opponent_strength', 'home_away', 'xG_per_minute', 'interaction']]
y = df['fpl_points']

# Normalize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Initialize the Random Forest Regressor
model = RandomForestRegressor(n_estimators=100, random_state=42)

# Train the model on the entire dataset
model.fit(X_scaled, y)

# Predict using the entire scaled dataset
all_predictions = model.predict(X_scaled)

# Evaluate the model
mse = mean_squared_error(y, all_predictions)
print(f'Total Mean Squared Error: {mse}')

# Display predictions and actual points for all data
results_df = pd.DataFrame({
    'Predicted Points': all_predictions,
    'Actual Points': y
})
print(results_df)


Total Mean Squared Error: 2.5949899999999997
   Predicted Points  Actual Points
0              5.79              2
1              3.44              4
2              4.78              5
3             12.47             11
4              2.69              2
5              8.79              8
6              7.79              9
7             14.33             15
8             15.56             18
9              3.69              4


In [7]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler

# Historical data example (simplified for Bruno Fernandes)
data = {
    'minutes': [90] * 10,
    'xG': [0.2, 0.1, 0.4, 0.3, 0.2, 0.15, 0.25, 0.05, 0.3, 0.2],
    'xA': [0.1, 0.2, 0.05, 0.15, 0.1, 0.2, 0.1, 0.05, 0.15, 0.1],
    'clean_sheets': [1, 0, 0, 1, 1, 0, 0, 1, 1, 0],
    'opponent_strength': [1, 3, 2, 4, 2, 3, 2, 3, 4, 3],  # 1 is strongest, 4 is weakest
    'home_away': [1, 0, 1, 0, 1, 1, 0, 1, 0, 1],
    'fpl_points': [6, 7, 11, 10, 7, 8, 5, 12, 9, 6]
}

df = pd.DataFrame(data)

# Define features and target
X = df[['minutes', 'xG', 'xA', 'clean_sheets', 'opponent_strength', 'home_away']]
y = df['fpl_points']

# Normalize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Initialize and train the RandomForestRegressor
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_scaled, y)

# Upcoming game data including contextual predictions
upcoming_game_data = {
    'minutes': [90],  # Expected to play full time
    'xG': [2.2 * (0.3 / 3)],  # Bruno's average xG contribution considering team's predicted goals
    'xA': [2.2 * (0.2 / 3)],  # Bruno's average xA contribution
    'clean_sheets': [0.4],  # Team's predicted clean sheet probability
    'opponent_strength': [19],  # 2nd worst in the league, scaled similarly
    'home_away': [0]  # Away game
}

upcoming_game_df = pd.DataFrame(upcoming_game_data)
upcoming_game_scaled = scaler.transform(upcoming_game_df)

# Predict using the model
predicted_points = model.predict(upcoming_game_scaled)
print(f'Predicted FPL Points for Bruno Fernandes in the Next Game: {predicted_points[0]}')


Predicted FPL Points for Bruno Fernandes in the Next Game: 7.468000000000001


In [9]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor

# Example DataFrame with simplified historical data
data = {
    'xG': [0.2, 0.1, 0.4, 0.3, 0.2, 0.15, 0.25, 0.05, 0.3, 0.2],
    'xA': [0.1, 0.2, 0.05, 0.15, 0.1, 0.2, 0.1, 0.05, 0.15, 0.1],
    'clean_sheets': [1, 0, 0, 1, 1, 0, 0, 1, 1, 0],  # Assume this is for the whole team, applicable if midfielder
    'fpl_points': [6, 7, 11, 10, 7, 8, 5, 12, 9, 6]
}

df = pd.DataFrame(data)

# Define features and target
X = df[['xG', 'xA', 'clean_sheets']]

# Custom scoring function based on FPL rules
def calculate_fpl_points(row):
    points = 0
    points += row['xG'] * 5  # 5 points per goal for midfielders
    points += row['xA'] * 3  # 3 points per assist
    points += row['clean_sheets'] * 1  # 1 point for a clean sheet for midfielders
    points += 2  # Assuming Bruno plays more than 60 minutes
    return points

# Apply scoring function to generate expected points from historical data
df['calculated_points'] = df.apply(calculate_fpl_points, axis=1)

# Normalize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Train the RandomForestRegressor model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_scaled, df['calculated_points'])

# Hypothetical upcoming game data with predicted stats
upcoming_game_data = {
    'xG': [0.25],  # Predicted xG for the game
    'xA': [0.15],  # Predicted xA for the game
    'clean_sheets': [0.4]  # Clean sheet probability converted to expected value (assuming binary input needed)
}

# Prepare data frame for prediction
upcoming_game_df = pd.DataFrame(upcoming_game_data)
upcoming_game_scaled = scaler.transform(upcoming_game_df)

# Predict using the trained model
predicted_points = model.predict(upcoming_game_scaled)
print(f'Predicted FPL Points for Bruno Fernandes in the Next Game: {predicted_points[0]}')


Predicted FPL Points for Bruno Fernandes in the Next Game: 3.914000000000003


In [11]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor

# Historical data example
data = {
    'xG': [0.2, 0.1, 0.4, 0.3, 0.2, 0.15, 0.25, 0.05, 0.3, 0.2],
    'xA': [0.1, 0.2, 0.05, 0.15, 0.1, 0.2, 0.1, 0.05, 0.15, 0.1],
    'clean_sheets': [1, 0, 0, 1, 1, 0, 0, 1, 1, 0],
    'bonus': [3, 1, 2, 3, 1, 0, 2, 1, 3, 2],  # Simulated bonus points
    'fpl_points': [6, 7, 11, 10, 7, 8, 5, 12, 9, 6]
}

df = pd.DataFrame(data)

# Features and target for prediction
X = df[['xG', 'xA', 'clean_sheets']]
y = df['fpl_points'] + df['bonus']  # Including bonus points in the target

# Scaler and RandomForestRegressor setup
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_scaled, y)

# Upcoming game prediction setup
upcoming_game_data = {
    'xG': [0.25],  # Predicted xG for the game
    'xA': [0.15],  # Predicted xA for the game
    'clean_sheets': [0.4]  # Converted probability
}

upcoming_game_df = pd.DataFrame(upcoming_game_data)
upcoming_game_scaled = scaler.transform(upcoming_game_df)

# Prediction
predicted_points = model.predict(upcoming_game_scaled)
print(f'Predicted FPL Points for Bruno Fernandes in the Next Game: {predicted_points[0]}')


Predicted FPL Points for Bruno Fernandes in the Next Game: 7.625


In [18]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor

# Example DataFrame with simplified historical data
data = {
    # 'xG': [0.2, 0.1, 0.4, 0.3, 0.2, 0.15, 0.25, 0.05, 0.3, 0.2],
    # 'xA': [0.1, 0.2, 0.05, 0.15, 0.1, 0.2, 0.1, 0.05, 0.15, 0.1],
    # 'clean_sheets': [1, 0, 0, 1, 1, 0, 0, 1, 1, 0],  # Assume Bruno is a midfielder
    'xG': [0.6, 0.1, 0.0, 1.1, 0.1, 0.5, 0.2, 1.0, 0.9, 0.1],
    'xA': [0.6, 0.1, 0.0, 0.3, 0.1, 0.5, 0.2, 0.2, 0.1, 0.1],
    'clean_sheets': [0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
    'bonus': [0, 0, 0, 3, 0, 1, 2, 3, 3, 2]  # Simulated bonus points
}

df = pd.DataFrame(data)

# Modify scoring function to reflect FPL rules more realistically
def calculate_fpl_points(row):
    points = 2  # Assume at least 60 minutes of playtime
    points += row['xG'] * 5  # Goals by midfielders score 5 points
    points += row['xA'] * 3  # Each assist scores 3 points
    points += row['clean_sheets'] * 1  # 1 point for a clean sheet for midfielders
    return points

# Applying scoring function to generate expected points from historical data
df['calculated_points'] = df.apply(calculate_fpl_points, axis=1)

# Features and target
X = df[['xG', 'xA', 'clean_sheets']]
y = df['calculated_points']  # Target now includes calculated points instead of raw fpl_points

# Feature scaling
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Train the RandomForestRegressor model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_scaled, y)

# Upcoming game prediction
upcoming_game_data = {
    'xG': [0.95],  # Reduced expected xG
    'xA': [0.10],  # Reduced expected xA
    'clean_sheets': [0.4]  # Reduced probability of clean sheets
}

def estimate_bps(row):
    bps_score = 0
    bps_score += row['xG'] * 25  # Assuming higher weight for goals
    bps_score += row['xA'] * 15  # Assuming assists also carry significant weight
    bps_score += row['clean_sheets'] * 5  # Add for clean sheets if applicable
    return bps_score

def assign_bonus_points(bps_scores, n_players=11):
    sorted_scores = sorted(bps_scores, reverse=True)
    bonus_points = [0] * len(bps_scores)
    if len(sorted_scores) > 0:
        max_score = sorted_scores[0]
        bonus_points[bps_scores.index(max_score)] = 3  # 3 points to highest BPS
    if len(sorted_scores) > 1:
        second_score = sorted_scores[1]
        bonus_points[bps_scores.index(second_score)] = 2  # 2 points to second highest BPS
    if len(sorted_scores) > 2:
        third_score = sorted_scores[2]
        bonus_points[bps_scores.index(third_score)] = 1  # 1 point to third highest BPS
    return bonus_points

# Simulating within team or match context - simplification
upcoming_game_df['bps_score'] = upcoming_game_df.apply(estimate_bps, axis=1)
bps_scores = list(upcoming_game_df['bps_score'])  # Assuming this is just for one team or key players
bonus_points = assign_bonus_points(bps_scores)

# Add bonus points to the predicted points
predicted_points_with_bonus = predicted_points[0] + bonus_points[0]
print(f'Predicted FPL Points with Bonus for Bruno Fernandes in the Next Game: {predicted_points_with_bonus}')



Predicted FPL Points with Bonus for Bruno Fernandes in the Next Game: 6.390500000000005


In [19]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor

# Data setup
data = {
    'xG': [0.6, 0.1, 0.0, 1.1, 0.1, 0.5, 0.2, 1.0, 0.9, 0.1],
    'xA': [0.6, 0.1, 0.0, 0.3, 0.1, 0.5, 0.2, 0.2, 0.1, 0.1],
    'clean_sheets': [0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
    'bonus': [0, 0, 0, 3, 0, 1, 2, 3, 3, 2]
}
df = pd.DataFrame(data)

# Scoring function
def calculate_fpl_points(row):
    points = 2  # Base points for playing over 60 minutes
    points += row['xG'] * 5  # Goals by midfielders
    points += row['xA'] * 3  # Assists
    points += row['clean_sheets']  # Clean sheets for midfielders
    return points

df['calculated_points'] = df.apply(calculate_fpl_points, axis=1)

# Features and target
X = df[['xG', 'xA', 'clean_sheets']]
y = df['calculated_points']

# Scaling and model training
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_scaled, y)

# Upcoming game prediction setup
upcoming_game_data = {
    'xG': [0.25],  # Adjusted expected xG
    'xA': [0.07],  # Adjusted expected xA
    'clean_sheets': [0.4]  # Adjusted probability of clean sheets
}
upcoming_game_df = pd.DataFrame(upcoming_game_data)
upcoming_game_scaled = scaler.transform(upcoming_game_df)

# Predict using the trained model
predicted_points = model.predict(upcoming_game_scaled)
print(f'Predicted FPL Points for Bruno Fernandes in the Next Game: {predicted_points[0]}')

# Bonus point adjustment
upcoming_game_df['bps_score'] = upcoming_game_df.apply(estimate_bps, axis=1)
bps_scores = list(upcoming_game_df['bps_score'])  # Assuming this is just for one team or key players
bonus_points = assign_bonus_points(bps_scores)

# Combine predicted points with bonus
predicted_points_with_bonus = predicted_points[0] + bonus_points[0]
print(f'Predicted FPL Points with Bonus for Bruno Fernandes in the Next Game: {predicted_points_with_bonus}')


Predicted FPL Points for Bruno Fernandes in the Next Game: 3.064000000000003
Predicted FPL Points with Bonus for Bruno Fernandes in the Next Game: 6.064000000000004


In [20]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error
from lightgbm import LGBMRegressor

# Sample DataFrame for Bruno Fernandes
data = {
    'xG': [0.6, 0.1, 0.0, 1.1, 0.1, 0.5, 0.2, 1.0, 0.9, 0.1],
    'xA': [0.6, 0.1, 0.0, 0.3, 0.1, 0.5, 0.2, 0.2, 0.1, 0.1],
    'clean_sheets': [0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
    'opponent_strength': [2, 8, 6, 4, 5, 3, 7, 1, 9, 10],  # 10 being the strongest
    'bonus': [0, 0, 0, 3, 0, 1, 2, 3, 3, 2],  # Actual bonus points collected
    'fpl_points': [6, 7, 11, 10, 7, 8, 5, 12, 9, 6]
}

df = pd.DataFrame(data)

# Prepare features and target
X = df[['xG', 'xA', 'clean_sheets', 'opponent_strength']]
y = df['fpl_points'] + df['bonus']  # Combine points and bonus

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the LGBMRegressor
model = LGBMRegressor(n_estimators=100, learning_rate=0.1)
model.fit(X_train, y_train)

# Predict and evaluate the model
predictions = model.predict(X_test)
mse = mean_squared_error(y_test, predictions)
print(f'Mean Squared Error: {mse}')
print(f'Predictions: {predictions}')

# Bonus point prediction setup could be refined similarly with a custom approach


ModuleNotFoundError: No module named 'lightgbm'

In [21]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Sample data including expected metrics
data = {
    'xG': [0.6, 0.1, 0.0, 1.1, 0.1, 0.5, 0.2, 1.0, 0.9, 0.1],
    'xA': [0.6, 0.1, 0.0, 0.3, 0.1, 0.5, 0.2, 0.2, 0.1, 0.1],
    'xCS': [0.2, 0.1, 0.0, 0.4, 0.1, 0.3, 0.2, 0.0, 0.3, 0.2],
    'fpl_points': [6, 7, 11, 10, 7, 8, 5, 12, 9, 6]
}

df = pd.DataFrame(data)

# Expected Points Calculation
def calculate_expected_points(xG, xA, xCS):
    # Points from goals, assists and clean sheets
    points_from_goals = xG * 5  # 5 points per goal for midfielders
    points_from_assists = xA * 3  # 3 points per assist
    points_from_cleansheets = xCS * 1  # 1 point for a clean sheet (if applicable)
    return points_from_goals + points_from_assists + points_from_cleansheets

# Add expected points to DataFrame
df['expected_points'] = df.apply(lambda row: calculate_expected_points(row['xG'], row['xA'], row['xCS']), axis=1)

# Features and target
X = df[['xG', 'xA', 'xCS']]
y = df['expected_points']

# Model Training
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Predicting and Evaluating the model
predicted_points = model.predict(X_test)


In [28]:
# Define the expected performance metrics against Burnley
bruno_vs_burnley = np.array([[0.9, 0.9, 0.4]])  # [xG, xA, xCS]

# Scale the features using the same scaler used during training
bruno_vs_burnley_scaled = scaler.transform(bruno_vs_burnley)

# Predict FPL points using the RandomForest model
predicted_points_burnley = model.predict(bruno_vs_burnley_scaled)

print(f"Predicted FPL Points for Bruno Fernandes against Burnley: {predicted_points_burnley[0]:.2f}")


Predicted FPL Points for Bruno Fernandes against Burnley: 5.34




In [None]:
# Get the current working directory
current_dir = os.getcwd()
# base_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(current_dir))))
new_directory = os.path.join(current_dir, "data", "databases")

# Create a new SQLite database (or connect to an existing one)
db_name = os.path.join(new_directory, 'fbref_data_players_latest.db')
conn = sqlite3.connect(db_name)
test_df_new = pd.read_sql_query('SELECT * FROM general', conn)
conn.close()