# Model Training
This is the actual file that trains the model based on the augmented data from the data augmentation file.

## Importing the required libraries

In [35]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error

## Importing the dataset

In [36]:
df = pd.read_csv('player_data.csv')

## Select relevant features

In [37]:
features = ['MP', 'Starts', 'Min', '90s', 'Gls', 'Ast', 'G-PK', 'PK', 'PKatt', 
            'CrdY', 'CrdR', 'xG', 'npxG', 'xAG', 'npxG+xAG', 
            'PrgC', 'PrgP', 'PrgR']

df['Goals_per_90'] = df['Gls'] / df['90s']
df['Assists_per_90'] = df['Ast'] / df['90s']
df['xG_per_90'] = df['xG'] / df['90s']
df['xAG_per_90'] = df['xAG'] / df['90s']
df['PrgC_per_90'] = df['PrgC'] / df['90s']
df['PrgP_per_90'] = df['PrgP'] / df['90s']
df['PrgR_per_90'] = df['PrgR'] / df['90s']

## Adjust the features list to include the new features

In [38]:

features += ['Goals_per_90', 'Assists_per_90', 'xG_per_90', 'xAG_per_90', 
             'PrgC_per_90', 'PrgP_per_90', 'PrgR_per_90']

## Define weights for each feature

In [39]:
feature_weights = {
    'MP': 1,
    'Starts': 1,
    'Min': 1,
    '90s': 1,
    'Gls': 1,
    'Ast': 3,  # Increase importance for assists
    'G-PK': 1,
    'PK': 1,
    'PKatt': 1,
    'CrdY': 0.5,  # Lower importance for yellow cards
    'CrdR': 0.5,  # Lower importance for red cards
    'xG': 2,  # Assign importance based on expected goals
    'npxG': 2,
    'xAG': 3,  # Increase importance for expected assists
    'npxG+xAG': 2,
    'PrgC': 1,
    'PrgP': 1,
    'PrgR': 1,
    'Goals_per_90': 1,
    'Assists_per_90': 3,
    'xG_per_90': 2,
    'xAG_per_90': 3,
    'PrgC_per_90': 1,
    'PrgP_per_90': 1,
    'PrgR_per_90': 1
}

## Apply weights to features

In [40]:
weighted_df = pd.DataFrame()
for feature in features:
    weighted_df[feature] = df[feature] * feature_weights[feature]

## Training a simple linear regression model

In [41]:
# Prepare input features and target variables
X = weighted_df
y = df[['Gls', 'Ast']]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the features
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Create and train the Random Forest model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train_scaled, y_train);

## Running the model

In [42]:
# Hypothetical scenario: Predict after 50 matches
hypothetical_matches = 50

# Create an empty DataFrame to store predictions for each player
predictions_df = pd.DataFrame(columns=['Player', 'Predicted_Gls', 'Predicted_Ast'])

# Iterate through each player in the dataset
for index, player_data in df.iterrows():
    # Adjust MP for the hypothetical scenario, considering a limit of 50 matches
    hypothetical_data = pd.DataFrame({
        'MP': [min(hypothetical_matches, player_data['MP'])],
        'Starts': [player_data['Starts']],
        'Min': [player_data['Min']],
        '90s': [player_data['90s']],
        'Gls': [player_data['Gls']],
        'Ast': [player_data['Ast']],
        'G-PK': [player_data['G-PK']],
        'PK': [player_data['PK']],
        'PKatt': [player_data['PKatt']],
        'CrdY': [player_data['CrdY']],
        'CrdR': [player_data['CrdR']],
        'xG': [player_data['xG']],
        'npxG': [player_data['npxG']],
        'xAG': [player_data['xAG']],
        'npxG+xAG': [player_data['npxG+xAG']],
        'PrgC': [player_data['PrgC']],
        'PrgP': [player_data['PrgP']],
        'PrgR': [player_data['PrgR']],
        'Goals_per_90': [player_data['Goals_per_90']],
        'Assists_per_90': [player_data['Assists_per_90']],
        'xG_per_90': [player_data['xG_per_90']],
        'xAG_per_90': [player_data['xAG_per_90']],
        'PrgC_per_90': [player_data['PrgC_per_90']],
        'PrgP_per_90': [player_data['PrgP_per_90']],
        'PrgR_per_90': [player_data['PrgR_per_90']]
    })

    # Apply weights to the hypothetical data
    hypothetical_data_weighted = hypothetical_data * [feature_weights[feature] for feature in features]
    hypothetical_data_scaled = scaler.transform(hypothetical_data_weighted)

    predictions_hypothetical = model.predict(hypothetical_data_scaled)
    predictions_hypothetical = predictions_hypothetical.clip(min=0)
    predictions_hypothetical = predictions_hypothetical.round(2)
    predictions_hypothetical = predictions_hypothetical / player_data['MP'] * hypothetical_matches

    predictions_df = pd.concat([predictions_df, pd.DataFrame({
        'Player': [player_data['Player']],
        'Predicted_Gls': [predictions_hypothetical[0, 0]],
        'Predicted_Ast': [predictions_hypothetical[0, 1]]
    })], ignore_index=True)

## Arranging the players

In [43]:
# Rounding off the predictions to the ceiling integer
predictions_df['Predicted_Gls'] = predictions_df['Predicted_Gls'].apply(lambda x: int(x + 1))
predictions_df['Predicted_Ast'] = predictions_df['Predicted_Ast'].apply(lambda x: int(x + 1))

# Now arranging the players in descending order of predicted goals and assists
predictions_df['Total_Predicted'] = predictions_df['Predicted_Gls'] + predictions_df['Predicted_Ast']
predictions_df = predictions_df.sort_values(by='Total_Predicted', ascending=False)

# Display the predictions for each player
print(predictions_df)

                 Player  Predicted_Gls  Predicted_Ast  Total_Predicted
2       Jude Bellingham             30             10               40
9       Vinicius Júnior             21             15               36
11               Joselu             18              9               27
12           Toni Kroos              4             19               23
13          Luka Modrić              2             19               21
1     Federico Valverde              8              9               17
4         Dani Carvajal              5             11               16
10          Fran Garcia              2             12               14
7               Rodrygo              8              5               13
0       Antonio Rüdiger              7              4               11
5           David Alaba              2              5                7
6   Aurélien Tchouaméni              4              2                6
3     Kepa Arrizabalaga              2              2                4
8     

## Saving the predictions

In [44]:
# Save the predictions to a CSV file
predictions_df.to_csv('predictions.csv', index=False)

In [45]:
# Make predictions on the testing set
predictions = model.predict(X_test)

# To get rid of the warning, UserWarning: X has feature names, but RandomForestRegressor was fitted without feature names
# warnings.warn("X has feature names, but RandomForestRegressor was fitted without feature names"), we need to use the following code
model.feature_names_in_ = features


# Calculate the mean squared error of the model
mse = mean_squared_error(y_test, predictions)
print(f'Mean Squared Error: {mse}')

Mean Squared Error: 2.217133333333333


