In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score


In [5]:
# Load the dataset
df = pd.read_csv("./normalized_data/1950-51_normalized.csv")


In [7]:
# Check for missing values and data types
print(df.info())

# Check for correlations between variables
print(df.corr())

# Define the features and target variable
X = df[['played', 'won', 'draw', 'lost', 'goals_for', 'goals_against', 'goal_difference', 'points']]
y = df['position']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16 entries, 0 to 15
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   position         16 non-null     int64  
 1   team             16 non-null     object 
 2   points           16 non-null     float64
 3   won              16 non-null     float64
 4   lost             16 non-null     float64
 5   draw             16 non-null     float64
 6   played           16 non-null     float64
 7   goals_for        16 non-null     float64
 8   goals_against    16 non-null     float64
 9   goal_difference  16 non-null     float64
 10  year             16 non-null     int64  
dtypes: float64(8), int64(2), object(1)
memory usage: 1.5+ KB
None
                 position    points       won      lost      draw  played  \
position         1.000000 -0.902371 -0.918637  0.873566 -0.417606     NaN   
points          -0.902371  1.000000  0.988092 -0.992251  0.616037     NaN   
w

  print(df.corr())


In [8]:
# Create an instance of the linear regression model
lr_model = LinearRegression()

# Train the model using the training data
lr_model.fit(X_train, y_train)


In [9]:
# Use the trained model to predict on the test data
y_pred = lr_model.predict(X_test)

# Calculate the mean squared error and R-squared score
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error: ", mse)
print("R-squared Score: ", r2)


Mean Squared Error:  8.336435422721701
R-squared Score:  0.7266742484353541


In [16]:
# Define the features for the future season
X_future_season = pd.DataFrame({
    'played': [0, 0, 0, 0, 0, 0, 0, 0],
    'won': [0, 0, 0, 0, 0, 0, 0, 0],
    'draw': [0, 0, 0, 0, 0, 0, 0, 0],
    'lost': [0, 0, 0, 0, 0, 0, 0, 0],
    'goals_for': [0, 0, 0, 0, 0, 0, 0, 0],
    'goals_against': [0, 0, 0, 0, 0, 0, 0, 0],
    'goal_difference': [0, 0, 0, 0, 0, 0, 0, 0],
    'points': [0, 0, 0, 0, 0, 0, 0, 0]
})

# Use the trained model to predict the points for the future season
predicted_points = lr_model.predict(X_future_season)

# Create a new DataFrame with the predicted points for each team
predicted_df = pd.DataFrame({
    'Team': ['Real Madrid', 'Barcelona', 'Atletico Madrid', 'Sevilla', 'Real Sociedad', 'Villarreal', 'Real Betis', 'Athletic Bilbao'],
    'PredictedPoints': predicted_points
})

# Sort the DataFrame by predicted points in descending order to see which team is most likely to win
predicted_df = predicted_df.sort_values(by=['PredictedPoints'], ascending=False)
print(predicted_df)


              Team  PredictedPoints
0      Real Madrid        11.020103
1        Barcelona        11.020103
2  Atletico Madrid        11.020103
3          Sevilla        11.020103
4    Real Sociedad        11.020103
5       Villarreal        11.020103
6       Real Betis        11.020103
7  Athletic Bilbao        11.020103
