#LOGISTIC REGRESSION

In [None]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics import mean_squared_error

# Load the dataset
df = pd.read_csv("goalscorers.csv")

# Preprocess the data.
# Rows with missing values are dropped *before* the label and the dummies are
# built: Series.eq() yields False for NaN, so a goal with an unknown 'team'
# would otherwise be silently labelled 0, and get_dummies() turns a missing
# home/away team into an all-zero row that a later dropna() cannot detect.
# The explicit .copy() makes df an independent frame, so adding a column
# below does not trigger pandas' SettingWithCopyWarning.
columns = ['home_team', 'away_team', 'minute', 'own_goal', 'penalty', 'team']
df = df[columns].dropna().copy()

# Target: 1 when the scoring team is the home team, 0 otherwise
df['team_goal'] = df['team'].eq(df['home_team']).astype(int)
df = df.drop(columns='team')

# One-hot encode the categorical variables
df = pd.get_dummies(df, columns=['home_team', 'away_team'])

# Split the data into training and testing sets (80/20, fixed seed)
X = df.drop(columns='team_goal')
y = df['team_goal']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit a logistic regression model.
# The default iteration cap was reached before the solver converged
# ("TOTAL NO. of ITERATIONS REACHED LIMIT"), so the cap is raised.
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Make predictions on the testing set
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
confusion_mat = confusion_matrix(y_test, y_pred)
print(f"Accuracy: {accuracy}")
# On hard 0/1 labels the squared error is 1 for a miss and 0 for a hit,
# so this value is the misclassification rate (1 - accuracy).
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")
print("Confusion matrix:\n", confusion_mat)

Accuracy: 0.755
Mean Squared Error: 0.245
Confusion matrix:
 [[ 47  31]
 [ 18 104]]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In this example code, we first load the dataset using pandas and preprocess the data by selecting the relevant columns ('home_team', 'away_team', 'minute', 'own_goal', 'penalty', 'team') and creating a new column 'team_goal' to represent whether the team that scored the goal was the home team (1) or the away team (0).

We then one-hot encode the 'home_team' and 'away_team' columns using the get_dummies function from pandas. We then split the data into training and testing sets using the train_test_split() function.

Next, we fit a logistic regression model to the training set using the LogisticRegression class from scikit-learn. We then make predictions on the testing set using the model's predict() method and evaluate the performance of the model using the accuracy_score(), mean_squared_error(), and confusion_matrix() functions from scikit-learn.

#SVM

In [None]:
import pandas as pd
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics import mean_squared_error

# Load the dataset
df = pd.read_csv("goalscorers.csv")

# Preprocess the data.
# Rows with missing values are dropped *before* the label and the dummies are
# built: Series.eq() yields False for NaN, so a goal with an unknown 'team'
# would otherwise be silently labelled 0, and get_dummies() turns a missing
# home/away team into an all-zero row that a later dropna() cannot detect.
# The explicit .copy() makes df an independent frame, so adding a column
# below does not trigger pandas' SettingWithCopyWarning.
columns = ['home_team', 'away_team', 'minute', 'own_goal', 'penalty', 'team']
df = df[columns].dropna().copy()

# Target: 1 when the scoring team is the home team, 0 otherwise
df['team_goal'] = df['team'].eq(df['home_team']).astype(int)
df = df.drop(columns='team')

# One-hot encode the categorical variables
df = pd.get_dummies(df, columns=['home_team', 'away_team'])

# Split the data into training and testing sets (80/20, fixed seed)
X = df.drop(columns='team_goal')
y = df['team_goal']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit an SVM model with a linear kernel and C=1
model = SVC(kernel='linear', C=1, random_state=42)
model.fit(X_train, y_train)

# Make predictions on the testing set
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
confusion_mat = confusion_matrix(y_test, y_pred)
print(f"Accuracy: {accuracy}")
# On hard 0/1 labels the squared error is 1 for a miss and 0 for a hit,
# so this value is the misclassification rate (1 - accuracy).
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")
print("Confusion matrix:\n", confusion_mat)


Accuracy: 0.76
Mean Squared Error: 0.24
Confusion matrix:
 [[ 45  33]
 [ 15 107]]


For SVM, we import the SVC class from scikit-learn and instantiate it with a linear kernel (kernel='linear'), regularization parameter C=1, and random_state=42. We then fit the model to the training data and make predictions on the testing set. Finally, we evaluate the model's performance using the accuracy_score(), mean_squared_error(), and confusion_matrix() functions from scikit-learn.

#NAIVE BAYES

In [None]:
import pandas as pd
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics import mean_squared_error

# Load the dataset
df = pd.read_csv("goalscorers.csv")

# Preprocess the data.
# Rows with missing values are dropped *before* the label and the dummies are
# built: Series.eq() yields False for NaN, so a goal with an unknown 'team'
# would otherwise be silently labelled 0, and get_dummies() turns a missing
# home/away team into an all-zero row that a later dropna() cannot detect.
# The explicit .copy() makes df an independent frame, so adding a column
# below does not trigger pandas' SettingWithCopyWarning.
columns = ['home_team', 'away_team', 'minute', 'own_goal', 'penalty', 'team']
df = df[columns].dropna().copy()

# Target: 1 when the scoring team is the home team, 0 otherwise
df['team_goal'] = df['team'].eq(df['home_team']).astype(int)
df = df.drop(columns='team')

# One-hot encode the categorical variables
df = pd.get_dummies(df, columns=['home_team', 'away_team'])

# Split the data into training and testing sets (80/20, fixed seed)
X = df.drop(columns='team_goal')
y = df['team_goal']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit a Gaussian Naive Bayes model
model = GaussianNB()
model.fit(X_train, y_train)

# Make predictions on the testing set
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
confusion_mat = confusion_matrix(y_test, y_pred)
# On hard 0/1 labels the squared error is 1 for a miss and 0 for a hit,
# so this value is the misclassification rate (1 - accuracy).
mse = mean_squared_error(y_test, y_pred)
print(f"Accuracy: {accuracy}")
print(f"Mean Squared Error: {mse}")
print("Confusion matrix:\n", confusion_mat)


Accuracy: 0.61
Mean Squared Error: 0.39
Confusion matrix:
 [[72  6]
 [72 50]]


In this example code, we also load the dataset using pandas and preprocess the data by selecting the relevant columns ('home_team', 'away_team', 'minute', 'own_goal', 'penalty', 'team') and creating a new column 'team_goal' to represent whether the team that scored the goal was the home team (1) or the away team (0).

We then one-hot encode the 'home_team' and 'away_team' columns using the get_dummies function from pandas. We then split the data into training and testing sets using the train_test_split() function.

Next, we fit a Naive Bayes model to the training set using the GaussianNB class from scikit-learn. We then make predictions on the testing set using the model's predict() method and evaluate the performance of the model using the accuracy_score(), mean_squared_error(), and confusion_matrix() functions from scikit-learn.

#RANDOM FOREST

In [None]:
# Import necessary libraries
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, accuracy_score, confusion_matrix

# Load the dataset
df = pd.read_csv("goalscorers.csv")

# Preprocess the data.
# Rows with missing values are dropped *before* the label and the dummies are
# built: Series.eq() yields False for NaN, so a goal with an unknown 'team'
# would otherwise be silently labelled 0, and get_dummies() turns a missing
# home/away team into an all-zero row that a later dropna() cannot detect.
# The explicit .copy() makes df an independent frame, so adding a column
# below does not trigger pandas' SettingWithCopyWarning.
columns = ['home_team', 'away_team', 'minute', 'own_goal', 'penalty', 'team']
df = df[columns].dropna().copy()

# Target: 1 when the scoring team is the home team, 0 otherwise
df['team_goal'] = df['team'].eq(df['home_team']).astype(int)
df = df.drop(columns='team')

# One-hot encode the categorical variables
df = pd.get_dummies(df, columns=['home_team', 'away_team'])

# Split the data into training and testing sets (80/20, fixed seed)
X = df.drop(columns='team_goal')
y = df['team_goal']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a Random Forest Classifier model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Make predictions on the testing set.
# A classifier's predict() already returns the 0/1 class labels, and y is
# already an integer column, so no thresholding or casting is needed.
y_pred = rf_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
# On hard 0/1 labels the squared error is 1 for a miss and 0 for a hit,
# so this value is the misclassification rate (1 - accuracy).
mse = mean_squared_error(y_test, y_pred)
confusion_mat = confusion_matrix(y_test, y_pred)
print(f"Accuracy: {accuracy}")
print(f"Mean Squared Error: {mse}")
print("Confusion matrix:\n", confusion_mat)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['team_goal'] = df['team'].eq(df['home_team']).astype(int)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop('team', axis=1, inplace=True)


Accuracy: 0.6468711656441718
Mean Squared Error: 0.3531288343558282
Confusion matrix:
 [[1749 1506]
 [1372 3523]]


In this example code, we first load the dataset using pandas and preprocess the data by selecting the relevant columns ('home_team', 'away_team', 'minute', 'own_goal', 'penalty', 'team') and creating a new column 'team_goal' to represent whether the team that scored the goal was the home team (1) or the away team (0).

We then one-hot encode the 'home_team' and 'away_team' columns using the get_dummies function from pandas. We then split the data into training and testing sets using the train_test_split() function. Here a Random Forest classifier (RandomForestClassifier with 100 trees) is used instead of a logistic regression model.

#XGBOOST

In [None]:
# Import necessary libraries
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, accuracy_score, confusion_matrix

# Load the dataset
df = pd.read_csv("goalscorers.csv")

# Preprocess the data.
# Rows with missing values are dropped *before* the label and the dummies are
# built: Series.eq() yields False for NaN, so a goal with an unknown 'team'
# would otherwise be silently labelled 0, and get_dummies() turns a missing
# home/away team into an all-zero row that a later dropna() cannot detect.
# The explicit .copy() makes df an independent frame, so the column
# assignments below do not trigger pandas' SettingWithCopyWarning.
columns = ['home_team', 'away_team', 'minute', 'own_goal', 'penalty', 'team']
df = df[columns].dropna().copy()

# Target: 1 when the scoring team is the home team, 0 otherwise
df['team_goal'] = df['team'].eq(df['home_team']).astype(int)
df = df.drop(columns='team')

# Convert the flag columns to integers on the full frame, before splitting,
# so the train and test sets get identical dtypes and no split slice is
# modified in place.
df['own_goal'] = df['own_goal'].astype(int)
df['penalty'] = df['penalty'].astype(int)

# One-hot encode the categorical variables
df = pd.get_dummies(df, columns=['home_team', 'away_team'])

# Split the data into training and testing sets (80/20, fixed seed).
# This cell previously used X_train/X_test/y_train/y_test without defining
# them, so it only ran on variables left over from an earlier cell and
# ignored the frame prepared above.
X = df.drop(columns='team_goal')
y = df['team_goal']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train an XGBoost model through the native (DMatrix / xgb.train) API
params = {
    'objective': 'binary:logistic',
    'max_depth': 3,
    'learning_rate': 0.1,
}
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)
model = xgb.train(params, dtrain)

# Make predictions on the testing set.
# With the 'binary:logistic' objective predict() returns probabilities,
# which are thresholded at 0.5 to obtain class labels.
y_pred = model.predict(dtest)
y_pred_binary = (y_pred > 0.5).astype(int)

# Calculate accuracy based on binary predictions
accuracy = accuracy_score(y_test, y_pred_binary)

# Mean squared error is computed on the predicted probabilities (not the
# thresholded labels), so it is not simply 1 - accuracy here.
mse = mean_squared_error(y_test, y_pred)
confusion_mat = confusion_matrix(y_test, y_pred_binary)
print(f"Accuracy: {accuracy}")
print(f"Mean Squared Error: {mse}")
print("Confusion matrix:\n", confusion_mat)



Accuracy: 0.68
Mean Squared Error: 0.2166228046022539
Confusion matrix:
 [[ 21  57]
 [  7 115]]


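For XGBoost, we import the xgboost library and use its native training API rather than the scikit-learn wrapper: the training and testing sets are wrapped in DMatrix objects, and xgb.train() fits a booster with the 'binary:logistic' objective, max_depth=3, and learning_rate=0.1. The model's predict() method returns probabilities, which we threshold at 0.5 to obtain class labels. Finally, we evaluate the model's performance using the accuracy_score(), mean_squared_error(), and confusion_matrix() functions from scikit-learn; note that the mean squared error here is computed on the predicted probabilities, not on the thresholded labels.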