## Import Dependencies

In [48]:
import pandas as pd
from IPython.display import display
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import RandomOverSampler
from sklearn.naive_bayes import GaussianNB

In [49]:
# Read data and drop redundant column.
dataset = pd.read_csv('E:/学习/UCL/year-3/comp0036/new_epl_training.csv')

# Convert Date column to datetime type
dataset['Date'] = pd.to_datetime(dataset['Date'])

# Preview data.
display(dataset.head())

Unnamed: 0,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,Referee,...,HomeTeam_HomeGoalsRatio,AwayTeam_AwayGoalsRatio,HomeTeam_HomeWinsRatio,AwayTeam_AwayWinsRatio,EPL,HomeTeam_CupGamesLastMonth,AwayTeam_CupGamesLastMonth,HomeTeam_Morale,AwayTeam_Morale,Curse_Parameter
0,2000-08-19,Charlton,Man City,4,0.0,H,2,0,H,Rob Harris,...,0.62,0.512195,0.785714,0.5,1,0,0,0,0,0
1,2000-08-19,Chelsea,West Ham,4,2.0,H,1,0,H,Graham Barber,...,0.647059,0.466667,0.764706,0.4,1,0,0,0,0,0
2,2000-08-19,Coventry,Middlesbrough,1,3.0,A,1,1,D,Barry Knight,...,0.388889,0.590909,0.5,0.555556,1,0,0,0,0,0
3,2000-08-19,Derby,Southampton,2,2.0,D,1,2,A,Andy D'Urso,...,0.621622,0.325,0.8,0.214286,1,0,0,0,0,0
4,2000-08-19,Leeds,Everton,2,0.0,H,2,0,H,Dermot Gallagher,...,0.5625,0.355556,0.55,0.454545,1,0,0,0,0,0


## Preparing the Data

In [50]:
# Handle missing values
dataset.fillna(0, inplace=True)

# Encode categorical variables
le = LabelEncoder()
dataset['FTR'] = le.fit_transform(dataset['FTR'])
dataset['HTR'] = le.fit_transform(dataset['HTR'])
dataset['HomeTeam'] = le.fit_transform(dataset['HomeTeam'])
dataset['AwayTeam'] = le.fit_transform(dataset['AwayTeam'])
dataset['Referee'] = le.fit_transform(dataset['Referee'])
dataset['Season'] = le.fit_transform(dataset['Season'])

# Drop non-numeric and irrelevant columns
X = dataset.drop(['FTR', 'FTHG', 'FTAG', 'Date'], axis=1)
y = dataset['FTR']

# Perform oversampling on the training set
ros = RandomOverSampler(random_state=42)
X_resampled, y_resampled = ros.fit_resample(X,y)

# Split the data into training, validation, and test sets
X_train, X_temp, y_train, y_temp = train_test_split(X_resampled, y_resampled, test_size=0.20, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Standardize the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

# Print the sizes of the sets
print("Training set size: {}".format(len(X_train)))
print("Validation set size: {}".format(len(X_val)))
print("Test set size: {}".format(len(X_test)))

# Show the feature information by printing the first five rows
print("\nFeature values:")
display(X.head())

Training set size: 13001
Validation set size: 1625
Test set size: 1626

Feature values:


Unnamed: 0,HomeTeam,AwayTeam,HTHG,HTAG,HTR,Referee,HS,AS,HST,AST,...,HomeTeam_HomeGoalsRatio,AwayTeam_AwayGoalsRatio,HomeTeam_HomeWinsRatio,AwayTeam_AwayWinsRatio,EPL,HomeTeam_CupGamesLastMonth,AwayTeam_CupGamesLastMonth,HomeTeam_Morale,AwayTeam_Morale,Curse_Parameter
0,67,170,2,0,3,140,17,8,14,4,...,0.62,0.512195,0.785714,0.5,1,0,0,0,0,0
1,68,305,1,0,3,66,17,12,10,5,...,0.647059,0.466667,0.764706,0.4,1,0,0,0,0,0
2,75,175,1,1,2,20,6,16,3,9,...,0.388889,0.590909,0.5,0.555556,1,0,0,0,0,0
3,81,258,1,2,1,13,6,13,4,6,...,0.621622,0.325,0.8,0.214286,1,0,0,0,0,0
4,140,100,2,0,3,47,17,12,8,6,...,0.5625,0.355556,0.55,0.454545,1,0,0,0,0,0


## Training and Evaluating Models

In [51]:
#for measuring training time
from time import time 
# F1 score (also F-score or F-measure) is a measure of a test's accuracy. 
#It considers both the precision p and the recall r of the test to compute 
#the score: p is the number of correct positive results divided by the number of 
#all positive results, and r is the number of correct positive results divided by 
#the number of positive results that should have been returned. The F1 score can be 
#interpreted as a weighted average of the precision and recall, where an F1 score 
#reaches its best value at 1 and worst at 0.
from sklearn.metrics import f1_score

def train_classifier(clf, X_train, y_train):
    ''' Fits a classifier to the training data. '''
    
    # Start the clock, train the classifier, then stop the clock
    start = time()
    clf.fit(X_train, y_train)
    end = time()
    
    # Print the results
    print("Trained model in {:.4f} seconds".format(end - start))

    
def predict_labels(clf, features, target):
    ''' Makes predictions using a fit classifier based on F1 score. '''
    
    # Start the clock, make predictions, then stop the clock
    start = time()
    y_pred = clf.predict(features)
    end = time()
    
    # Print and return results
    print("Made predictions in {:.4f} seconds.".format(end - start))
    
    return f1_score(target, y_pred, average='micro'), sum(target == y_pred) / float(len(y_pred))

def train_predict(clf, X_train, y_train, X_test, y_test):

    ''' Train and predict using a classifer based on F1 score. '''
    
    # Indicate the classifier and the training set size
    print("Training a {} using a training set size of {}. . .".format(clf.__class__.__name__, len(X_train)))
    
    # Train the classifier
    train_classifier(clf, X_train, y_train)
    
    # Print the results of prediction for both training and testing
    f1, acc = predict_labels(clf, X_train, y_train)
    print("F1 score and accuracy score for training set: {:.4f} , {:.4f}.".format(f1 , acc))
    
    f1, acc = predict_labels(clf, X_test, y_test)
    print("F1 score and accuracy score for test set: {:.4f} , {:.4f}.".format(f1 , acc))

## Make Prediction

In [52]:
# Initialize and train the NB model
clf_nb = GaussianNB()
clf_nb.fit(X_train, y_train)

# Perform 5-fold cross-validation
cv_scores = cross_val_score(clf_nb, X_train, y_train, cv=5, scoring='accuracy')

# Print the accuracy scores for each fold
print("Cross-validation Scores:", cv_scores)
print("Mean Accuracy:", cv_scores.mean())

# Evaluate the model on the test set
train_predict(clf_nb, X_train, y_train, X_test, y_test)

Cross-validation Scores: [0.76701269 0.75923077 0.75884615 0.75615385 0.76923077]
Mean Accuracy: 0.7620948451778901
Training a GaussianNB using a training set size of 13001. . .
Trained model in 0.0715 seconds
Made predictions in 0.1145 seconds.
F1 score and accuracy score for training set: 0.7649 , 0.7649.
Made predictions in 0.0411 seconds.
F1 score and accuracy score for test set: 0.7731 , 0.7731.
