In [28]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, mean_squared_error

attr_cols = ['crossing', 'finishing', 'heading_accuracy','short_passing', 'volleys', 'dribbling', 'free_kick_accuracy',
       'long_passing', 'ball_control', 'acceleration', 'sprint_speed',
       'agility', 'reactions', 'balance', 'shot_power', 'jumping', 'stamina',
       'strength', 'long_shots', 'aggression', 'interceptions', 'positioning',
       'vision', 'penalties', 'marking', 'standing_tackle', 'sliding_tackle',
       'gk_positioning','gk_reflexes']

x_cols = [ "%s_%s"%(c,attr)  for c in ['h','a'] for attr in attr_cols]
x_norm_cols = [ "%s_%s"%(c,attr)  for c in ['h','a'] for attr in attr_cols if attr!='form']
y_cols = ['Result']

full_data = pd.read_csv('Full_Data_feats_ha.csv' )
#Finding misclassifications

test_data = pd.read_csv('Test_Data_feats.csv')

def misclassification_report(df, y_pred, y_test ):
    
    misclassified = pd.DataFrame()
    
    for count,value  in enumerate(y_test.iterrows()):
        
        if y_pred[count]!=int(value[1]['Result']):
            new_row = df.iloc[value[0]]
            new_row['Predicted'] = y_pred[count]
            misclassified = misclassified.append(new_row)
    return misclassified


In [29]:
def normalize(df):
    for key in df:
        if key in x_norm_cols:
            mn = df[key].min()
            mx = df[key].max()
            diff = mx - mn
            df[key] = df[key].apply(lambda x : (x-mn)/diff)
    return df

tr_data = normalize(full_data)
te_data = normalize(test_data)

#take difference and store data 

for i in attr_cols:
    tr_data['%s'%i] = tr_data["h_%s"%i] - tr_data["a_%s"%i]
    
for i in attr_cols:
    te_data['%s'%i] = te_data["h_%s"%i] - te_data["a_%s"%i]
    
tr_data_diff = tr_data[attr_cols]

In [60]:
import numpy as np
#One hot encoding the y values 
from sklearn.preprocessing import OneHotEncoder

onehot_encoder = OneHotEncoder(sparse=False)
results = np.array(full_data['Result'])
y_all = onehot_encoder.fit_transform(results.reshape(-1,1))

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [61]:
x_train = tr_data_diff[attr_cols]
y_train = y_all
x_test = te_data[attr_cols]
print(y_train)

[[1. 0. 0.]
 [0. 1. 0.]
 [0. 0. 1.]
 ...
 [0. 0. 1.]
 [1. 0. 0.]
 [0. 0. 1.]]


In [None]:
import keras
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout

def create_model(x_train,y_train):
    """ Creates a Sequential model made out of Densely connected Neural Network layers. All parameters are configured as per specification for Q4"""

    model = Sequential()
    #Input Layer with 61 inputs corresponding to 61 features 
    model.add(Dense(100,input_dim = 29, activation = 'relu'))

    model.add(Dense(500,activation = 'relu'))
    model.add(Dense(200,activation = 'relu'))
    model.add(Dense(3,activation = 'relu'))
    
    #Compiling the model with appropriate parameters
    model.compile(loss = 'mse', optimizer ='adam',metrics = ['accuracy'])

    result = model.fit(x_train,y_train,epochs = 50, batch_size = 100)

    #Store the model, its average training accuracy and validation accuracy over 10 epochs
    return model

#Training neural network
print(len(x_train),len(x_test))
model = create_model(x_train,y_train)
pred_y = model.predict(x_test)
#score = model.evaluate(x_test,y_test)
#print(score)
print(pred_y)

In [None]:
# #Training linear regression on the goal difference 
# from sklearn.linear_model import LinearRegression
# import math
# lr = LinearRegression()
# lr.fit(x_train,y_train['Goal_Diff'])
# y_pred = lr.predict(x_test[x_cols])
# goals = [math.ceil(value) for value in y_pred]

# pred_res = [1 if value > 0 else (-1 if value < 0 else 0) for value in goals]
# z = accuracy_score(y_test['Result'],pred_res)
# print(z)

In [69]:
inv = onehot_encoder.inverse_transform(pred_y)
print(pred_y)
#print(inv.reshape(1,-1)[0])

[[0.7640774  0.         0.        ]
 [0.         0.         0.6508661 ]
 [1.043925   0.         0.        ]
 ...
 [0.2589697  0.         0.        ]
 [1.6375492  0.         0.        ]
 [0.         0.         0.80708086]]


In [58]:
#Train SVM
from sklearn.svm import LinearSVC, SVC

svm = LinearSVC(C = 0.25, loss = 'hinge',penalty = 'l2')
svm.fit(x_train,y_train)
# print(svm.coef_)
# print(svm.intercept_)
y_pred_svm = svm.predict(x_test)
#y =accuracy_score(y_test,y_pred)
print(y_pred)
#mc = misclassification_report(full_data,y_pred,y_test)
#print(mc[x_cols + y_cols + ['Predicted']])


[1 3 1 1 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 1 1 1 1 3 1 1 1 3 1 1 1 1 1 1 3 1
 2 3 3 1 1 1 1 1 3 3 1 1 1 1 1 2 1 1 1 1 3 3 1 3 1 1 3 1 3 1 1 1 1 1 1 3 1
 1 1 1 3 3 1 1 1 1 1 1 3 1 3 1 1 1 1 1 1 1 1 3 3 1 1 3 2 1 1 1 1 1 1 1 1 1
 1 1 1 1 3 1 1 1 1 1 1 1 1 1 3 1 1 1 1 3 1 1 1 1 1 3 1 3 1 1 1 1 3 3 3 1 1
 1 1 3 1 1 1 1 1 1 3 3 1 1 3 1 1 1 1 1 3 3 1 1 1 1 1 1 1 1 1 2 3 3 1 1 1 1
 1 1 1 3 1 1 1 1 1 1 1 3 1 3 1 1 1 3 1 1 1 1 1 3 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 3 3 1 3 1 1 1 1 1 3 1 1 1 1 1 1 1 3 1 1 1 3 1 1 3 1 1 1 1 1 1
 1 1 3 3 1 1 2 1 1 1 1 1 1 3 1 3 1 1 1 1 1 3 1 1 3 3 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 3 1 1 1 3 3 1 1 1 1 1 1 1 3 3 1 3 1 1 1 1 1 1 1 3 1 1 3 1 1 1 1 1 3
 1 1 1 1 3 1 1 1 1 3 1 2 1 1 1 1 1 1 3 1 1 1 1 1 1 1 1 1 3 1 1 1 3 1 3 1 1
 2 1 1 1 1 1 1 1 1 1]


  y = column_or_1d(y, warn=True)


In [None]:
#Training decision tree
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

dt = DecisionTreeClassifier(criterion = 'gini', max_depth = 4)

dt.fit(x_train,y_train)

y_pred = dt.predict(x_test)

#y_test_synth = [-1 if total < 0 else (1 if total > 0 else 0) for total in x_test['Total'].values] 

a = accuracy_score(y_test,y_pred)
print(a)
# print(te_y.values)
#mc = misclassification_report(full_data, y_pred,y_test)
#print(mc)

In [None]:
from sklearn.neighbors import KNeighborsClassifier


knn = KNeighborsClassifier(n_neighbors=8)

knn.fit(x_train,y_train)

knn_pred = knn.predict(x_test)
ak = accuracy_score(x_test,knn_pred)

print(ak)


In [40]:
print(y_pred_svm)

[1 3 1 1 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 1 1 1 1 3 1 1 1 3 1 1 1 1 1 1 3 1
 2 3 3 1 1 1 1 1 3 3 1 1 1 1 1 2 1 1 1 1 3 3 1 3 1 1 3 1 3 1 1 1 1 1 1 3 1
 1 1 1 3 3 1 1 1 1 1 1 3 1 3 1 1 1 1 1 1 1 1 3 3 1 1 3 2 1 1 1 1 1 1 1 1 1
 1 1 1 1 3 1 1 1 1 1 1 1 1 1 3 1 1 1 1 3 1 1 1 1 1 3 1 3 1 1 1 1 3 3 3 1 1
 1 1 3 1 1 1 1 1 1 3 3 1 1 3 1 1 1 1 1 3 3 1 1 1 1 1 1 1 1 1 2 3 3 1 1 1 1
 1 1 1 3 1 1 1 1 1 1 1 3 1 3 1 1 1 3 1 1 1 1 1 3 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 3 3 1 3 1 1 1 1 1 3 1 1 1 1 1 1 1 3 1 1 1 3 1 1 3 1 1 1 1 1 1
 1 1 3 3 1 1 2 1 1 1 1 1 1 3 1 3 1 1 1 1 1 3 1 1 3 3 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 3 1 1 1 3 3 1 1 1 1 1 1 1 3 3 1 3 1 1 1 1 1 1 1 3 1 1 3 1 1 1 1 1 3
 1 1 1 1 3 1 1 1 1 3 1 2 1 1 1 1 1 1 3 1 1 1 1 1 1 1 1 1 3 1 1 1 3 1 3 1 1
 2 1 1 1 1 1 1 1 1 1]


In [67]:
norm_team_name = {}
norm_team_name['Man Utd'] = 'Manchester United'
norm_team_name['Newcastle'] = 'Newcastle United'
norm_team_name['Huddersfield'] = 'Huddersfield Town'
norm_team_name['Wolves'] = 'Wolverhampton Wanderers'
norm_team_name['Cardiff'] = 'Cardiff City'
norm_team_name['Leicester'] = 'Leicester City'
norm_team_name['Spurs'] = 'Tottenham Hotspur'
norm_team_name['West Ham'] = 'West Ham United'
norm_team_name['Brighton'] = 'Brighton & Hove Albion'
norm_team_name['Man City'] = 'Manchester City'
norm_team_name['West Ham'] = 'West Ham United'
norm_team_name['Bournemouth'] = 'Bournemouth'

def norm_player_names(all_players_info):
    
    for key,value in norm_team_name.items():
        all_players_info.loc[lambda df: df['Home Team'] == key, 'Home Team'] = value
        all_players_info.loc[lambda df: df['Away Team'] == key, 'Away Team'] = value
    
    return all_players_info


def get_fixtures(year):
    
    fixtures = pd.read_csv('fifa-19-player-database/epl-2018-GMTStandardTime.csv', encoding = "ISO-8859-1")
    fixtures = norm_player_names(fixtures)
    
    return fixtures
    

def generate_final_table(fixtures, results):
    team_points = {}

    for i in range(10):
        team_points[fixtures.iloc[i]['Home Team']] = 0
        team_points[fixtures.iloc[i]['Away Team']] = 0

    for i in range(380):
        result = results[i]
        fixture = fixtures.iloc[i]
        if(result == 1):
            team_points[fixture['Home Team']] += 3
        elif(result == 2):
            team_points[fixture['Home Team']] += 1
            team_points[fixture['Away Team']] += 1
        elif(result == 3):
            team_points[fixture['Away Team']] += 3

    sorted_team_points = sorted(team_points.items(), key=lambda kv: kv[1])
    sorted_team_points.reverse()
    return dict(sorted_team_points)


fixtures = get_fixtures('2019')

final_table = pd.DataFrame.from_dict(generate_final_table(fixtures, inv.reshape(1,-1)[0]), orient = 'index')
final_table

Unnamed: 0,0
Tottenham Hotspur,102
Chelsea,99
Manchester City,93
Arsenal,84
Liverpool,78
Brighton & Hove Albion,78
Wolverhampton Wanderers,63
Manchester United,60
Leicester City,57
Fulham,51
