# Predicting whether & how a Red Devil will play in the next match 

### importing packages

In [1]:
import warnings
warnings.filterwarnings("ignore", message="numpy.dtype size changed")
warnings.filterwarnings("ignore", message="numpy.ufunc size changed")

import numpy as np
import pandas as pd
import time
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.cluster import KMeans

%matplotlib inline

### importing dataset

In [2]:
# Load the player-performance CSV (path is relative to the working directory)
# into the notebook-wide `data` frame.
dataset_path = 'Dataset_ManUtd_Player_Performance_2019-20.csv'
data = pd.read_csv(dataset_path)

In [3]:
# Preview the first five rows of the freshly loaded frame.
data.head(n=5)

Unnamed: 0,Last Name,First Name,ID,Age,Appearances this EPL Season,Appearances in last 5 matches,Goals Scored this EPL Season,Goals Assisted this EPL Season,Own Goals this EPL Season,Goals Scored in last 5 matches,...,Composure,Marking,StandingTackle,SlidingTackle,GKDiving,GKHandling,GKKicking,GKPositioning,GKReflexes,Release Clause
0,Rashford,Marcus,1,22,12,5.0,6.0,3.0,,5.0,...,73.0,32.0,40.0,33.0,11.0,6.0,15.0,7.0,14.0,€55.4M
1,Wan Bissaka,Aaron,2,21,10,5.0,,,,,...,61.0,78.0,76.0,82.0,9.0,8.0,6.0,8.0,8.0,€16.8M
2,McTominay,Scott,3,22,12,5.0,3.0,1.0,,2.0,...,74.0,68.0,69.0,64.0,9.0,6.0,6.0,10.0,11.0,€7.8M
3,Pogba,Paul,4,26,5,,,2.0,,,...,87.0,66.0,70.0,68.0,5.0,6.0,2.0,4.0,3.0,€123.2M
4,Martial,Anthony,5,23,7,5.0,3.0,3.0,,2.0,...,79.0,38.0,39.0,36.0,9.0,8.0,8.0,15.0,11.0,€87.1M


In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) per column.
data.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,ID,Age,Appearances this EPL Season,Appearances in last 5 matches,Goals Scored this EPL Season,Goals Assisted this EPL Season,Own Goals this EPL Season,Goals Scored in last 5 matches,Goals Assited in last 5 matches,Own Goals in last 5 matches,...,Penalties,Composure,Marking,StandingTackle,SlidingTackle,GKDiving,GKHandling,GKKicking,GKPositioning,GKReflexes
count,28.0,28.0,28.0,16.0,6.0,7.0,0.0,5.0,4.0,0.0,...,25.0,25.0,25.0,25.0,25.0,25.0,25.0,25.0,25.0,25.0
mean,14.5,24.5,5.821429,3.875,3.0,1.857143,,2.2,1.75,,...,57.2,70.84,57.44,58.44,55.8,16.4,15.48,16.0,15.96,16.4
std,8.225975,4.434712,4.538343,1.454877,1.67332,0.899735,,1.643168,0.957427,,...,15.416441,10.69143,21.469125,21.777435,22.737634,20.645823,19.927201,20.812656,20.421557,21.817424
min,1.0,18.0,0.0,1.0,1.0,1.0,,1.0,1.0,,...,16.0,37.0,14.0,11.0,13.0,5.0,6.0,2.0,4.0,3.0
25%,7.75,21.75,1.75,3.0,2.25,1.0,,1.0,1.0,,...,48.0,67.0,42.0,40.0,36.0,9.0,7.0,8.0,7.0,8.0
50%,14.5,24.5,5.0,4.5,3.0,2.0,,2.0,1.5,,...,56.0,73.0,67.0,68.0,64.0,10.0,8.0,11.0,10.0,11.0
75%,21.25,27.0,10.25,5.0,3.0,2.5,,2.0,2.25,,...,66.0,76.0,76.0,76.0,76.0,14.0,15.0,14.0,15.0,14.0
max,28.0,34.0,12.0,5.0,6.0,3.0,,5.0,3.0,,...,81.0,87.0,81.0,84.0,82.0,90.0,85.0,87.0,88.0,94.0


# check for missing values

In [5]:
# Boolean missing-value mask for the first five rows (True where a cell is NaN).
data.head().isna()

Unnamed: 0,Last Name,First Name,ID,Age,Appearances this EPL Season,Appearances in last 5 matches,Goals Scored this EPL Season,Goals Assisted this EPL Season,Own Goals this EPL Season,Goals Scored in last 5 matches,...,Composure,Marking,StandingTackle,SlidingTackle,GKDiving,GKHandling,GKKicking,GKPositioning,GKReflexes,Release Clause
0,False,False,False,False,False,False,False,False,True,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,True,True,True,True,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,True,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,True,True,False,True,True,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,True,False,...,False,False,False,False,False,False,False,False,False,False


Count the missing values in each column.

In [6]:
# Number of missing entries in each column.
data.isnull().sum(axis=0)

Last Name                      1
First Name                     0
ID                             0
Age                            0
Appearances this EPL Season    0
                              ..
GKHandling                     3
GKKicking                      3
GKPositioning                  3
GKReflexes                     3
Release Clause                 5
Length: 116, dtype: int64

# fill the missing values

Rename the columns to short names (assigned by position, so the list must follow the column order of the CSV).

In [7]:
# Replace the CSV headers with short names. The assignment is positional:
# entry i becomes the label of column i, so the order below must match the
# column order of the loaded frame (116 columns).
short_column_names = [
    'LastName', 'FirstName', 'ID', 'Age',
    'AppsSeason', 'Apps5',
    'GoalsSeason', 'GoalsAssistedSeason', 'OGSeason',
    'Goals5', 'GoalsAsst5', 'OG5',
    'GoalsConcededSeason', 'CleanSheetsSeason', 'GoalsConceded5', 'CleanSheet5',
    'SubsOnSeason', 'SubsOffSeason', 'SubsOn5', 'SubsOff5',
    'Yellow1Season', 'Yellow2Season', 'RedSeason', 'Yellow5', 'Red5',
    'PenaltySeason', 'Penalty5',
    'MinGoalSeason', 'MinPlayedSeason', 'Min5',
    'Injury', 'FormSeason', 'Form5',
    'Country', 'Overall', 'Potential', 'ClubNameLastSeason', 'ClubLogo',
    'Value', 'Wage', 'Special', 'Foot', 'Rep', 'WeakFoot', 'SkillMoves',
    'WorkRate', 'BodyType', 'RealFace', 'Position', 'JerseyNo',
    'Joined', 'LoanedFrom', 'ContractDeadline', 'Height', 'Weight',
    'LS', 'ST', 'RS', 'LW', 'LF', 'CF', 'RF', 'RW',
    'LAM', 'CAM', 'RAM', 'LM', 'LCM', 'CM', 'RCM', 'RM',
    'LWB', 'LDM', 'CDM', 'RDM', 'RWB',
    'LB', 'LCB', 'CB', 'RCB', 'RB',
    'Crossing', 'Finishing', 'HeadingAccuracy', 'ShortPassing', 'Volleys',
    'Dribbling', 'Curve', 'FKAccuracy', 'LongPassing', 'BallControl',
    'Acceleration', 'SprintSpeed', 'Agility', 'Reactions', 'Balance',
    'ShotPower', 'Jumping', 'Stamina', 'Strength', 'LongShots',
    'Aggression', 'Interceptions', 'Positioning', 'Vision', 'Penalties',
    'Composure', 'Marking', 'StandingTackle', 'SlidingTackle',
    'GKDiving', 'GKHandling', 'GKKicking', 'GKPositioning', 'GKReflexes',
    'ReleaseClause',
]
data.columns = short_column_names

# Preview the frame with its new headers.
data.head(n=5)

Unnamed: 0,LastName,FirstName,ID,Age,AppsSeason,Apps5,GoalsSeason,GoalsAssistedSeason,OGSeason,Goals5,...,Composure,Marking,StandingTackle,SlidingTackle,GKDiving,GKHandling,GKKicking,GKPositioning,GKReflexes,ReleaseClause
0,Rashford,Marcus,1,22,12,5.0,6.0,3.0,,5.0,...,73.0,32.0,40.0,33.0,11.0,6.0,15.0,7.0,14.0,€55.4M
1,Wan Bissaka,Aaron,2,21,10,5.0,,,,,...,61.0,78.0,76.0,82.0,9.0,8.0,6.0,8.0,8.0,€16.8M
2,McTominay,Scott,3,22,12,5.0,3.0,1.0,,2.0,...,74.0,68.0,69.0,64.0,9.0,6.0,6.0,10.0,11.0,€7.8M
3,Pogba,Paul,4,26,5,,,2.0,,,...,87.0,66.0,70.0,68.0,5.0,6.0,2.0,4.0,3.0,€123.2M
4,Martial,Anthony,5,23,7,5.0,3.0,3.0,,2.0,...,79.0,38.0,39.0,36.0,9.0,8.0,8.0,15.0,11.0,€87.1M


Remove columns that are not needed for the analysis.

In [8]:
# Columns the notebook does not use; removed by label.
unused_columns = ['ClubNameLastSeason', 'ClubLogo', 'WeakFoot', 'RealFace', 'JerseyNo']
data = data.drop(columns=unused_columns)

Replace missing values with `0` in the columns `AppsSeason` through `Form5`.

In [9]:
# Replace missing values with 0 in the columns at positions 4..32
# (`AppsSeason` through `Form5` after the rename).
#
# The fill is assigned back by column label: calling
# `fillna(..., inplace=True)` on `data.iloc[:, 4:33]` operates on a temporary
# slice, so whether `data` itself changes depends on pandas' view-vs-copy
# behaviour (and with copy-on-write it never does).
stat_columns = data.columns[4:33]
data[stat_columns] = data[stat_columns].fillna(0)

# scatter plot

In [None]:
def scatter_plot(xlabel, ylabel):
    """Scatter one column of the notebook-level ``data`` frame against another.

    Parameters
    ----------
    xlabel : str
        Column of ``data`` plotted on the x-axis; also used as the x-axis label.
    ylabel : str
        Column of ``data`` plotted on the y-axis; also used as the y-axis label.
    """
    x_values, y_values = data[xlabel], data[ylabel]
    plt.scatter(x_values, y_values)
    # Label the axes the scatter was drawn on with the column names.
    axes = plt.gca()
    axes.set_xlabel(xlabel)
    axes.set_ylabel(ylabel)

In [None]:
# Minutes played vs. form over the season. The columns are referenced by the
# short names assigned in the rename cell; the original CSV headers no longer
# exist on `data` at this point and would raise KeyError.
scatter_plot('MinPlayedSeason', 'FormSeason')

In [None]:
# Cluster the players on season minutes played and season form.
# Columns are referenced by their renamed labels (the original CSV headers no
# longer exist on `data`). Missing values are filled with 0, matching the
# notebook's earlier fill step, because KMeans rejects NaN input.
# NOTE(review): the two features are clustered unscaled — confirm that is intended.
cluster_features = data[['MinPlayedSeason', 'FormSeason']].fillna(0)

# A fixed random_state makes the cluster ids reproducible across kernel
# restarts; n_init is spelled out so the result does not depend on the
# installed scikit-learn version's default.
km = KMeans(n_clusters=4, n_init=10, random_state=42)
y_predicted = km.fit_predict(cluster_features)
y_predicted

In [None]:
# Attach each row's predicted cluster id to the frame as a new column.
data['cluster'] = y_predicted

# Preview the frame with the added `cluster` column.
data.head(n=5)

In [None]:
# Display the fitted cluster centres; the plotting cell below reads column 0
# as the x coordinate and column 1 as the y coordinate of each centre.
km.cluster_centers_

In [None]:
# Plot the clustered players with one colour per cluster, plus the centroids.
# Columns are referenced by their renamed labels; the original CSV headers no
# longer exist on `data` and would raise KeyError.
xlabel = 'MinPlayedSeason'
ylabel = 'FormSeason'

# One colour per cluster id (0..3), in the same order as before.
cluster_colors = ['black', 'orange', 'red', 'green']

# Per-cluster subsets; the df1..df4 names are kept in case later cells use them.
cluster_frames = [data[data['cluster'] == cluster_id]
                  for cluster_id in range(len(cluster_colors))]
df1, df2, df3, df4 = cluster_frames

fig, ax = plt.subplots()
for cluster_id, (members, color) in enumerate(zip(cluster_frames, cluster_colors)):
    # Label every cluster so the legend identifies the colours, not just the centroid.
    ax.scatter(members[xlabel], members[ylabel], color=color,
               label='cluster {}'.format(cluster_id))

# Centroid columns follow the feature order used when fitting: x first, then y.
ax.scatter(km.cluster_centers_[:, 0], km.cluster_centers_[:, 1],
           color='purple', marker='*', label='centroid')
ax.set_xlabel(xlabel)
ax.set_ylabel(ylabel)
ax.legend();