In [46]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# MENAMPILKAN DATA

In [80]:
# Load the player statistics; the relative path is resolved against the
# notebook's current working directory.
datasplit = pd.read_csv("players.csv")
# Preview the first three rows.
datasplit.head(n=3)

Unnamed: 0.1,Unnamed: 0,Name,Position,Age,Markey Value In Millions(£),Country,Club,Matches,Goals,Own Goals,Assists,Yellow Cards,Second Yellow Cards,Red Cards,Number Of Substitute In,Number Of Substitute Out
0,0,Kylian Mbappé,Centre-Forward,22.0,144.0,France,Paris Saint-Germain,16.0,7.0,0.0,,3.0,0.0,0.0,0.0,8.0
1,1,Erling Haaland,Centre-Forward,21.0,135.0,Norway,Borussia Dortmund,10.0,13.0,0.0,,1.0,0.0,0.0,0.0,1.0
2,2,Harry Kane,Centre-Forward,28.0,108.0,England,Tottenham Hotspur,16.0,7.0,0.0,2.0,2.0,0.0,0.0,2.0,2.0


# MELAKUKAN DATA SPLITTING x dan y

In [48]:
# Features (x): every column except the last one.
# Target (y): the last column of the frame.
x, y = datasplit.iloc[:, :-1], datasplit.iloc[:, -1]

# TRAINING SET 70:30

In [49]:
# 70:30 train/test split. A fixed random_state makes the shuffle
# deterministic, so "Restart Kernel -> Run All" reproduces the same partition.
# NOTE(review): x and y are taken from datasplit before the imputation cells
# further down run, so these subsets appear to keep the original missing
# values -- confirm whether the split should happen after cleaning.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=42
)

In [50]:
# Report the shape of each subset produced by the split.
for label, subset in (
    ("Dimensi x_train :", x_train),
    ("Dimensi x_test :", x_test),
    ("Dimensi y_train :", y_train),
    ("Dimensi y_test:", y_test),
):
    print(label, subset.shape)

Dimensi x_train : (350, 15)
Dimensi x_test : (151, 15)
Dimensi y_train : (350,)
Dimensi y_test: (151,)


# Data cleaning pada data dengan nilai null

In [51]:
# Count the missing values in every column (isnull is an alias of isna).
datasplit.isnull().sum()

Unnamed: 0                     0
Name                           1
Position                       1
Age                            1
Markey Value In Millions(£)    1
Country                        1
Club                           1
Matches                        1
Goals                          1
Own Goals                      1
Assists                        4
Yellow Cards                   1
Second Yellow Cards            1
Red Cards                      1
Number Of Substitute In        1
Number Of Substitute Out       1
dtype: int64

# Data cleaning: mengisi nilai NULL pada atribut "Assists", "Goals", dan "Age" dengan nilai rata-rata (mean).

In [52]:
from sklearn.impute import SimpleImputer
# Imputer that replaces NaN entries with the mean of the column it is fitted
# on (missing_values=np.nan is the library default, spelled out here).
imputer = SimpleImputer(missing_values=np.nan, strategy="mean")

In [53]:
# Fill missing Assists values using the imputer (refit on this column).
# The list selector keeps the input 2-D, which fit_transform requires.
datasplit["Assists"] = imputer.fit_transform(datasplit.loc[:, ["Assists"]])

In [54]:
# Fill missing Goals values using the imputer (refit on this column).
# The list selector keeps the input 2-D, which fit_transform requires.
datasplit["Goals"] = imputer.fit_transform(datasplit.loc[:, ["Goals"]])

In [55]:
# Fill missing Age values using the imputer (refit on this column).
# The list selector keeps the input 2-D, which fit_transform requires.
datasplit["Age"] = imputer.fit_transform(datasplit.loc[:, ["Age"]])

In [56]:
# Re-count the missing values per column to check the imputed columns
# (isnull is an alias of isna).
datasplit.isnull().sum()

Unnamed: 0                     0
Name                           1
Position                       1
Age                            0
Markey Value In Millions(£)    1
Country                        1
Club                           1
Matches                        1
Goals                          0
Own Goals                      1
Assists                        0
Yellow Cards                   1
Second Yellow Cards            1
Red Cards                      1
Number Of Substitute In        1
Number Of Substitute Out       1
dtype: int64

# Data cleaning pada data dengan nilai duplikat.

In [None]:
# Display the rows that repeat an earlier row in every column.
# NOTE(review): this only lists the duplicates; nothing here removes them
# from datasplit -- confirm whether a drop_duplicates() step is intended.
datasplit.loc[datasplit.duplicated()]

Unnamed: 0.1,Unnamed: 0,Name,Position,Age,Markey Value In Millions(£),Country,Club,Matches,Goals,Own Goals,Assists,Yellow Cards,Second Yellow Cards,Red Cards,Number Of Substitute In,Number Of Substitute Out
500,350,Mattéo Guendouzi,Central Midfield,22.0,19.8,France,Olympique Marseille,17.0,3.0,0.0,3.0,2.0,0.0,0.0,1.0,5.0


# Normalisasi data pada atribut "Assists", "Goals", dan "Age" menggunakan Min-Max Scaler

In [59]:
# Independent (deep) copy of the cleaned frame.
# NOTE(review): in the cells visible here datasplit2 is reassigned during the
# one-hot encoding step without being read first -- confirm it is still needed.
datasplit2 = datasplit.copy(deep=True)

In [65]:
from sklearn.preprocessing import MinMaxScaler

# Rescale Assists, Goals and Age with a default MinMaxScaler (each column is
# mapped onto the [0, 1] range).
min_max_scaler = MinMaxScaler()
X_datasplit2 = min_max_scaler.fit_transform(datasplit[["Assists", "Goals", "Age"]])

# Wrap the scaled array in a frame. Its columns are positional (0, 1, 2) and
# follow the selection order Assists, Goals, Age.
data_normalization = pd.DataFrame(data=X_datasplit2)
data_normalization.head(n=5)

Unnamed: 0,0,1,2
0,0.124245,0.304348,0.3
1,0.124245,0.565217,0.25
2,0.166667,0.304348,0.6
3,0.25,0.086957,0.5
4,0.5,0.652174,0.65


# Standardisasi pada dataset (atribut "Assists", "Goals", dan "Age")

In [66]:
from sklearn.preprocessing import StandardScaler

In [68]:
# Standardise Assists, Goals and Age with a default StandardScaler (each
# column is centred on its mean and scaled to unit variance).
ss = StandardScaler()
# This rebinds X_datasplit2, replacing the Min-Max result stored under the
# same name.
X_datasplit2 = ss.fit_transform(datasplit[["Assists", "Goals", "Age"]])

# Wrap the scaled array in a frame. Its columns are positional (0, 1, 2) and
# follow the selection order Assists, Goals, Age.
data_standarization = pd.DataFrame(data=X_datasplit2)
data_standarization.head(n=5)

Unnamed: 0,0,1,2
0,0.0,1.686754,-0.935637
1,0.0,3.776044,-1.252373
2,0.283247,1.686754,0.964777
3,0.839666,-0.054322,0.331306
4,2.508921,4.472474,1.281513


# Mengubah tipe data salah satu atribut numerik ("Age") menjadi integer

In [71]:
# Cast Age to an integer dtype. The float-to-int cast drops any fractional
# part, so an Age value that was filled with the column mean loses its
# decimals here. (The bare `datasplit.dtypes` expression that used to precede
# this line was never displayed, because only a cell's last expression is
# shown; the following cell displays the dtypes.)
datasplit["Age"] = datasplit["Age"].astype("int")

In [73]:
# Display the dtype of every column to verify the Age conversion.
datasplit.dtypes

Unnamed: 0                      object
Name                            object
Position                        object
Age                              int32
Markey Value In Millions(£)    float64
Country                         object
Club                            object
Matches                        float64
Goals                          float64
Own Goals                      float64
Assists                        float64
Yellow Cards                   float64
Second Yellow Cards            float64
Red Cards                      float64
Number Of Substitute In        float64
Number Of Substitute Out       float64
dtype: object

# Melakukan One Hot Encoding pada Dataset

In [74]:
from sklearn.preprocessing import OneHotEncoder

In [76]:
# One-hot encode the categorical "Position" column into a dense array.
# scikit-learn 1.2 renamed `sparse` to `sparse_output` and 1.4 removed the old
# keyword, so try the current spelling first and fall back for older installs.
try:
    onehotencoder = OneHotEncoder(sparse_output=False)
except TypeError:
    onehotencoder = OneHotEncoder(sparse=False)
VG_encoded2 = onehotencoder.fit_transform(datasplit[["Position"]])

# Reuse datasplit's index so the join pairs each encoded row with its source
# row even if the index is not a default 0..n-1 range. The encoded columns
# keep their positional integer labels.
datasplit2 = pd.DataFrame(VG_encoded2, index=datasplit.index)

# Drop encoded columns left over from a previous run of this cell before
# joining; otherwise re-running raises on overlapping column labels. On the
# first run nothing is dropped.
datasplit = datasplit.drop(columns=datasplit2.columns, errors="ignore").join(datasplit2)

In [78]:
# Preview the first five rows, including the appended one-hot columns.
datasplit.head(n=5)

Unnamed: 0.1,Unnamed: 0,Name,Position,Age,Markey Value In Millions(£),Country,Club,Matches,Goals,Own Goals,...,4,5,6,7,8,9,10,11,12,13
0,0,Kylian Mbappé,Centre-Forward,22,144.0,France,Paris Saint-Germain,16.0,7.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,Erling Haaland,Centre-Forward,21,135.0,Norway,Borussia Dortmund,10.0,13.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,Harry Kane,Centre-Forward,28,108.0,England,Tottenham Hotspur,16.0,7.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,Jack Grealish,Left Winger,26,90.0,England,Manchester City,15.0,2.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4,Mohamed Salah,Right Winger,29,90.0,Egypt,Liverpool FC,15.0,15.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
