In [1]:
import pandas as pd
import numpy as np

df_start = pd.read_csv("EPL_Set.csv")

df_start.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9664 entries, 0 to 9663
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Div       9664 non-null   object 
 1   Date      9664 non-null   object 
 2   HomeTeam  9664 non-null   object 
 3   AwayTeam  9664 non-null   object 
 4   FTHG      9664 non-null   int64  
 5   FTAG      9664 non-null   int64  
 6   FTR       9664 non-null   object 
 7   HTHG      8740 non-null   float64
 8   HTAG      8740 non-null   float64
 9   HTR       8740 non-null   object 
 10  Season    9664 non-null   object 
dtypes: float64(2), int64(2), object(7)
memory usage: 830.6+ KB


In [2]:
df_start.shape

(9664, 11)

In [3]:
df_start.tail()

Unnamed: 0,Div,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,Season
9659,E0,13/05/18,Newcastle,Chelsea,3,0,H,1.0,0.0,H,2017-18
9660,E0,13/05/18,Southampton,Man City,0,1,A,0.0,0.0,D,2017-18
9661,E0,13/05/18,Swansea,Stoke,1,2,A,1.0,2.0,A,2017-18
9662,E0,13/05/18,Tottenham,Leicester,5,4,H,1.0,2.0,A,2017-18
9663,E0,13/05/18,West Ham,Everton,3,1,H,1.0,0.0,H,2017-18


# DATA SPLITTING

In [4]:
from sklearn.model_selection import train_test_split

X = df_start.iloc[:, : -1]
y = df_start.iloc[:, -1]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

print("Dimensi X_train", X_train.shape)
print("Dimensi X_test", X_test.shape)
print("Dimensi y_train", y_train.shape)
print("Dimensi y_test", y_test.shape)

Dimensi X_train (6764, 10)
Dimensi X_test (2900, 10)
Dimensi y_train (6764,)
Dimensi y_test (2900,)


# DATA TRANSFORMING

## NORMALISASI

In [5]:
from sklearn.preprocessing import MinMaxScaler
mms = MinMaxScaler()

data_normalisasi = df_start[["FTHG"]]
data_normalisasi.columns = ["FTHG"]

normalisasi = mms.fit_transform(data_normalisasi[["FTHG"]])

df_normalisasi = pd.DataFrame(normalisasi)
df_normalisasi.columns =["FTHG_Normalisasi"]

data_normalisasi = data_normalisasi.join(df_normalisasi["FTHG_Normalisasi"]).reset_index()
data_normalisasi = data_normalisasi.drop("index", axis=1)

data_normalisasi.tail()

Unnamed: 0,FTHG,FTHG_Normalisasi
9659,3,0.333333
9660,0,0.0
9661,1,0.111111
9662,5,0.555556
9663,3,0.333333


## STANDARISASI

In [6]:
from sklearn.preprocessing import StandardScaler
ss = StandardScaler()

df_start2 = df_start.select_dtypes(include='number')

std = ss.fit_transform(df_start2)
df_std = pd.DataFrame(std)
df_std.columns = ["FTHG_std", "FTAG_std", "HTHG_std", "HTAG_std"]

print("Perbandingan sebelum : ")
np.std(df_start2)


Perbandingan sebelum : 


FTHG    1.301604
FTAG    1.121609
HTHG    0.833533
HTAG    0.710483
dtype: float64

In [7]:
print("Perbandingan sesudah : ")
np.std(df_std)

Perbandingan sesudah : 


FTHG_std    1.0
FTAG_std    1.0
HTHG_std    1.0
HTAG_std    1.0
dtype: float64

# DATA CLEANING
## MENANGANI DATA NULL

In [8]:
df_start = pd.read_csv("EPL_Set.csv")

df_start.isna().sum()

Div           0
Date          0
HomeTeam      0
AwayTeam      0
FTHG          0
FTAG          0
FTR           0
HTHG        924
HTAG        924
HTR         924
Season        0
dtype: int64

### Substitusi nilai NULL

In [9]:
df_start['HTHG'] = df_start['HTHG'].fillna(df_start['HTHG'].median())

df_start.isna().sum()

Div           0
Date          0
HomeTeam      0
AwayTeam      0
FTHG          0
FTAG          0
FTR           0
HTHG          0
HTAG        924
HTR         924
Season        0
dtype: int64

In [10]:
from sklearn.impute import SimpleImputer
imputer_mean = SimpleImputer(strategy="mean")
imputer_modus = SimpleImputer(strategy="most_frequent")

df_start['HTAG'] = imputer_mean.fit_transform(df_start[['HTAG']])
df_start['HTR'] = imputer_modus.fit_transform(df_start[['HTR']])

df_start.isna().sum()

Div         0
Date        0
HomeTeam    0
AwayTeam    0
FTHG        0
FTAG        0
FTR         0
HTHG        0
HTAG        0
HTR         0
Season      0
dtype: int64

## MENANGANI NILAI DUPLIKAT

In [11]:
df_start[df_start.duplicated()]

print("Jumlah Data Duplikat : ")
df_start.duplicated().sum()

Jumlah Data Duplikat : 


0

### Karena tidak terdapat data duplikat kita akan membuatnya

In [12]:
df_duplikat = pd.concat([df_start]*2, ignore_index=True)
df_duplikat.duplicated()

0        False
1        False
2        False
3        False
4        False
         ...  
19323     True
19324     True
19325     True
19326     True
19327     True
Length: 19328, dtype: bool

In [13]:
print("Jumlah Data Duplikat : ")
df_duplikat.duplicated().sum()

Jumlah Data Duplikat : 


9664

In [14]:
df_duplikat.drop_duplicates(inplace=True)

print("Jumlah Data Duplikat : ")
df_duplikat.duplicated().sum()

Jumlah Data Duplikat : 


0

# GANTI TIPE DATA

In [15]:
print("Sebelum : ", df_start["FTHG"].dtypes)
df_ubah = df_start[["FTHG"]].astype(float)
print("Sesudah : ", df_ubah["FTHG"].dtypes)

Sebelum :  int64
Sesudah :  float64


# ONE-HOT ENCODING

In [16]:
from sklearn.preprocessing import OneHotEncoder
oh_encoder = OneHotEncoder(sparse=False)

onehot = oh_encoder.fit_transform(df_start[["HTR"]])
df_onehot = pd.DataFrame(onehot)
df_last = df_start.join(df_onehot)

df_last.tail()

Unnamed: 0,Div,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,Season,0,1,2
9659,E0,13/05/18,Newcastle,Chelsea,3,0,H,1.0,0.0,H,2017-18,0.0,0.0,1.0
9660,E0,13/05/18,Southampton,Man City,0,1,A,0.0,0.0,D,2017-18,0.0,1.0,0.0
9661,E0,13/05/18,Swansea,Stoke,1,2,A,1.0,2.0,A,2017-18,1.0,0.0,0.0
9662,E0,13/05/18,Tottenham,Leicester,5,4,H,1.0,2.0,A,2017-18,1.0,0.0,0.0
9663,E0,13/05/18,West Ham,Everton,3,1,H,1.0,0.0,H,2017-18,0.0,0.0,1.0
