# 1. Import Libraries

In [1]:
import pandas as pd
import numpy as np

In [2]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# 2. Read Dataset

In [3]:
df = pd.read_csv('../data/input/input_dataset.csv')

print('Before dropping rows with NA values \t: ', df.shape)
df = df.dropna(axis=0).reset_index(drop=True)
print('After dropping rows with NA values \t: ', df.shape)

Before dropping rows with NA values 	:  (3297, 31)
After dropping rows with NA values 	:  (3297, 31)


In [4]:
df.Result.value_counts()

W    1496
L    1029
D     772
Name: Result, dtype: int64

In [5]:
df.head()

Unnamed: 0,Referee,Day Of Week,Round,Days,Shots,ShotsOT,Corners,Fouls,YCards,RCards,GoalsScored,GoalsConceded,GoalsDiff,HTGoalsScored,HTGoalsConceded,HTGoalsDiff,Points,WinPercent,WinStreak,UnbPercent,UnbStreak,Def,Mid,Att,Ovr,LastSeasonRank,PromotedMatchup,WinnerOdd,DrawOdd,Venue,Result
0,L Probert,Sat,1.0,-4,10.4,6.6,3.8,-1.8,0.0,0.0,0.8,0.4,2.0,0.0,0.4,-2.0,0.4,20.0,0.0,0.0,1.0,7,8,8,7.67,-12,0,-9.71,5.5,Etihad Stadium,W
1,M Halsey,Sat,1.0,1,-1.0,-0.2,0.0,-1.8,0.4,0.0,0.2,0.0,1.0,0.4,0.2,1.0,-0.2,0.0,0.0,-20.0,-1.0,1,1,-2,0.0,1,0,-2.25,3.3,Bet365 Stadium,D
2,L Mason,Sat,1.0,0,6.8,5.0,1.8,-0.2,0.4,0.2,1.6,0.6,5.0,1.4,1.0,2.0,0.4,0.0,0.0,40.0,7.0,3,0,-1,0.67,-6,0,-6.56,4.2,Goodison Park,W
3,L Probert,Sun,1.0,0,3.4,2.2,2.0,0.2,-0.6,0.0,1.8,0.6,6.0,0.4,0.4,0.0,1.0,20.0,0.0,60.0,5.0,1,2,1,1.34,-4,0,-3.58,3.75,Craven Cottage,L
4,P Dowd,Sun,1.0,-1,-2.4,-1.8,-0.6,0.6,-1.0,0.0,-0.4,-1.2,4.0,0.2,0.0,1.0,1.0,20.0,0.0,60.0,6.0,-1,-3,-5,-3.0,-1,0,-1.5,3.3,Carrow Road,W


# 3. Splitting to Train and Test Set

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.decomposition import PCA

## 3.1 Data Split 1

- Target Variable: Draw & Non-Draw
- Sampling: Stratified

In [7]:
# separate draws and non_draws into draw_df and non_draw_df
draw_df = df[df['Result'] == 'D']
non_draw_df = df[(df['Result'] == 'W') | (df['Result'] == 'L')]
non_draw_df['Result'] = non_draw_df['Result'].map({"W": "ND", "L": "ND"})

print(draw_df.Result.value_counts())
print(non_draw_df.Result.value_counts())

D    772
Name: Result, dtype: int64
ND    2525
Name: Result, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  non_draw_df['Result'] = non_draw_df['Result'].map({"W": "ND", "L": "ND"})


In [8]:
# Setting up 5 fold_df
splits = [pd.DataFrame(columns=list(df.columns)) for i in range(5)]

In [9]:
# Setting up draw_df

# shuffle draw_df
draw_df = draw_df.sample(frac = 1, random_state=0)

split_sizes = [155, 155, 154, 154, 154]
for i in range(5):
    # copy section of draw_df to fold_df
    splits[i] = pd.concat([splits[i], draw_df.iloc[:split_sizes[i], :]])
    draw_df = draw_df.drop(draw_df.index[:split_sizes[i]])

In [10]:
# Setting up non_draw_df

# shuffle non_draw df
non_draw_df = non_draw_df.sample(frac=1, random_state=0)

split_sizes = [505, 505, 505, 505, 505]
for i in range(5):
    # copy section of non_draw_df to fold_df
    splits[i] = pd.concat([splits[i], non_draw_df.iloc[:split_sizes[i], :]])
    non_draw_df = non_draw_df.drop(non_draw_df.index[:split_sizes[i]])

In [11]:
# save to csv files
for i in range(5):
    # prepare test set
    test_df = splits[i]
    test_df = test_df.sample(frac=1, random_state=0).reset_index(drop=True) # randomly shuffle test_df
    test_df.to_csv(f'data-split-1/fold-{i+1}/test.csv', index=False) # save to test.csv
    
    # prepare train set
    train_df = pd.DataFrame(columns=list(df.columns))
    for j in range(5):
        if (j != i):
            train_df = pd.concat([train_df, splits[j]])
    
    train_df = train_df.sample(frac=1, random_state=0).reset_index(drop=True) # randomly shuffle train_df
    train_df.to_csv(f'data-split-1/fold-{i+1}/train.csv', index=False) # save to train.csv

## 3.2 Data Split 2

- Target Variable: Draw & Non-Draw
- Sampling: Balanced

In [12]:
# separate draws and non_draws into draw_df and non_draw_df
draw_df = df[df['Result'] == 'D']
non_draw_df = df[(df['Result'] == 'W') | (df['Result'] == 'L')]
non_draw_df['Result'] = non_draw_df['Result'].map({"W": "ND", "L": "ND"})

print(draw_df.Result.value_counts())
print(non_draw_df.Result.value_counts())

D    772
Name: Result, dtype: int64
ND    2525
Name: Result, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  non_draw_df['Result'] = non_draw_df['Result'].map({"W": "ND", "L": "ND"})


In [13]:
d_range = [(0, 618), (38, 656), (76, 694), (114, 732), (154, 772)]
nd_range = [(0, 618), (477, 1095), (954, 1572), (1431, 2049), (1907, 2525)]

for i in range(5):
    test_df = pd.DataFrame(columns=list(df.columns))
    train_df = pd.DataFrame(columns=list(df.columns))
    
    # prepare d_df and nd_df
    d_df = draw_df.sample(frac = 1, random_state=0)
    nd_df = non_draw_df.sample(frac=1, random_state=0)
    
    # copy section of d_df to train set
    d_start = d_range[i][0]
    d_end = d_range[i][1]
    train_df = pd.concat([train_df, d_df.iloc[d_start:d_end, :]])
    d_df = d_df.drop(d_df.index[d_start:d_end])
    
    # copy section of nd_df to train set
    nd_start = nd_range[i][0]
    nd_end = nd_range[i][1]
    train_df = pd.concat([train_df, nd_df.iloc[nd_start:nd_end, :]])
    nd_df = nd_df.drop(nd_df.index[nd_start:nd_end])
    
    # prepare test set
    test_df = pd.concat([test_df, d_df])
    test_df = pd.concat([test_df, nd_df])
    
    # save to csv files
    train_df = train_df.sample(frac=1, random_state=0).reset_index(drop=True) # randomly shuffle train_df
    train_df.to_csv(f'data-split-2/fold-{i+1}/train.csv', index=False)
    
    test_df = test_df.sample(frac=1, random_state=0).reset_index(drop=True) # randomly shuffle test_df
    test_df.to_csv(f'data-split-2/fold-{i+1}/test.csv', index=False)

## 3.3 Data Split 3

- Target Variable: Win, Draw, Lose
- Sampling: Stratified

In [7]:
# separate wins, draws, lose into win_df, draw_df, lose_df
win_df = df[df['Result'] == 'W']
draw_df = df[df['Result'] == 'D']
lose_df = df[df['Result'] == 'L']

print(win_df.Result.value_counts())
print(draw_df.Result.value_counts())
print(lose_df.Result.value_counts())

W    1496
Name: Result, dtype: int64
D    772
Name: Result, dtype: int64
L    1029
Name: Result, dtype: int64


In [8]:
# Setting up 5 fold_df
splits = [pd.DataFrame(columns=list(df.columns)) for i in range(5)]

In [9]:
# Setting up win_df
win_df = win_df.sample(frac = 1, random_state=0)

split_sizes = [299, 299, 299, 299, 300]

for i in range(5):
    # copy section of win_df to fold_df
    splits[i] = pd.concat([splits[i], win_df.iloc[:split_sizes[i], :]])
    win_df = win_df.drop(win_df.index[:split_sizes[i]])

In [10]:
# Setting up draw_df
draw_df = draw_df.sample(frac = 1, random_state=0)

split_sizes = [155, 155, 154, 154, 154]

for i in range(5):
    # copy section of draw_df to fold_df
    splits[i] = pd.concat([splits[i], draw_df.iloc[:split_sizes[i], :]])
    draw_df = draw_df.drop(draw_df.index[:split_sizes[i]])

In [11]:
# Setting up lose_df
lose_df = lose_df.sample(frac = 1, random_state=0)

split_sizes = [206, 206, 206, 206, 205]

for i in range(5):
    # copy section of lose_df to fold_df
    splits[i] = pd.concat([splits[i], lose_df.iloc[:split_sizes[i], :]])
    lose_df = lose_df.drop(lose_df.index[:split_sizes[i]])

In [12]:
# save to csv files
for i in range(5):
    # prepare test set
    test_df = splits[i]
    test_df = test_df.sample(frac=1, random_state=0).reset_index(drop=True) # randomly shuffle test_df
    test_df.to_csv(f'data-split-3/fold-{i+1}/test.csv', index=False)
    
    # prepare train set
    train_df = pd.DataFrame(columns=list(df.columns))
    for j in range(5):
        if (j != i):
            train_df = pd.concat([train_df, splits[j]])
    
    train_df = train_df.sample(frac=1, random_state=0).reset_index(drop=True) # randomly shuffle train_df
    train_df.to_csv(f'data-split-3/fold-{i+1}/train.csv', index=False)

## 3.4 Data Split 4

- Target Variable: Win, Draw, Lose
- Sampling: Balanced

In [20]:
# separate wins, draws, lose into win_df, draw_df, lose_df
win_df = df[df['Result'] == 'W']
draw_df = df[df['Result'] == 'D']
lose_df = df[df['Result'] == 'L']

print(win_df.Result.value_counts())
print(draw_df.Result.value_counts())
print(lose_df.Result.value_counts())

W    1496
Name: Result, dtype: int64
D    772
Name: Result, dtype: int64
L    1029
Name: Result, dtype: int64


In [21]:
w_range = [(0, 618), (219, 837), (438, 1056), (657, 1275), (878, 1496)]
d_range = [(0, 618), (38, 656), (76, 694), (114, 732), (154, 772)]
l_range = [(0, 618), (103, 721), (206, 824), (309, 927), (411, 1029)]

for i in range(5):
    test_df = pd.DataFrame(columns=list(df.columns))
    train_df = pd.DataFrame(columns=list(df.columns))
    
    # prepare d_df and nd_df
    w_df = win_df.sample(frac = 1, random_state=0)
    d_df = draw_df.sample(frac=1, random_state=0)
    l_df = lose_df.sample(frac=1, random_state=0)
    
    # copy section of w_df to train set
    w_start = w_range[i][0]
    w_end = w_range[i][1]
    train_df = pd.concat([train_df, w_df.iloc[w_start:w_end, :]])
    w_df = w_df.drop(w_df.index[w_start:w_end])
    
    # copy section of d_df to train set
    d_start = d_range[i][0]
    d_end = d_range[i][1]
    train_df = pd.concat([train_df, d_df.iloc[d_start:d_end, :]])
    d_df = d_df.drop(d_df.index[d_start:d_end])
    
    # copy section of l_df to train set
    l_start = l_range[i][0]
    l_end = l_range[i][1]
    train_df = pd.concat([train_df, l_df.iloc[l_start:l_end, :]])
    l_df = l_df.drop(l_df.index[l_start:l_end])
    
    # prepare test set
    test_df = pd.concat([test_df, w_df])
    test_df = pd.concat([test_df, d_df])
    test_df = pd.concat([test_df, l_df])
    
    # save to csv files
    train_df = train_df.sample(frac=1, random_state=0).reset_index(drop=True) # randomly shuffle train_df
    train_df.to_csv(f'data-split-4/fold-{i+1}/train.csv', index=False)
    
    test_df = test_df.sample(frac=1, random_state=0).reset_index(drop=True) # randomly shuffle test_df
    test_df.to_csv(f'data-split-4/fold-{i+1}/test.csv', index=False)

## 3.5 Data Split 5

- Target Variable: Draw, Lose
- Sampling: Stratified

In [7]:
# separate draws, lose into draw_df, lose_df
draw_df = df[df['Result'] == 'D']
lose_df = df[df['Result'] == 'L']

print(draw_df.Result.value_counts())
print(lose_df.Result.value_counts())

D    772
Name: Result, dtype: int64
L    1029
Name: Result, dtype: int64


In [8]:
# Setting up 5 fold_df
splits = [pd.DataFrame(columns=list(df.columns)) for i in range(5)]

In [9]:
# Setting up draw_df
draw_df = draw_df.sample(frac = 1, random_state=0)

split_sizes = [155,155,154,154,154]

for i in range(5):
    # copy section of draw_df to fold_df
    splits[i] = pd.concat([splits[i], draw_df.iloc[:split_sizes[i], :]])
    draw_df = draw_df.drop(draw_df.index[:split_sizes[i]])

In [10]:
# Setting up lose_df
lose_df = lose_df.sample(frac = 1, random_state=0)

split_sizes = [206,206,206,206,205]

for i in range(5):
    # copy section of lose_df to fold_df
    splits[i] = pd.concat([splits[i], lose_df.iloc[:split_sizes[i], :]])
    lose_df = lose_df.drop(lose_df.index[:split_sizes[i]])

In [11]:
# save to csv files
for i in range(5):
    # prepare test set
    test_df = splits[i]
    test_df = test_df.sample(frac=1, random_state=0).reset_index(drop=True) # randomly shuffle test_df
    test_df.to_csv(f'data-split-5/fold-{i+1}/test.csv', index=False)
    
    # prepare train set
    train_df = pd.DataFrame(columns=list(df.columns))
    for j in range(5):
        if (j != i):
            train_df = pd.concat([train_df, splits[j]])
    
    train_df = train_df.sample(frac=1, random_state=0).reset_index(drop=True) # randomly shuffle train_df
    train_df.to_csv(f'data-split-5/fold-{i+1}/train.csv', index=False)

## 3.6 Data Split 6

- Target Variable: Draw & Lose
- Sampling: Balanced

In [27]:
# separate draws, lose into draw_df, lose_df
draw_df = df[df['Result'] == 'D']
lose_df = df[df['Result'] == 'L']

print(draw_df.Result.value_counts())
print(lose_df.Result.value_counts())

D    772
Name: Result, dtype: int64
L    1029
Name: Result, dtype: int64


In [28]:
d_range = [(0, 618), (38, 656), (76, 694), (114, 732), (154, 772)]
l_range = [(0, 618), (103, 721), (206, 824), (309, 927), (411, 1029)]

for i in range(5):
    test_df = pd.DataFrame(columns=list(df.columns))
    train_df = pd.DataFrame(columns=list(df.columns))
    
    # prepare d_df and nd_df
    d_df = draw_df.sample(frac=1, random_state=0)
    l_df = lose_df.sample(frac=1, random_state=0)
    
    # copy section of d_df to train set
    d_start = d_range[i][0]
    d_end = d_range[i][1]
    train_df = pd.concat([train_df, d_df.iloc[d_start:d_end, :]])
    d_df = d_df.drop(d_df.index[d_start:d_end])
    
    # copy section of l_df to train set
    l_start = l_range[i][0]
    l_end = l_range[i][1]
    train_df = pd.concat([train_df, l_df.iloc[l_start:l_end, :]])
    l_df = l_df.drop(l_df.index[l_start:l_end])
    
    # prepare test set
    test_df = pd.concat([test_df, d_df])
    test_df = pd.concat([test_df, l_df])
    
    # save to csv files
    train_df = train_df.sample(frac=1, random_state=0).reset_index(drop=True) # randomly shuffle train_df
    train_df.to_csv(f'data-split-6/fold-{i+1}/train.csv', index=False)
    
    test_df = test_df.sample(frac=1, random_state=0).reset_index(drop=True) # randomly shuffle test_df
    test_df.to_csv(f'data-split-6/fold-{i+1}/test.csv', index=False)

## 3.7 Data Split 7

- Target Variable: Win & Non-Win
- Sampling: Stratified

In [29]:
# separate wins and non_wins into win_df and non_win_df
win_df = df[df['Result'] == 'W']
non_win_df = df[(df['Result'] == 'D') | (df['Result'] == 'L')]
non_win_df['Result'] = non_win_df['Result'].map({"D": "NW", "L": "NW"})

print(win_df.Result.value_counts())
print(non_win_df.Result.value_counts())

W    1496
Name: Result, dtype: int64
NW    1801
Name: Result, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  non_win_df['Result'] = non_win_df['Result'].map({"D": "NW", "L": "NW"})


In [30]:
# Setting up 5 fold_df
splits = [pd.DataFrame(columns=list(df.columns)) for i in range(5)]

In [31]:
# Setting up win_df

# shuffle win_df
win_df = win_df.sample(frac = 1, random_state=0)

split_sizes = [299,299,299,299,300]
for i in range(5):
    # copy section of win_df to fold_df
    splits[i] = pd.concat([splits[i], win_df.iloc[:split_sizes[i], :]])
    win_df = win_df.drop(win_df.index[:split_sizes[i]])

In [32]:
# Setting up non_win_df

# shuffle non_win_df
non_win_df = non_win_df.sample(frac=1, random_state=0)

split_sizes = [361,360,360,360,360]
for i in range(5):
    # copy section of non_win_df to fold_df
    splits[i] = pd.concat([splits[i], non_win_df.iloc[:split_sizes[i], :]])
    non_win_df = non_win_df.drop(non_win_df.index[:split_sizes[i]])

In [33]:
# save to csv files
for i in range(5):
    # prepare test set
    test_df = splits[i]
    test_df = test_df.sample(frac=1, random_state=0).reset_index(drop=True) # randomly shuffle test_df
    test_df.to_csv(f'data-split-7/fold-{i+1}/test.csv', index=False) # save to test.csv
    
    # prepare train set
    train_df = pd.DataFrame(columns=list(df.columns))
    for j in range(5):
        if (j != i):
            train_df = pd.concat([train_df, splits[j]])
    
    train_df = train_df.sample(frac=1, random_state=0).reset_index(drop=True) # randomly shuffle train_df
    train_df.to_csv(f'data-split-7/fold-{i+1}/train.csv', index=False) # save to train.csv

## 3.8 Data Split 8

- Target Variable: Win & Non-Win
- Sampling: Balanced

In [7]:
# separate wins and non_wins into win_df and non_win_df
win_df = df[df['Result'] == 'W']
non_win_df = df[(df['Result'] == 'D') | (df['Result'] == 'L')]
non_win_df['Result'] = non_win_df['Result'].map({"D": "NW", "L": "NW"})

print(win_df.Result.value_counts())
print(non_win_df.Result.value_counts())

W    1496
Name: Result, dtype: int64
NW    1801
Name: Result, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  non_win_df['Result'] = non_win_df['Result'].map({"D": "NW", "L": "NW"})


In [8]:
w_range = [(0, 1196), (75, 1271), (150, 1346), (225, 1421), (300, 1496)]
nw_range = [(0, 1196), (151, 1347), (302, 1498), (453, 1649), (605, 1801)]

for i in range(5):
    test_df = pd.DataFrame(columns=list(df.columns))
    train_df = pd.DataFrame(columns=list(df.columns))
    
    # prepare w_df and nw_df
    w_df = win_df.sample(frac = 1, random_state=0)
    nw_df = non_win_df.sample(frac=1, random_state=0)
    
    # copy section of d_df to train set
    w_start = w_range[i][0]
    w_end = w_range[i][1]
    train_df = pd.concat([train_df, w_df.iloc[w_start:w_end, :]])
    w_df = w_df.drop(w_df.index[w_start:w_end])
    
    # copy section of nd_df to train set
    nw_start = nw_range[i][0]
    nw_end = nw_range[i][1]
    train_df = pd.concat([train_df, nw_df.iloc[nw_start:nw_end, :]])
    nw_df = nw_df.drop(nw_df.index[nw_start:nw_end])
    
    # prepare test set
    test_df = pd.concat([test_df, w_df])
    test_df = pd.concat([test_df, nw_df])
    
    # save to csv files
    train_df = train_df.sample(frac=1, random_state=0).reset_index(drop=True) # randomly shuffle train_df
    train_df.to_csv(f'data-split-8/fold-{i+1}/train.csv', index=False)
    
    test_df = test_df.sample(frac=1, random_state=0).reset_index(drop=True) # randomly shuffle test_df
    test_df.to_csv(f'data-split-8/fold-{i+1}/test.csv', index=False)