https://www.pluralsight.com/guides/classification-keras

In [1]:
# Import required libraries
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import sklearn
# Import necessary modules
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt
# Keras specific
import keras
from keras.models import Sequential
from keras.layers import Dense
from keras.utils import to_categorical 

In [2]:
FOLDER = '../data/'
FILENAME = 'diabetes.csv'

df = pd.read_csv(f'{FOLDER}{FILENAME}', delimiter=',')
df.shape

(768, 9)

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [4]:
df.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [5]:
target_column = ['Outcome']
predictors = list(set(list(df.columns))-set(target_column))
df[predictors] = df[predictors]/df[predictors].max()
df.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,0.22618,0.60751,0.566438,0.207439,0.094326,0.47679,0.19499,0.410381,0.348958
std,0.19821,0.160666,0.158654,0.161134,0.136222,0.117499,0.136913,0.145188,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.032231,0.259259,0.0
25%,0.058824,0.497487,0.508197,0.0,0.0,0.406855,0.100723,0.296296,0.0
50%,0.176471,0.58794,0.590164,0.232323,0.036052,0.4769,0.153926,0.358025,0.0
75%,0.352941,0.704774,0.655738,0.323232,0.150414,0.545455,0.258781,0.506173,1.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [6]:
X = df[predictors].values
y = df[target_column].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=40)
print(X_train.shape); print(X_test.shape)

(537, 8)
(231, 8)


In [7]:
y_train = to_categorical(y_train)
y_test = to_categorical(y_test)



In [8]:
model = Sequential()
model.add(Dense(500, activation='relu', input_dim=8))
model.add(Dense(100, activation='relu'))
model.add(Dense(50, activation='relu'))
model.add(Dense(2, activation='softmax'))

# Compile the model
model.compile(optimizer='adam', 
              loss='categorical_crossentropy', 
              metrics=['accuracy'])

2022-05-24 13:13:04.779950: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [9]:
model.fit(X_train, y_train, epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7feda2e94790>

In [10]:
pred_train = model.predict(X_train)
scores = model.evaluate(X_train, y_train, verbose=0)
print(f'Accuracy on training {scores[1]}. Error is {1-scores[1]}')

pred_train = model.predict(X_test)
scores = model.evaluate(X_test, y_test, verbose=0)
print(f'Accuracy on test data {scores[1]}. Error is {1-scores[1]}')

Accuracy on training 0.80633145570755. Error is 0.19366854429244995
Accuracy on test data 0.7662337422370911. Error is 0.23376625776290894


In [11]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 500)               4500      
                                                                 
 dense_1 (Dense)             (None, 100)               50100     
                                                                 
 dense_2 (Dense)             (None, 50)                5050      
                                                                 
 dense_3 (Dense)             (None, 2)                 102       
                                                                 
Total params: 59,752
Trainable params: 59,752
Non-trainable params: 0
_________________________________________________________________


In [12]:
from keras.utils.vis_utils import plot_model

In [13]:
import graphviz
import pydot

plot_model(model,  show_shapes=True, show_layer_names=True)

You must install pydot (`pip install pydot`) and install graphviz (see instructions at https://graphviz.gitlab.io/download/) for plot_model/model_to_dot to work.


!pip install pydot

!pip install graphviz

In [67]:
df = pd.read_csv(f'{FOLDER}game_odd_rank_fifa.csv')
df.shape

(4219, 22)

In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4219 entries, 0 to 4218
Data columns (total 22 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   oddsportal           4219 non-null   object 
 1   matchday             4219 non-null   object 
 2   home_team            4219 non-null   object 
 3   away_team            4219 non-null   object 
 4   correct_score        4219 non-null   object 
 5   1N2_1                4219 non-null   float64
 6   1N2_N                4219 non-null   float64
 7   1N2_2                4219 non-null   float64
 8   betexplorer          4219 non-null   object 
 9   matchdate            4219 non-null   object 
 10  period               4219 non-null   object 
 11  location             4219 non-null   object 
 12  season               4219 non-null   int64  
 13  competition          4219 non-null   object 
 14  bet_won              4219 non-null   object 
 15  target               4219 non-null   f

In [69]:
df.competition.value_counts()

friendly-international          3426
euro                             508
world-cup                        257
concacaf-nations-league-2019      28
Name: competition, dtype: int64

In [70]:
data_cols = ['1N2_1','1N2_N','1N2_2','season','home_team_rank_FIFA','away_team_rank_FIFA','target','set']
df.loc[df.query('competition.str.contains("world-cup") and season == 2018', engine='python').index,'set']=1
df.set.fillna(0, inplace=True)
df = df[data_cols].copy()

# normalize the input dataset
target_column = ['target']
predictors = list(set(list(df.columns))-set(target_column))
df[predictors] = df[predictors]/df[predictors].max()
df.describe()

Unnamed: 0,1N2_1,1N2_N,1N2_2,season,home_team_rank_FIFA,away_team_rank_FIFA,target,set
count,4219.0,4219.0,4219.0,4219.0,4219.0,4219.0,4219.0,4219.0
mean,0.034896,0.098981,0.049557,0.996462,0.297752,0.324652,0.80256,0.015169
std,0.054354,0.066705,0.065775,0.002332,0.228665,0.2344,0.850143,0.122241
min,0.011181,0.044475,0.008315,0.991089,0.004759,0.004759,0.0,0.0
25%,0.016883,0.073361,0.019957,0.994554,0.10943,0.128892,0.0,0.0
50%,0.022809,0.078634,0.029935,0.996535,0.250446,0.279595,1.0,0.0
75%,0.032536,0.097432,0.053426,0.998515,0.441801,0.474321,2.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,2.0,1.0


In [71]:
X_train = df.query('set==0')[predictors].values
y_train = df.query('set==0')[target_column].values

X_test = df.query('set==1')[predictors].values
y_test = df.query('set==1')[target_column].values

print(X_train.shape); print(X_test.shape)

y_train = to_categorical(y_train)
y_test = to_categorical(y_test)


(4155, 7)
(64, 7)


In [87]:
model = Sequential()
model.add(Dense(500, activation='relu', kernel_initializer='normal', input_dim=7))
model.add(Dense(100, kernel_initializer='normal', activation='relu'))
model.add(Dense(50,kernel_initializer='normal', activation='relu'))
model.add(Dense(3, activation='sigmoid'))


"""
model = Sequential()
model.add(Dense(64, activation='relu', input_dim=7))
model.add(Dense(32, activation='relu'))
model.add(Dense(16, activation='relu'))
model.add(Dense(8, activation='relu'))
model.add(Dense(3, activation='softmax'))

"""


# Compile the model
model.compile(optimizer= 
                        #keras.optimizers.Adam(lr=0.001, beta_1=0.9, beta_2=0.999,
                        #    epsilon=None, decay=0.0, amsgrad=False),   
                        #keras.optimizers.SGD(lr=0.01, momentum=0.0, decay=0.0,nesterov=False), 
                        'adam', 
              loss='categorical_crossentropy', 
              metrics=['accuracy'])

model.fit(X_train, y_train, epochs=3, batch_size=32, verbose=0)

pred_train = model.predict(X_train)
scores = model.evaluate(X_train, y_train, verbose=0)
print(f'Accuracy on training {scores[1]}. Error is {1-scores[1]}')

pred_train = model.predict(X_test)
scores = model.evaluate(X_test, y_test, verbose=0)
print(f'Accuracy on test data {scores[1]}. Error is {1-scores[1]}')

Accuracy on training 0.5742478966712952. Error is 0.42575210332870483
Accuracy on test data 0.625. Error is 0.375


In [88]:

pred = model.predict(X_test)
bet=[]
for i in range(0,20):
    max_value= max(pred[i])
    max_index = pred[i].tolist().index(max_value)
    bet.append(max_index)



In [89]:
bet, pred[:20]

([2, 2, 2, 2, 0, 2, 2, 0, 2, 2, 2, 2, 2, 0, 2, 0, 2, 2, 0, 0],
 array([[0.39105406, 0.3831398 , 0.6033059 ],
        [0.40999997, 0.37364817, 0.5814457 ],
        [0.47690213, 0.37670645, 0.4927482 ],
        [0.3723271 , 0.38126913, 0.62560105],
        [0.5091024 , 0.38465336, 0.43806514],
        [0.19308363, 0.28539008, 0.84923583],
        [0.3270378 , 0.35496905, 0.6885232 ],
        [0.59908295, 0.36572278, 0.31316215],
        [0.4133009 , 0.37423033, 0.577288  ],
        [0.33645135, 0.36185965, 0.67449695],
        [0.42991713, 0.38340956, 0.5536659 ],
        [0.41013595, 0.370067  , 0.5841296 ],
        [0.3513771 , 0.36343822, 0.65783644],
        [0.6534503 , 0.347135  , 0.24896021],
        [0.4178929 , 0.39511985, 0.56113225],
        [0.5805729 , 0.37940904, 0.3345685 ],
        [0.4196472 , 0.3900519 , 0.5642143 ],
        [0.47000042, 0.3778634 , 0.5010997 ],
        [0.7378518 , 0.3033042 , 0.1584862 ],
        [0.63825285, 0.3471145 , 0.26657408]], dtype=float32))

In [90]:
y_test[:20]

array([[0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [1., 0., 0.]], dtype=float32)

In [23]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_4 (Dense)             (None, 500)               4000      
                                                                 
 dense_5 (Dense)             (None, 100)               50100     
                                                                 
 dense_6 (Dense)             (None, 50)                5050      
                                                                 
 dense_7 (Dense)             (None, 3)                 153       
                                                                 
Total params: 59,303
Trainable params: 59,303
Non-trainable params: 0
_________________________________________________________________
