# Sports Data Analysis using Deep Learning concepts

### USECASE 1 : Predicting Final Score of the match

### Importing needed libraries

In [None]:
import numpy as np 
import pandas as pd 

In [None]:
# Load the IPL deliveries CSV (presumably one row per ball bowled - confirm against the dataset).
deliveries_csv_path = '../input/ipl-cricket-dataset/deliveries.csv'
df = pd.read_csv(deliveries_csv_path)

In [None]:
# Preview the first five rows (five is the default for head()).
df.head()

### Factorizing all Categorical Variables

In [None]:
def _encode_in_place(frame, column):
    """Replace ``frame[column]`` with integer codes and return ``(codes, uniques)``.

    Thin wrapper around ``pd.factorize``; missing values receive the code -1.
    """
    codes, uniques = pd.factorize(frame[column])
    frame[column] = codes
    return codes, uniques


# Each call overwrites the text column in `df` with its integer codes and keeps
# the code array plus the code -> label lookup for later use.
batting_team_factorized, batting_team_categories = _encode_in_place(df, 'batting_team')
bowling_team_factorized, bowling_team_categories = _encode_in_place(df, 'bowling_team')
batsman_factorized, batsman_categories = _encode_in_place(df, 'batsman')
bowler_factorized, bowler_categories = _encode_in_place(df, 'bowler')
non_striker_factorized, non_striker_categories = _encode_in_place(df, 'non_striker')
# Deliveries without a dismissal have a missing value here and therefore become -1.
player_dismissed_factorized, player_dismissed_categories = _encode_in_place(df, 'player_dismissed')

### Grouping the data by match id and innings to find out total runs scored in each innings and merging it with original dataframe

In [None]:
# Keys that identify one innings of one match.
innings_keys = ['match_id', 'inning']

# Sum only the numeric columns. A bare .sum() also tries to aggregate the
# remaining text columns, which is slow and whose outcome depends on the
# pandas version.
df_Sum = df.groupby(innings_keys).sum(numeric_only=True)

# Final innings total, indexed by (match_id, inning).
df_TotalRuns = df_Sum[['total_runs']]

# Attach the innings total to every delivery. After the merge, `total_runs_x`
# is the runs scored off that delivery and `total_runs_y` is the innings total.
df2 = pd.merge(df, df_TotalRuns, on=innings_keys)

df2.head()

### Defining 4 new arrays (current score, balls bowled, wickets and final score). For every delivery we compute the running score and the wickets fallen so far in the innings being played, the number of balls bowled, and the total runs scored in that innings

In [None]:
# Per-delivery features, computed with vectorised pandas operations instead of a
# row-by-row Python loop.
#
# Running totals restart for every (match_id, inning) pair. Comparing only the
# inning number (as a consecutive-row check would) fails to reset when one match
# ends and the next begins with the same inning number.
# Assumes the deliveries of an innings are stored contiguously and in bowling
# order in df2 - TODO confirm against the dataset.
innings_keys = [df2['match_id'], df2['inning']]

# Runs scored so far in the innings, including the current delivery.
current_score = (
    df2['total_runs_x']
    .groupby(innings_keys, sort=False)
    .cumsum()
    .to_numpy(dtype=float)
)

# `player_dismissed` was factorized earlier, so -1 marks "nobody dismissed".
wicket_fell = (df2['player_dismissed'] != -1).astype(int)
# Wickets fallen so far in the innings, including the current delivery.
wickets = (
    wicket_fell
    .groupby(innings_keys, sort=False)
    .cumsum()
    .to_numpy(dtype=float)
)

# Ball counter derived from the over and ball columns. NOTE(review): if `over`
# is 1-based in the dataset this is offset by a constant 6 - confirm.
balls_bowled = (df2['over'] * 6 + df2['ball']).to_numpy(dtype=float)

# Regression target: the innings total, repeated on every delivery of the innings.
final_score = df2['total_runs_y'].to_numpy(dtype=float)

### Here we build an array X consisting of the current score, balls bowled and wickets fallen; this is our training data, and the final score is our target variable. The training data is reshaped to 3D (samples, time steps, features) for the RNN model

In [None]:
# Feature matrix, one row per delivery: [current score, balls bowled, wickets].
X = np.column_stack((current_score, balls_bowled, wickets)).astype(np.float64)

# Sanity output: shapes, types and a (truncated) view of features and target.
for item in (X.shape, final_score.shape, type(X), type(final_score), X.shape, X, final_score):
    print(item)

# Recurrent layers expect (samples, time steps, features); each delivery is
# treated as a sequence of length one.
X = X[:, np.newaxis, :]
print(X.shape)

### For the sequential model we add a SimpleRNN layer and 2 dense layers, the last of which acts as the output layer. The model is compiled with mean squared error as the loss function and Adam as the optimizer. As it is a regression model, we also track MSE as the metric. (Without Regularization)

In [None]:
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Dropout, Input,LSTM,SimpleRNN
from tensorflow.keras.optimizers import SGD
import tensorflow as tf
import matplotlib.pyplot as plt

# Fix the TensorFlow seed so weight initialisation is repeatable.
tf.random.set_seed(0)

# Unregularized regressor: SimpleRNN -> Dense(256) -> Dense(1).
model = Sequential()
# return_sequences is left at its default (False) so the network output is
# (batch, 1). With return_sequences=True the output is (batch, 1, 1), and the
# MSE against the (batch,) target broadcasts every prediction against every
# target in the batch.
model.add(SimpleRNN(128, activation="relu", input_shape=(1, 3)))
model.add(Dense(256, activation='relu'))
# Single output unit; ReLU keeps the predicted score non-negative.
model.add(Dense(1, activation='relu'))
model.compile(loss='mse', optimizer='adam', metrics=['mse'])
model.summary()
# The model is fitted later with validation_split=0.33. Keras holds out the
# LAST 33% of the samples (taken before shuffling) for validation; it is not a
# random split.


### We fit the model below with a batch size of 128, 50 epochs and a validation split of 0.33, which uses the first 67% of the data for training and holds out the last 33% for validation.

In [None]:
from tensorflow.keras.utils import plot_model

# Draw the layer graph of the unregularized RNN; 'model.png' is plot_model's default target.
plot_model(model, to_file='model.png')

In [None]:
# Train for 50 epochs in mini-batches of 128, holding out 33% of the samples for validation.
model.fit(
    x=X,
    y=final_score,
    epochs=50,
    batch_size=128,
    validation_split=0.33,
)

### Here we plot the training and validation loss per epoch, followed by a scatter plot of the original values against the predicted values

In [None]:
# Per-epoch metrics recorded by the most recent fit() call on `model`.
history_dict = model.history.history
loss_value, val_loss_value = history_dict['loss'], history_dict['val_loss']
epochs = range(1, len(loss_value) + 1)

# Training (blue) versus validation (red) loss on one set of axes.
fig, ax = plt.subplots()
ax.plot(epochs, loss_value, 'b', label='Training Loss')
ax.plot(epochs, val_loss_value, 'r', label='Validation Loss')
ax.set_title('Training and Validation Loss')
ax.set_xlabel('Epochs')
ax.set_ylabel('Loss')
ax.legend()
plt.show()

In [None]:
import matplotlib.pyplot as plt

# Predict the final score for every delivery in the data set.
Y_pred = model.predict(X)

# Actual versus predicted innings totals. Predictions are flattened so the plot
# does not depend on the trailing dimensions of the model output.
fig, ax = plt.subplots()
ax.scatter(final_score, np.ravel(Y_pred))
ax.set_title('Actual vs Predicted Final Score')
ax.set_xlabel('Actual final score')
ax.set_ylabel('Predicted final score')
plt.show()

### Here we show predictions for dummy data created by us.
### Here [210,19*6+6,0] stands for 210 = current score, 19*6+6 = balls bowled, 0 = wickets fallen.
### So, as we can see, based on these values the final score predicted at the end of 20 overs would be 227.
### Similarly, we have predicted the final score for 2 other scenarios.

In [None]:
# Hand-made match situations: [current score, balls-bowled value, wickets fallen].
scenarios = np.array([
    [210, 19 * 6 + 6, 0],
    [150, 15 * 6 + 6, 5],
    [130, 12 * 6 + 6, 9],
])
# Insert the length-one time axis the recurrent layer expects.
dummyData = scenarios[:, np.newaxis, :]
pred = model.predict(dummyData)

print(pred)

### Simple RNN with Regularization

In [None]:
import matplotlib.pyplot as plt

# Same seed as the unregularized model so the two runs are comparable.
tf.random.set_seed(0)

# Regularized regressor: SimpleRNN -> Dropout(0.2) -> Dense(256) -> Dense(1).
model1 = Sequential()
# return_sequences is left at its default (False) so the network output is
# (batch, 1). With return_sequences=True the output is (batch, 1, 1), and the
# MSE against the (batch,) target broadcasts every prediction against every
# target in the batch.
model1.add(SimpleRNN(128, activation="relu", input_shape=(1, 3)))
# Randomly zero 20% of the recurrent outputs during training.
model1.add(Dropout(0.2))
model1.add(Dense(256, activation='relu'))
# Single output unit; ReLU keeps the predicted score non-negative.
model1.add(Dense(1, activation='relu'))
model1.compile(loss='mse', optimizer='adam', metrics=['mse'])
model1.summary()
# The model is fitted later with validation_split=0.33. Keras holds out the
# LAST 33% of the samples (taken before shuffling) for validation; it is not a
# random split.

In [None]:

# Draw the layer graph of the regularized RNN; 'model.png' is plot_model's default target.
plot_model(model1, to_file='model.png')

In [None]:
# Train the regularized RNN with the same settings as the unregularized one.
history = model1.fit(
    x=X,
    y=final_score,
    epochs=50,
    batch_size=128,
    validation_split=0.33,
)

In [None]:
# Per-epoch metrics recorded by the most recent fit() call on `model1`.
history_dict = model1.history.history
loss_value, val_loss_value = history_dict['loss'], history_dict['val_loss']
epochs = range(1, len(loss_value) + 1)

# Training (blue) versus validation (red) loss on one set of axes.
fig, ax = plt.subplots()
ax.plot(epochs, loss_value, 'b', label='Training Loss')
ax.plot(epochs, val_loss_value, 'r', label='Validation Loss')
ax.set_title('Training and Validation Loss')
ax.set_xlabel('Epochs')
ax.set_ylabel('Loss')
ax.legend()
plt.show()

In [None]:
import matplotlib.pyplot as plt

# Predict the final score for every delivery with the regularized model.
Y_pred = model1.predict(X)

# Actual versus predicted innings totals. Predictions are flattened so the plot
# does not depend on the trailing dimensions of the model output.
fig, ax = plt.subplots()
ax.scatter(final_score, np.ravel(Y_pred))
ax.set_title('Actual vs Predicted Final Score (regularized)')
ax.set_xlabel('Actual final score')
ax.set_ylabel('Predicted final score')
plt.show()

### Here [210,19*6+6,0] stands for 210 = current score, 19*6+6 = balls bowled, 0 = wickets fallen.
### Here we can see that with the regularized model the final prediction is lower than with the unregularized model

In [None]:
# Hand-made match situations: [current score, balls-bowled value, wickets fallen].
scenarios = np.array([
    [210, 19 * 6 + 6, 0],
    [150, 15 * 6 + 6, 5],
    [130, 12 * 6 + 6, 9],
])
# Insert the length-one time axis the recurrent layer expects.
dummyData = scenarios[:, np.newaxis, :]
pred = model1.predict(dummyData)

print(pred)

### USECASE 2 : Predicting Result of the Match

In [None]:
# Load the ODI match results CSV; note that this rebinds `df` from use case 1.
odi_results_csv_path = '../input/cricket-world-cup-2019-player-analysis/ODI_Match_Results.csv'
df = pd.read_csv(odi_results_csv_path)

### Removing Unwanted columns

In [None]:
# Discard columns that are not used as features or target.
unused_columns = ['Unnamed: 0', 'BR', 'Start Date']
df = df.drop(columns=unused_columns)

In [None]:
# Inspect the first 50 rows after dropping the unused columns.
df.head(n=50)

### Removing the records with no match results

In [None]:
# Result codes that do not describe a win or a loss.
no_result_codes = ['n/r', 'aban', 'tied', 'canc', '-']
# Keep only the rows whose result is not one of those codes.
df = df[~df['Result'].isin(no_result_codes)]

In [None]:
# Renumber the rows 0..n-1 after filtering, discarding the old index.
df = df.reset_index(drop=True)

### Cleaning the data 

In [None]:
# Strip decorative text so only the informative part of each value remains:
#   Match_ID   - remove the 'ODI #' text
#   Opposition - remove the 'v ' text
#   Bat        - remove the 'st' / 'nd' ordinal suffixes
df = df.assign(
    Match_ID=df['Match_ID'].str.replace('ODI #', ''),
    Opposition=df['Opposition'].str.replace('v ', ''),
    Bat=df['Bat'].str.replace('st', '').str.replace('nd', ''),
)

In [None]:
# Inspect the first 50 rows after cleaning.
df.head(n=50)

#### Factorizing the home team and opposition in such a way that the same label is encoded for the same team (India should be 0 in both opposition and home team)

In [None]:
# Pool both team columns into one series so a team gets the same code whether it
# appears as 'Country' or as 'Opposition'. sort_index(level=1) orders the pooled
# values by their source column label before codes are assigned.
team_names = (
    df[['Opposition', 'Country']]
    .stack()
    .drop_duplicates()
    .sort_index(level=1)
)
f = pd.factorize(team_names)

# Lookup table: team name -> integer code.
s1 = pd.Series(f[0], index=f[1])

# Map every column through the lookup and attach the results as '<column>_ID'.
# Only the team columns produce meaningful values.
encoded_columns = df.apply(lambda column: column.map(s1)).add_suffix('_ID')
df = df.assign(**encoded_columns)

In [None]:
# Remove the '_ID' columns that were generated for non-team columns.
spurious_id_columns = [
    'Result_ID',
    'Margin_ID',
    'Toss_ID',
    'Bat_ID',
    'Ground_ID',
    'Match_ID_ID',
    'Country_ID_ID',
]
df = df.drop(columns=spurious_id_columns)

### Factorizing other attributes of the data

In [None]:
# Integer-encode the remaining categorical attributes, keeping the code -> label lookups.
ground_factorized, ground_categories = df['Ground'].factorize()
toss_factorized, toss_categories = df['Toss'].factorize()
result_factorized, result_categories = df['Result'].factorize()

In [None]:
# Overwrite the text columns with their integer codes.
df = df.assign(
    Ground=ground_factorized,
    Toss=toss_factorized,
    Result=result_factorized,
)

# Team codes as plain NumPy arrays for the feature matrix.
arr = df['Country_ID'].to_numpy()
arr1 = df['Opposition_ID'].to_numpy()

# Show which result label corresponds to code 0 and which to code 1.
print(result_categories)

In [None]:
# The raw text columns are no longer needed once the codes exist.
raw_text_columns = ['Margin', 'Country', 'Opposition']
df = df.drop(columns=raw_text_columns)

In [None]:
# Inspect the first 100 rows of the fully encoded frame.
df.head(n=100)

### Here we take the toss, the ground on which the match is played, the home team and the opposition as input features. We take the result as our output variable so we can classify whether the team has won or lost the match

In [None]:
from sklearn.preprocessing import StandardScaler

# Feature matrix, one row per match record: [toss, ground, country, opposition].
X = np.column_stack((toss_factorized, ground_factorized, arr, arr1)).astype(np.float64)

# Sanity output: shapes, types and a (truncated) view of features and target.
for item in (X.shape, result_factorized.shape, type(X), type(result_factorized), X.shape, X, result_factorized):
    print(item)

# Standardise each feature column; `sc` keeps the fitted statistics so new
# inputs can be transformed the same way.
sc = StandardScaler()
X = sc.fit_transform(X)

# Recurrent layers expect (samples, time steps, features); each match is
# treated as a sequence of length one.
X = X[:, np.newaxis, :]
print(X.shape)

### Here the Sequential model has 2 LSTM layers followed by 2 dense layers; the last dense layer uses the sigmoid function as its activation and works as the output layer (LSTM)

In [None]:
# Fix the TensorFlow seed so weight initialisation is repeatable.
tf.random.set_seed(42)

# Unregularized classifier: two stacked LSTMs, a hidden dense layer and a
# sigmoid output. Note that this rebinds `model1` from use case 1.
model1 = Sequential([
    # The first LSTM returns the full sequence so the second LSTM can consume it.
    LSTM(128, activation="relu", input_shape=(1, 4), return_sequences=True),
    LSTM(128, activation='relu'),
    Dense(128, activation='relu'),
    # A single sigmoid unit gives the probability of class 1.
    Dense(1, activation='sigmoid'),
])
model1.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model1.summary()

In [None]:
# Draw the layer graph of the unregularized LSTM; 'model.png' is plot_model's default target.
plot_model(model1, to_file='model.png')

In [None]:
# Train for 50 epochs with the default batch size, holding out 33% of the samples for validation.
history = model1.fit(
    x=X,
    y=result_factorized,
    epochs=50,
    validation_split=0.33,
)

### Here the Sequential model has 2 LSTM layers, each followed by dropout set to 0.5 for regularization to reduce overfitting, and then 2 dense layers; the last dense layer uses the sigmoid function as its activation and works as the output layer (Regularized LSTM)

In [None]:
# Per-epoch metrics recorded by the most recent fit() call on `model1`.
history_dict = model1.history.history
loss_value, val_loss_value = history_dict['loss'], history_dict['val_loss']
acc, val_acc = history_dict['accuracy'], history_dict['val_accuracy']
epochs = range(1, len(loss_value) + 1)

# Figure 1: training (blue) versus validation (red) loss.
loss_fig, loss_ax = plt.subplots()
loss_ax.plot(epochs, loss_value, 'b', label='Training Loss')
loss_ax.plot(epochs, val_loss_value, 'r', label='Validation Loss')
loss_ax.set_title('Training and Validation Loss')
loss_ax.set_xlabel('Epochs')
loss_ax.set_ylabel('Loss')
loss_ax.legend()
plt.show()

# Figure 2: training (blue) versus validation (red) accuracy.
acc_fig, acc_ax = plt.subplots()
acc_ax.plot(epochs, acc, 'b', label='Training Accuracy')
acc_ax.plot(epochs, val_acc, 'r', label='Validation Accuracy')
acc_ax.set_title('Training and Validation Accuracy')
acc_ax.set_xlabel('Epochs')
acc_ax.set_ylabel('Accuracy')
acc_ax.legend()
plt.show()

In [None]:
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Dropout, Input,LSTM,SimpleRNN,GRU
from tensorflow.keras.optimizers import SGD
import tensorflow as tf
tf.random.set_seed(42)
import matplotlib.pyplot as plt

# Regularized classifier: the same stack as the unregularized LSTM, with 50%
# dropout after each LSTM layer. Note that this rebinds `model` from use case 1.
model = Sequential([
    # The first LSTM returns the full sequence so the second LSTM can consume it.
    LSTM(128, activation="relu", input_shape=(1, 4), return_sequences=True),
    Dropout(0.5),
    LSTM(128, activation='relu'),
    Dropout(0.5),
    Dense(128, activation='relu'),
    # A single sigmoid unit gives the probability of class 1.
    Dense(1, activation='sigmoid'),
])
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

### We fit the model below without specifying a batch size, as the data set is small, for 50 epochs and with a validation split of 0.33, which uses the first 67% of the data for training and holds out the last 33% for validation

In [None]:
# Draw the layer graph of the regularized LSTM and write it to model.png.
architecture_png = "model.png"
plot_model(model, to_file=architecture_png)

In [None]:
# Train for 50 epochs with the default batch size, holding out 33% of the samples for validation.
history = model.fit(
    x=X,
    y=result_factorized,
    epochs=50,
    validation_split=0.33,
)

### Below we show the training and validation loss as well as the accuracy graphs

In [None]:
# Per-epoch metrics recorded by the most recent fit() call on `model`.
history_dict = model.history.history
loss_value, val_loss_value = history_dict['loss'], history_dict['val_loss']
acc, val_acc = history_dict['accuracy'], history_dict['val_accuracy']
epochs = range(1, len(loss_value) + 1)

# Figure 1: training (blue) versus validation (red) loss.
loss_fig, loss_ax = plt.subplots()
loss_ax.plot(epochs, loss_value, 'b', label='Training Loss')
loss_ax.plot(epochs, val_loss_value, 'r', label='Validation Loss')
loss_ax.set_title('Training and Validation Loss')
loss_ax.set_xlabel('Epochs')
loss_ax.set_ylabel('Loss')
loss_ax.legend()
plt.show()

# Figure 2: training (blue) versus validation (red) accuracy.
acc_fig, acc_ax = plt.subplots()
acc_ax.plot(epochs, acc, 'b', label='Training Accuracy')
acc_ax.plot(epochs, val_acc, 'r', label='Validation Accuracy')
acc_ax.set_title('Training and Validation Accuracy')
acc_ax.set_xlabel('Epochs')
acc_ax.set_ylabel('Accuracy')
acc_ax.legend()
plt.show()

### We can see the results classified as won or lost below

In [None]:
# Sequential.predict_classes() has been removed from tf.keras. For a single
# sigmoid output the equivalent is to threshold the predicted probability at 0.5.
Y_pred = (model.predict(X) > 0.5).astype("int32")
Y_pred

### Taking user input for the two teams (home and opposition), the team which won the toss and the ground on which the match is played, and then predicting whether the home team will win or lose

In [None]:
# toss
def _prompt_int_in_range(prompt, low, high, success_msg, range_msg):
    """Keep asking until the user types an integer in ``[low, high]``.

    Args:
        prompt: Text shown to the user by ``input``.
        low: Smallest accepted value (inclusive).
        high: Largest accepted value (inclusive).
        success_msg: Printed once a valid value has been entered.
        range_msg: Printed when the value is an integer but out of range.

    Returns:
        The validated integer.
    """
    while True:
        try:
            value = int(input(prompt))
        except ValueError:
            print("Provide an appropriate integer value...")
            continue
        # Both bounds must hold. Joining them with `or` accepts every integer.
        if low <= value <= high:
            print(success_msg)
            return value
        print(range_msg)


# toss
t = _prompt_int_in_range(
    "Input toss result 0 or 1: ", 0, 1,
    "toss value entered successfully...",
    "toss value should be either 0 or 1",
)

# ground
g = _prompt_int_in_range(
    "Input ground code (0-96): ", 0, 96,
    "ground code entered successfully...",
    "ground code should be between 0-96",
)

# country
c = _prompt_int_in_range(
    "Input home country code(0-16): ", 0, 16,
    "home country code entered successfully...",
    "country code value should be between 0 and 16",
)

# opposition (validated against its own value, not the home country code)
o = _prompt_int_in_range(
    "Input opposition country code(0-16): ", 0, 16,
    "opposition country code entered successfully...",
    "opposition code value should be between 0 and 16",
)

**Here we have taken the toss as won (set to 1), the ground as Kolkata, the country as India and the opposition as Pakistan. The match has been predicted to be won by India.**

In [None]:
# One match described by [toss, ground, home country, opposition].
dummyData = np.array([[t, g, c, o]], dtype=float)

# The network was trained on standardised features, so the same fitted scaler
# must be applied here. Feeding raw codes gives the model inputs on a different
# scale from anything it saw during training.
dummyData = sc.transform(dummyData)

# Add the length-one time axis the LSTM expects: (samples, 1, features).
dummyData = np.reshape(dummyData, (dummyData.shape[0], 1, dummyData.shape[1]))

# Sequential.predict_classes() has been removed from tf.keras; threshold the
# sigmoid probability at 0.5 instead.
pred = (model.predict(dummyData) > 0.5).astype("int32")

print(pred)