In [None]:
# Data
# The dataset comes from https://cricsheet.org/. It covers One Day International (ODI) matches for all male players
# and is in the YAML format.
# The classification goal is to analyse the data for India and Australia only, and to predict who will win the coming
# ODI series between Australia and India.
# The dataset can be downloaded from here - "https://cricsheet.org/downloads/odis_male.zip".

In [None]:
import glob
import pandas as pd
import yaml

path =r'E:\Hackathron\India_VS_Australia\odis_male' # use your path
# Sorted so matches are processed (and the rows of `frame` laid out) in a reproducible order.
allFiles = sorted(glob.glob(path + "/*.yaml"))

# A match is kept only when both of these sides took part in it.
_REQUIRED_TEAMS = {'INDIA', 'AUSTRALIA'}

# Column order of the per-delivery table.
_DELIVERY_COLUMNS = ['Innings', 'Team', 'Batsman', 'Bowler', 'Batsman_Runs', 'Extra_Runs',
                     'Total_Runs', 'Fielder', 'OutBy', 'Player_Out', 'Wicket', 'Sixes', 'Fours']

# Categorical columns over which the per-delivery numbers are summed.
_GROUP_KEYS = ['Innings', 'Team', 'Batsman', 'Bowler', 'OutBy', 'Fielder', 'Player_Out']


def _delivery_rows(match):
    """Flatten one parsed match document into a list of per-delivery records.

    Parameters
    ----------
    match : dict
        A parsed YAML match with an ``innings`` list; each innings maps its name
        to a dict holding ``team`` and ``deliveries``.

    Returns
    -------
    list of dict
        One record per delivery, keyed by ``_DELIVERY_COLUMNS``.
    """
    rows = []
    for innings_entry in match['innings']:
        for innings_name, innings_data in innings_entry.items():
            for delivery in innings_data['deliveries']:
                # Each delivery is a one-entry mapping {ball number: details}.
                for ball in delivery.values():
                    runs = ball['runs']
                    dismissal = ball.get('wicket')
                    # NOTE(review): some files may store several dismissals as a list;
                    # only the first is used here — confirm against the format spec.
                    if isinstance(dismissal, list):
                        dismissal = dismissal[0] if dismissal else None
                    if dismissal:
                        # 'fielders' is absent for dismissals such as bowled or lbw. The
                        # wicket must still be counted, so only the fielder defaults.
                        fielders = dismissal.get('fielders') or ["None"]
                        fielder = fielders[0]
                        out_kind = dismissal.get('kind', "None")
                        player_out = dismissal.get('player_out', "None")
                        wicket = 1
                    else:
                        fielder = out_kind = player_out = "None"
                        wicket = 0
                    rows.append({
                        'Innings': innings_name,
                        'Team': innings_data['team'],
                        'Batsman': ball['batsman'],
                        'Bowler': ball['bowler'],
                        'Batsman_Runs': runs['batsman'],
                        'Extra_Runs': runs['extras'],
                        'Total_Runs': runs['total'],
                        'Fielder': fielder,
                        'OutBy': out_kind,
                        'Player_Out': player_out,
                        'Wicket': wicket,
                        'Sixes': 1 if runs['batsman'] == 6 else 0,
                        'Fours': 1 if runs['batsman'] == 4 else 0,
                    })
    return rows


def _match_frame(match):
    """Aggregate one match by ``_GROUP_KEYS`` and attach the match-level info columns."""
    info = match['info']
    data = pd.DataFrame(_delivery_rows(match), columns=_DELIVERY_COLUMNS)
    df = data.groupby(_GROUP_KEYS, as_index=False).sum()
    # Fall back to the first word of the venue when the file carries no city.
    df['City'] = info['city'] if 'city' in info else info['venue'].split(" ")[0]
    df['Dates'] = info['dates'][0]
    # Matches without a winner (or without an outcome at all) are labelled "No Result".
    df['Winner'] = (info.get('outcome') or {}).get('winner', "No Result")
    df['Man Of The Match'] = (info.get('player_of_match') or ["No One"])[0]
    df['Toss_Decision'] = info['toss']['decision']
    df['Toss_Winner'] = info['toss']['winner']
    df['Venue'] = info['venue']
    return df


list_ = []

for file_ in allFiles:
    with open(file_, 'r', encoding='utf-8') as stream:
        try:
            # safe_load builds plain Python objects only. Bare yaml.load() can construct
            # arbitrary objects and needs an explicit Loader on current PyYAML.
            ODI = yaml.safe_load(stream)
        except yaml.YAMLError as exc:
            print(exc)
            # Skip the unreadable file instead of re-processing the previous match.
            continue
    teams = {x.strip().upper() for x in ODI['info']['teams']}
    if _REQUIRED_TEAMS <= teams:
        list_.append(_match_frame(ODI))

if not list_:
    # pd.concat would raise an uninformative ValueError; say what actually went wrong.
    raise ValueError("No India v Australia matches were loaded from %r - check `path`." % path)

frame = pd.concat(list_, axis = 0, ignore_index = True)

In [None]:
# Persist the combined table to an Excel workbook; index=False leaves the row index out of the sheet.
frame.to_excel("India_Australia_ODI.xlsx", index = False)

In [None]:
# Now let's import the plotting libraries
import matplotlib.pyplot as plt 
# NOTE(review): seaborn's set() is understood to reset matplotlib rc parameters, so this font size may be
# overridden by the sns.set(...) calls below - confirm, and move this line after them if 14pt is wanted.
plt.rc("font", size=14)
import seaborn as sns
# NOTE(review): the second sns.set(...) call also sets the style, so "white" appears to be superseded by "whitegrid".
sns.set(style="white")
sns.set(style="whitegrid", color_codes=True)

In [None]:
# Read back the workbook that holds the aggregated match data.
RawDataSets = pd.read_excel("India_Australia_ODI.xlsx")

# Quick size check, then the column names.
print(RawDataSets.shape)
print(RawDataSets.columns.tolist())

In [None]:
# After converting the YAML data to a DataFrame we get 20 columns and 2859 rows. Each row is an aggregation over the
# categorical columns: it represents the ball-by-ball scores summed for one combination of those columns.

In [None]:
# Now let's look at the first rows of the extracted data to get a feel for it

RawDataSets.head()

In [None]:
# Now let's discuss the input variables. Most of them are self-explanatory, so we only discuss a few.
# Batsman, Bowler, OutBy, Fielder and Player_Out are the columns the data is aggregated on.
# For example: when batsman "D Mongia" is batting against bowler "GD McGrath", how many runs he made, wickets lost, etc.
# So basically it is an aggregation of all the balls faced by "D Mongia" while "GD McGrath" is bowling.

In [None]:
# Predicted variable (desired target):
# Winner — a categorical variable which holds the name of the team that won the match

In [None]:
# Data exploration
# The data is not yet in the right shape for analysis: the batsman/bowler level columns multiply the rows.

# To answer the first question (winner of the series) we keep only the match-level and scoring columns.

_outcome_columns = [
    'Innings', 'Team',
    'Batsman_Runs', 'Extra_Runs', 'Total_Runs', 'Wicket', 'Sixes', 'Fours',
    'City', 'Dates', 'Winner', 'Man Of The Match',
    'Toss_Decision', 'Toss_Winner', 'Venue',
]
ODI_Outcome = RawDataSets[_outcome_columns]

In [None]:
# Sum the numeric columns per innings of each match, identified by the columns below.

_innings_keys = ['Innings', 'Team', 'City', 'Dates', 'Winner', 'Man Of The Match',
                 'Toss_Decision', 'Toss_Winner', 'Venue']
Aggre_ODI = ODI_Outcome.groupby(_innings_keys, as_index=False).sum()

Aggre_ODI.describe()

In [None]:
# Order the aggregated rows by match date, then by innings, before inspecting them

Aggre_ODI = Aggre_ODI.sort_values(['Dates', 'Innings'])

In [None]:
# First rows of the sorted, per-innings aggregate.
Aggre_ODI.head()

In [None]:
# To analyse each ODI match we do not need ball-level information, so we build a new data set.

# Combine innings and team into a single label, e.g. "<innings>_<team>"

Aggre_ODI["Team_Innings"] = Aggre_ODI["Innings"].str.cat(Aggre_ODI["Team"], sep="_")

# One indicator column per distinct Team_Innings label.
ODI_Analysis = pd.get_dummies(Aggre_ODI.loc[:, ['Team_Innings']])

In [None]:
# The data is now in a format where aggregating further does not lose the innings information

ODI_Analysis.head()

In [None]:
# Attach the indicator columns to the aggregated data (matched on the row index) and check the result

ODI_Join = Aggre_ODI.join(ODI_Analysis, how='left')

ODI_Join.head()

In [None]:
# Remove the Innings, Team and Team_Innings columns, and the scoring totals, before aggregating again

_innings_level_columns = ['Innings', 'Team', 'Team_Innings',
                          'Batsman_Runs', 'Extra_Runs', 'Total_Runs',
                          'Wicket', 'Sixes', 'Fours']
ODI_Final = ODI_Join.drop(_innings_level_columns, axis=1)

In [None]:
# Collapse to one row per match by summing the indicator columns over the match-level columns

_match_keys = ['City', 'Dates', 'Winner', 'Man Of The Match',
               'Toss_Decision', 'Toss_Winner', 'Venue']
Final_ODI = ODI_Final.groupby(_match_keys, as_index=False).sum()

print(Final_ODI.shape)

Final_ODI.head()

In [None]:
# We now have 43 rows, which is the exact number of matches played by India and Australia from 2006 to 2016
# Now let's explore the data

# Visualise the dependent variable: number of matches per winner

sns.countplot(data=Final_ODI, x='Winner', palette='hls')
plt.show()

In [None]:
# Share of matches won by each side
Final_ODI['Winner'].value_counts(normalize=True).to_frame()

In [None]:
# The dependent variable is imbalanced: the ratio of India winning to Australia winning is 32:53.

# Before balancing the dependent variable, let's do some more exploration.

# Number of India v Australia matches played in each city
Final_ODI['City'].value_counts().to_frame()

In [None]:
# Impact of the toss decision on the result: winner proportions within each toss decision

Winner_Toss = Final_ODI.groupby("Toss_Decision")['Winner'].value_counts(normalize=True)

Winner_Toss.to_frame()

In [None]:
# Show the table above as a cross matrix: one row per toss decision, one column per winner

Winner_Toss.unstack()

# The author's reading of the table: when the toss decision is to bat, the team wins most of the time

In [None]:
# Venues of the India v Australia ODIs, counted within each city

Final_ODI.groupby("City")['Venue'].value_counts().to_frame()

In [None]:
# The venues above show that Nagpur has two cricket stadiums with somewhat similar names.
# This could have been a data discrepancy,
# but when we checked against the actual stadiums we found that two stadiums with these names do exist, so the data is accurate

In [None]:
# Wins per city, split by winner

_city_wins = pd.crosstab(Final_ODI['City'], Final_ODI['Winner'])
_ax = _city_wins.plot(kind='bar', figsize=(14,4))
_ax.set_title('Winning Frequency In City')
_ax.set_xlabel('City')
_ax.set_ylabel('Frequency of Winning')
plt.show()

In [None]:
# The frequency of winning per city looks like a useful predictor:
# among Indian cities, India has won most often in Nagpur and Australia in Chandigarh

# Same view per venue

_venue_wins = pd.crosstab(Final_ODI['Venue'], Final_ODI['Winner'])
_ax = _venue_wins.plot(kind='bar', figsize=(14,4))
_ax.set_title('Winning Frequency In Venue')
_ax.set_xlabel('Venue')
_ax.set_ylabel('Frequency of Winning')
plt.show()

In [None]:
# Add a Quarter column (calendar quarter, 1-4) derived from the match date.
# pd.to_datetime makes this work whether the dates were read as Timestamps, plain dates or
# strings (a row-wise `x.quarter` only works on Timestamps); .dt.quarter is vectorised.

Final_ODI["Quarter"] = pd.to_datetime(Final_ODI["Dates"]).dt.quarter.astype("int64")

Final_ODI.sort_values(by=['Dates']).head()

In [None]:
# Number of wins per side in each quarter

Winner_Quarter = Final_ODI.groupby("Quarter")['Winner'].value_counts()

Winner_Quarter.unstack()

# Author's reading: Australia has won an equal number of times in the 1st and 4th quarters, and India has won most in the 4th quarter

In [None]:
# Now let's build the model that predicts who will win the series

# The first step is to convert the data into a machine-readable format

# Start by removing the date column

ODI_Machine = Final_ODI.drop('Dates', axis=1)

ODI_Machine.head()

In [None]:
# The ML model needs a dependent variable with the values 0 and 1

# First remove the matches with no result.
# .copy() makes the filtered frame independent of the frame it was sliced from, so columns
# can be added to it later without pandas raising SettingWithCopyWarning.

ODI_Machine = ODI_Machine.loc[ODI_Machine['Winner'] != 'No Result'].copy()

print(ODI_Machine.shape)

In [None]:
# Create the target column Winner_Y with the values 0 and 1:
# 1 means Australia won, 0 means any other winner (India, once 'No Result' rows are removed).

# assign() builds a new frame rather than writing into what may be a filtered slice of
# another frame, which avoids pandas' SettingWithCopyWarning. The comparison is vectorised.
ODI_Machine = ODI_Machine.assign(Winner_Y=(ODI_Machine['Winner'] == 'Australia').astype(int))

ODI_Machine.loc[:,["Winner","Winner_Y"]]

In [None]:
# Remove the winner columns, the man of the match and the Australia innings indicators from the features

_non_feature_columns = [
    'Man Of The Match',
    'Winner',
    'Winner_Y',
    'Team_Innings_1st innings_Australia',
    'Team_Innings_2nd innings_Australia',
]
Final_Data = ODI_Machine.drop(_non_feature_columns, axis=1)

Final_Data.head()

In [None]:
# Create dummy variables,
# i.e. variables with only two values, zero and one.
# drop_first=True leaves out the first level of each encoded column.

Machine_Data = pd.get_dummies(data=Final_Data, drop_first=True)

print(Machine_Data.shape)
print(Machine_Data.columns.tolist())

In [None]:
# Join the dependent variable onto the encoded features (matched on the row index)

Input_File = Machine_Data.join(ODI_Machine['Winner_Y'])

Input_File.head()

In [None]:
# Over-sampling using SMOTE
# With our training data created, I'll up-sample the India wins using the SMOTE algorithm (Synthetic Minority Oversampling Technique). At a high level, SMOTE:
# Works by creating synthetic samples from the minority class (India) instead of creating copies.
# Randomly chooses one of the k nearest neighbours and uses it to create a similar, but randomly tweaked, new observation.
# We are going to implement SMOTE in Python.

In [None]:
# Features: every column except the target. Target: a one-column frame holding Winner_Y.
X = Input_File.drop(columns='Winner_Y')
y = Input_File[['Winner_Y']]

In [None]:
# Importing the SMOTE library
from imblearn.over_sampling import SMOTE
# A fixed random_state keeps the synthetic samples the same on every run.
# NOTE(review): `os` is also the name of Python's standard `os` module. Nothing visible here imports that
# module, but this variable would hide it - consider renaming it together with the cell that calls it.
os = SMOTE(random_state=0)

In [None]:
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
# 30% of the rows are held out for testing; random_state=0 makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
# Keep the feature names so the oversampled feature matrix can be labelled with them afterwards.
columns = X_train.columns

In [None]:
# # Feature Scaling
# from sklearn.preprocessing import StandardScaler
# sc = StandardScaler()
# X_train = sc.fit_transform(X_train)
# X_test = sc.transform(X_test)

In [None]:
# imbalanced-learn renamed fit_sample() to fit_resample() and later removed the old name;
# use whichever one this installation provides.
_resample = getattr(os, "fit_resample", None) or os.fit_sample
# The sampler expects a 1-D target, so flatten the single-column y_train frame first.
os_data_X, os_data_y = _resample(X_train, y_train.values.ravel())
# Wrap the results in labelled DataFrames, whatever container type the sampler returned.
os_data_X = pd.DataFrame(data=os_data_X,columns=columns )
os_data_y= pd.DataFrame(data=os_data_y,columns=['Winner_Y'])

In [None]:
# Check the class balance of the oversampled training data
_n_total = len(os_data_X)
_n_india = int((os_data_y['Winner_Y'] == 0).sum())
_n_australia = int((os_data_y['Winner_Y'] == 1).sum())

print("length of oversampled data is ", _n_total)
print("Number of India winning in oversampled data", _n_india)
print("Number of Australia winning", _n_australia)
print("Proportion of india winning data in oversampled data is ", _n_india / _n_total)
print("Proportion of australia data in oversampled data is ", _n_australia / _n_total)

In [None]:
# Now we have perfectly balanced data! We have over-sampled only the training data:
# that way none of the information in the test data is used to create
# synthetic observations, and therefore no information bleeds from the test data into model training.

In [None]:
# Recursive Feature Elimination
# Recursive Feature Elimination (RFE) is based on the idea to repeatedly construct a model and choose either the best or
# worst performing feature, setting the feature aside and then repeating the process with the rest of the features. 
# This process is applied until all features in the dataset are exhausted. The goal of RFE is to select features by recursively
# considering smaller and smaller sets of features.

In [None]:
data_final_vars=Input_File.columns.values.tolist()
# NOTE: from here on `y` and `X` are lists of column names, not data frames.
y=['Winner_Y']
X=[i for i in data_final_vars if i not in y]
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()
# n_features_to_select must be passed by keyword: current scikit-learn releases reject it
# as a positional argument. The keyword form also works on older releases.
rfe = RFE(logreg, n_features_to_select=20)
# Fit on the oversampled training data; ravel() turns the one-column target into a 1-D array.
rfe = rfe.fit(os_data_X, os_data_y.values.ravel())
# support_ flags the selected features; ranking_ gives each feature's rank (1 = selected).
print(rfe.support_)
print(rfe.ranking_)

In [None]:
# Pair every feature name with its RFE selection flag.
# The last entry of data_final_vars is left out of the name list.

Column_Data = dict(Col_Name=data_final_vars[:-1], Boolen_Val=rfe.support_)

In [None]:
# Turn the name/flag mapping into a data frame so the selected columns can be filtered out

Column_File = pd.DataFrame(Column_Data)

In [None]:
# Features selected by RFE:

Column_File.loc[Column_File['Boolen_Val'], 'Col_Name'].values

In [None]:
# Names of the RFE-selected features, then the matching slice of the oversampled data.
cols = Column_File.loc[Column_File['Boolen_Val'], 'Col_Name'].tolist()
X = os_data_X.loc[:, cols]
y = os_data_y.loc[:, 'Winner_Y']

In [None]:
# Now let's implement the model
# Logistic Regression Model Fitting

from sklearn.linear_model import LogisticRegression
from sklearn import metrics
# X / y are the SMOTE-balanced *training* rows. Splitting them again would put synthetic
# samples in the "test" set and discard the real hold-out rows, so every later metric would
# be measured on made-up matches. Train on all balanced rows instead, and keep the hold-out
# split for evaluation.
# NOTE(review): this expects X_test / y_test to still be the hold-out split made before
# oversampling - run the notebook top to bottom.
X_train, y_train = X, y
# Restrict the hold-out features to the columns the model is trained on.
X_test = X_test[cols]
# The hold-out target may still be a one-column frame; the metrics expect a 1-D target.
if isinstance(y_test, pd.DataFrame):
    y_test = y_test['Winner_Y']
logreg = LogisticRegression()
logreg.fit(X_train, y_train)

In [None]:
# Predict the test set and report the accuracy on it

y_pred = logreg.predict(X_test)
_test_accuracy = logreg.score(X_test, y_test)
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(_test_accuracy))

In [None]:
# Confusion Matrix

from sklearn.metrics import confusion_matrix
# Store the result under its own name: assigning it to `confusion_matrix` would replace the
# imported function with an array for the rest of the session.
conf_matrix = confusion_matrix(y_test, y_pred)
print(conf_matrix)

In [None]:
# The result is telling us that we have 4+3 correct predictions and 2+0 incorrect predictions.

In [None]:
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
# Use the predicted probability of class 1 for BOTH the curve and the area. Computing the
# area from hard 0/1 predictions gives a number that does not belong to the plotted curve.
y_score = logreg.predict_proba(X_test)[:,1]
logit_roc_auc = roc_auc_score(y_test, y_score)
fpr, tpr, thresholds = roc_curve(y_test, y_score)
plt.figure()
plt.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)' % logit_roc_auc)
# Diagonal reference line from (0, 0) to (1, 1).
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.show()

In [None]:
# save the model to disk
# scikit-learn no longer ships joblib under sklearn.externals; import the standalone package
# and fall back to the old location only on installations that still need it.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib
filename = 'finalized_model.sav'
joblib.dump(logreg, filename)

In [None]:
# load the model from disk
loaded_model = joblib.load(filename)
# Feature rows to predict on, read from a separate workbook (presumably the upcoming matches).
# NOTE(review): predict() receives every column of this sheet as-is, so the sheet presumably has to hold
# exactly the features the model was trained on, in the same order - verify against `cols`.
Odi_test = pd.read_excel("ODI_Test.xlsx")
result = loaded_model.predict(Odi_test)
print(result)

In [None]:
# So the above result shows that Australia will win the series, with 4 wins out of 5

In [None]:
# Now let us analyse who will be the highest run scorer of the series

# This needs the batsman-level rows of the main data set

# Batsman_Runs grouped per match date and batsman, reused for the three summaries below.
_runs_by_match = RawDataSets.groupby(['Dates', 'Batsman'], as_index=False)['Batsman_Runs']
Batsman_MedianRun = _runs_by_match.median()
Batsman_Mean = _runs_by_match.mean()
Batsman_TotalRun = _runs_by_match.sum()

Batsman_MedianRun.sort_values('Batsman_Runs', ascending=False).head()

In [None]:
# Top rows by total runs per match date and batsman.
Batsman_TotalRun.sort_values(by=['Batsman_Runs'], ascending=False).head()

In [None]:
# Top rows by mean of the aggregated run values per match date and batsman.
Batsman_Mean.sort_values(by=['Batsman_Runs'], ascending=False).head()

In [None]:
# We can see that in all the cases Rohit Sharma is the highest run scorer, with a consistent average of 28 runs

# Next we have to predict who will be the highest wicket-taker

# Bowler_MedianWicket = RawDataSets.groupby(['Dates','Bowler'],as_index=False).Wicket.median()
# Bowler_MeanWicket = RawDataSets.groupby(['Dates','Bowler'],as_index=False).Wicket.mean()

# Only these dismissal kinds are credited to the bowler. Run outs, retirements, obstructing
# the field and the like happen while a bowler is bowling but are not that bowler's wickets.
_BOWLER_DISMISSALS = ['bowled', 'caught', 'caught and bowled', 'lbw', 'stumped', 'hit wicket']
_credited = RawDataSets['OutBy'].isin(_BOWLER_DISMISSALS)
# Zero out the uncredited wickets instead of dropping rows, so every bowler keeps a total.
_bowler_rows = RawDataSets.assign(Wicket=RawDataSets['Wicket'].where(_credited, 0))
Bowler_TotalWicket = _bowler_rows.groupby(['Dates','Bowler'],as_index=False).Wicket.sum()

Bowler_TotalWicket.sort_values(by=['Wicket'], ascending=False).head()

In [None]:
# We see that there is a tie for the highest wicket-taker, but since Ben is not playing this ODI series,
# we will go with Kane William Richardson as the highest wicket-taker

In [None]:
# Now let us analyse who will hit the maximum number of sixes in this ODI series

Batsman_TotalSixes = RawDataSets.groupby(['Dates', 'Batsman'], as_index=False)['Sixes'].sum()

Batsman_TotalSixes.sort_values('Sixes', ascending=False).head()

In [None]:
# Median of the aggregated Sixes values per match date and batsman.
Batsman_MedianSixes = RawDataSets.groupby(['Dates', 'Batsman'], as_index=False)['Sixes'].median()

Batsman_MedianSixes.sort_values('Sixes', ascending=False).head()

In [None]:
# From the above analysis we can safely say that Rohit Sharma will hit the maximum number of sixes

In [None]:
# Now let's see who will hit the maximum number of 4's in the series

Batsman_TotalFour = RawDataSets.groupby(['Dates', 'Batsman'], as_index=False)['Fours'].sum()

Batsman_TotalFour.sort_values('Fours', ascending=False).head()

In [None]:
# Median of the aggregated Fours values per match date and batsman.
Batsman_MedianFour = RawDataSets.groupby(['Dates', 'Batsman'], as_index=False)['Fours'].median()

Batsman_MedianFour.sort_values('Fours', ascending=False).head()

In [None]:
# Mean of the aggregated Fours values per match date and batsman.
Batsman_MeanFour = RawDataSets.groupby(['Dates', 'Batsman'], as_index=False)['Fours'].mean()

Batsman_MeanFour.sort_values('Fours', ascending=False).head()

In [None]:
# From the above analysis of the number of 4's hit by each batsman, we find that "SR Tendulkar" has the highest number
# of 4's against Australia, but since he is retired we will not consider him in our result.
# The second highest number of 4's was hit by Virat Kohli, but we find that he is an outlier in this case, since his
# distribution of 4's is skewed to the right (the mean is greater than the median).
# Finally, the 3rd highest hitter of 4's is Rohit Sharma, with a mean of 2.83 and a median of 3, which tells us that his
# distribution is roughly normal, so we can safely bet on him to hit the highest number of 4's in the series.