In [1]:
import pandas as pd
import os

def get_concatenated_innings_data(tour_type="all", inning_type="batting", country=None):
    data = pd.read_csv("data/australia-test-summary-1990-2020-" + tour_type + ".csv")
    ids = list(data["Scorecard Link"].apply(lambda x: x.split("/")[-1].split(".")[0]))

    innings = []
    for match in os.listdir("matches"):
        if True in [True if str(id) in match else False for id in ids]:
            for inning in os.listdir("matches/" + match):
                if inning_type in inning:
                    inning_data = pd.read_csv("matches/" + match + "/" + inning)
                    inning_data["Inning Num"] = [inning for i in range(len(inning_data))]
                    innings.append(inning_data)

    innings = pd.concat(innings, ignore_index=True).reset_index()
    innings["Year"] = innings["Date"].apply(lambda x: int(x.split(" ")[-1]))
    innings = innings.merge(innings.groupby(by=["Year", "Name"])["Name"].agg(["count"]).reset_index())
    innings = innings.rename(columns={"count":"Innings Count"})
    
    if inning_type == "bowling":
        innings = innings.drop(columns=["Bowler Profile", "index", "Overs"])
        if country:
            innings = innings.loc[innings["Country"] != "Australia"]
            innings = innings.drop(columns="Country")
    elif inning_type == "batting":
        innings = innings.drop(columns=["Batsman Profile", "index"])
        if country:
            innings = innings.loc[innings["Country"] == "Australia"]
            innings = innings.drop(columns="Country")

    return innings

In [113]:
def create_player_summary(innings_data):
    innings_data["Name"] = innings_data["Name"].apply(lambda name: name.split(" ")[0] + " " + name.split(" ")[1])
    
    first_years = innings_data.groupby(by="Name")["Year"].min()
    last_years = innings_data.groupby(by="Name")["Year"].max()
    span = (last_years - first_years).reset_index()

    innings_data = innings_data.drop(columns=["Year", "Date"])

    batting_innings_runs = innings_data.groupby(by=["Name"])["Runs"].sum().reset_index()
    batting_innings_counts = innings_data.groupby(by=["Name"])["Name"].agg(["count"]).reset_index()
    batting_innings_balls = innings_data.groupby(by=["Name"])["Balls"].sum().reset_index()
    hundreds = innings_data.loc[innings_data["Runs"] > 100].groupby(by=["Name"])["Runs"].agg(["count"]).reset_index()
    fifties = innings_data.loc[innings_data["Runs"] > 50].groupby(by=["Name"])["Runs"].agg(["count"]).reset_index()
    thirties = innings_data.loc[innings_data["Runs"] > 30].groupby(by=["Name"])["Runs"].agg(["count"]).reset_index()
    
    innings_summary = batting_innings_runs.merge(batting_innings_balls)
    innings_summary = innings_summary.merge(batting_innings_counts, on="Name")
    innings_summary = innings_summary.merge(span, on="Name")
    innings_summary = innings_summary.merge(hundreds, on="Name", how="outer")
    innings_summary = innings_summary.rename(columns={"count_x": "Innings Total", "Year":"Career Span", "count_y":"Hundreds"})
    innings_summary = innings_summary.merge(fifties, on="Name", how="outer")
    innings_summary = innings_summary.merge(thirties, on="Name", how="outer")
    innings_summary = innings_summary.rename(columns={"count_x":"Fifties", "count_y":"Thirties"})
    innings_summary = innings_summary.merge(last_years.reset_index(), on="Name")
    innings_summary = innings_summary.rename(columns={"Year":"Last Year"})

    innings_summary["Balls per Innings"] = innings_summary["Balls"]/innings_summary["Innings Total"]
    innings_summary["Runs per Innings"] = innings_summary["Runs"]/innings_summary["Innings Total"]
    innings_summary["Strike Rate"] = innings_summary["Runs"]/innings_summary["Balls"]

    innings_summary["Hundreds per Innings"] = innings_summary["Hundreds"]/innings_summary["Innings Total"]
    innings_summary["Fifties per Innings"] = innings_summary["Fifties"]/innings_summary["Innings Total"]
    innings_summary["Thirties per Innings"] = innings_summary["Thirties"]/innings_summary["Innings Total"]
    innings_summary["Conversion Rate"] = innings_summary["Hundreds"]/(innings_summary["Hundreds"] + innings_summary["Fifties"])

    innings_summary = innings_summary.fillna(0)

    return innings_summary

def create_yearly_player_summary(innings_data, innings_count):
    bowling_innings_runs = innings_data.groupby(by=["Year", "Name"])["Runs"].sum().reset_index()
    bowling_innings_counts = innings_data.groupby(by=["Year", "Name"])["Name"].agg(["count"]).reset_index()
    bowling_innings_balls = innings_data.groupby(by=["Year", "Name"])["Balls"].sum().reset_index()
    
    innings_summary = bowling_innings_runs.merge(bowling_innings_balls)
    innings_summary = innings_summary.merge(innings_count, on="Year")
    innings_summary = innings_summary.rename(columns={("Inning Num", "count"): "Innings Total"})

    innings_summary["Balls per Innings"] = innings_summary["Balls"]/innings_summary["Innings Total"]
    innings_summary["Runs per Innings"] = innings_summary["Runs"]/innings_summary["Innings Total"]
    innings_summary["Strike Rate"] = innings_summary["Runs"]/innings_summary["Balls"]

    return innings_summary

In [3]:
def get_innings_count(innings_data):
    innings_data = innings_data.copy()
    innings_data = innings_data[["Year", "Date", "Inning Num"]]
    innings_data = innings_data.drop_duplicates()
    innings_data = innings_data.groupby(by="Year")[["Date", "Inning Num"]].agg(["count"]).reset_index().copy()
    innings_data = innings_data.drop(columns=["Date"])
    innings_data = innings_data.rename(columns={"Inning Num count": "Yearly Innings Count"})
    
    return innings_data


In [114]:
all_batting_innings = get_concatenated_innings_data(tour_type="all", inning_type="batting",  country="Australia")
all_innings_count = get_innings_count(all_batting_innings)
all_summary = create_player_summary(all_batting_innings)
all_summary_normalised = all_summary.copy()
columns = all_summary_normalised.columns

all_summary_normalised[columns[1:]] = all_summary_normalised[columns[1:]] - all_summary_normalised[columns[1:]].min()
all_summary_normalised[columns[1:]] = all_summary_normalised[columns[1:]]/all_summary_normalised[columns[1:]].max()

print(all_summary_normalised)

Name      Runs     Balls  Innings Total  Career Span  Hundreds  \
0       A Symonds  0.109284  0.098867       0.139860     0.235294     0.050   
1     AB McDonald  0.007998  0.009351       0.017483     0.000000     0.000   
2         AC Agar  0.014576  0.015234       0.020979     0.235294     0.000   
3         AC Dale  0.000448  0.000659       0.006993     0.058824     0.000   
4    AC Gilchrist  0.416355  0.298182       0.475524     0.529412     0.425   
..            ...       ...       ...            ...          ...       ...   
125       TM Head  0.080804  0.092984       0.090909     0.058824     0.050   
126      TM Moody  0.018538  0.024366       0.027972     0.117647     0.025   
127    UT Khawaja  0.215802  0.250549       0.269231     0.470588     0.200   
128   WN Phillips  0.001644  0.003380       0.003497     0.000000     0.000   
129    XJ Doherty  0.003812  0.006805       0.020979     0.176471     0.000   

      Fifties  Thirties  Last Year  Balls per Innings  Runs per 

In [61]:
from sklearn.decomposition import PCA
import plotly.subplots as subplots
import plotly.graph_objects as go

pca = PCA(n_components=4)
pca_components = pca.fit_transform(all_summary_normalised[["Runs", "Balls", "Balls per Innings", "Runs per Innings", "Strike Rate"]])
pca_components_df = pd.DataFrame(data = pca_components, columns = ['PC1', 'PC2', 'PC3', 'PC4'])

fig = subplots.make_subplots(rows=3, cols=3)
fig.add_trace(go.Scatter(x=pca_components_df["PC1"], y=pca_components_df["PC2"], mode="markers"), row=1, col=1)
fig.add_trace(go.Scatter(x=pca_components_df["PC1"], y=pca_components_df["PC3"], mode="markers"), row=1, col=2)
fig.add_trace(go.Scatter(x=pca_components_df["PC1"], y=pca_components_df["PC4"], mode="markers"), row=1, col=3)
fig.add_trace(go.Scatter(x=pca_components_df["PC2"], y=pca_components_df["PC3"], mode="markers"), row=2, col=2)
fig.add_trace(go.Scatter(x=pca_components_df["PC2"], y=pca_components_df["PC4"], mode="markers"), row=2, col=3)
fig.add_trace(go.Scatter(x=pca_components_df["PC3"], y=pca_components_df["PC4"], mode="markers"), row=3, col=3)
fig.show()


In [66]:
from sklearn import mixture
import plotly.subplots as subplots
import plotly.express as px
 
gmm = mixture.BayesianGaussianMixture(n_components=6, covariance_type='full').fit(pca_components)
gmm = gmm.predict(pca_components)

all_summary["Class"] = gmm
fig = px.scatter(all_summary, x="Strike Rate", y="Runs per Innings", color="Class", hover_name="Name")
fig.show()

fig = px.scatter(all_summary, x="Balls per Innings", y="Runs per Innings", color="Class", hover_name="Name")
fig.show()

fig = px.scatter(all_summary, x="Balls", y="Runs", color="Class", hover_name="Name")
fig.show()

In [63]:
from sklearn.decomposition import PCA
import plotly.subplots as subplots
import plotly.graph_objects as go

pca = PCA(n_components=4)
pca_components = pca.fit_transform(all_summary_normalised[["Career Span", "Runs", "Balls", "Balls per Innings", "Runs per Innings", "Strike Rate", "Innings Total"]])
pca_components_df = pd.DataFrame(data = pca_components, columns = ['PC1', 'PC2', 'PC3', 'PC4'])

fig = subplots.make_subplots(rows=3, cols=3)
fig.add_trace(go.Scatter(x=pca_components_df["PC1"], y=pca_components_df["PC2"], mode="markers"), row=1, col=1)
fig.add_trace(go.Scatter(x=pca_components_df["PC1"], y=pca_components_df["PC3"], mode="markers"), row=1, col=2)
fig.add_trace(go.Scatter(x=pca_components_df["PC1"], y=pca_components_df["PC4"], mode="markers"), row=1, col=3)
fig.add_trace(go.Scatter(x=pca_components_df["PC2"], y=pca_components_df["PC3"], mode="markers"), row=2, col=2)
fig.add_trace(go.Scatter(x=pca_components_df["PC2"], y=pca_components_df["PC4"], mode="markers"), row=2, col=3)
fig.add_trace(go.Scatter(x=pca_components_df["PC3"], y=pca_components_df["PC4"], mode="markers"), row=3, col=3)
fig.show()

In [64]:
from sklearn import mixture
import plotly.subplots as subplots
import plotly.express as px
 
gmm = mixture.BayesianGaussianMixture(n_components=5, covariance_type='full').fit(pca_components)
gmm = gmm.predict(pca_components)

all_summary["Class"] = gmm
fig = px.scatter(all_summary, x="Strike Rate", y="Runs per Innings", color="Class", hover_name="Name")
fig.show()

fig = px.scatter(all_summary, x="Balls per Innings", y="Runs per Innings", color="Class", hover_name="Name")
fig.show()

fig = px.scatter(all_summary, x="Balls", y="Runs", color="Class", hover_name="Name")
fig.show()

In [67]:
from sklearn import mixture
import plotly.subplots as subplots
import plotly.express as px
 
gmm = mixture.BayesianGaussianMixture(n_components=5, covariance_type='full').fit(all_summary_normalised[["Runs", "Balls per Innings", "Runs per Innings", "Strike Rate"]])
gmm = gmm.predict(all_summary_normalised[["Runs", "Balls per Innings", "Runs per Innings", "Strike Rate"]])

all_summary["Class"] = gmm
fig = px.scatter(all_summary, x="Strike Rate", y="Runs per Innings", color="Class", hover_name="Name")
fig.show()

fig = px.scatter(all_summary, x="Balls per Innings", y="Runs per Innings", color="Class", hover_name="Name")
fig.show()

fig = px.scatter(all_summary, x="Balls", y="Runs", color="Class", hover_name="Name")
fig.show()


In [127]:
from sklearn.decomposition import PCA
import plotly.subplots as subplots
import plotly.graph_objects as go

pca = PCA(n_components=2)
pca_components = pca.fit_transform(all_summary_normalised[["Balls per Innings", "Runs per Innings", "Strike Rate", "Hundreds per Innings", "Fifties per Innings", "Thirties per Innings", "Conversion Rate"]])
pca_components_df = pd.DataFrame(data = pca_components, columns = ['PC1', 'PC2'])

fig = subplots.make_subplots(rows=3, cols=3)
fig.add_trace(go.Scatter(x=pca_components_df["PC1"], y=pca_components_df["PC2"], mode="markers"), row=1, col=1)
'''fig.add_trace(go.Scatter(x=pca_components_df["PC1"], y=pca_components_df["PC3"], mode="markers"), row=1, col=2)
fig.add_trace(go.Scatter(x=pca_components_df["PC1"], y=pca_components_df["PC4"], mode="markers"), row=1, col=3)
fig.add_trace(go.Scatter(x=pca_components_df["PC2"], y=pca_components_df["PC3"], mode="markers"), row=2, col=2)
fig.add_trace(go.Scatter(x=pca_components_df["PC2"], y=pca_components_df["PC4"], mode="markers"), row=2, col=3)
fig.add_trace(go.Scatter(x=pca_components_df["PC3"], y=pca_components_df["PC4"], mode="markers"), row=3, col=3)'''
fig.show()

In [130]:
from sklearn import mixture
import plotly.subplots as subplots
import plotly.express as px
 
gmm = mixture.BayesianGaussianMixture(n_components=5, covariance_type='full').fit(all_summary_normalised[["Runs", "Balls", "Balls per Innings", "Runs per Innings", "Strike Rate", "Hundreds per Innings", "Fifties per Innings", "Thirties per Innings", "Conversion Rate"]])
gmm = gmm.predict(all_summary_normalised[["Runs", "Balls", "Balls per Innings", "Runs per Innings", "Strike Rate", "Hundreds per Innings", "Fifties per Innings", "Thirties per Innings", "Conversion Rate"]])

all_summary["Class"] = gmm
fig = px.scatter(all_summary, x="Strike Rate", y="Runs per Innings", color="Last Year", hover_name="Name")
fig.show()

fig = px.scatter(all_summary, x="Strike Rate", y="Runs per Innings", color="Class", hover_name="Name")
fig.show()

fig = px.scatter(all_summary, x="Balls per Innings", y="Runs per Innings", color="Class", hover_name="Name")
fig.show()

fig = px.scatter(all_summary, x="Balls", y="Runs", color="Class", hover_name="Name")
fig.show()
