# Identifying Turning Points in Cricket Matches Through In-Game Data Patterns

Working on the dataset:

In [1]:
# Importing libraries
import pandas as pd
import numpy as np
import os

In [2]:
# Checking our folder
folder_path = "t20s_male"
files = os.listdir(folder_path)
print("Total files:", len(files))
print("First 5 files:", files[:5])

Total files: 6264
First 5 files: ['1001349.csv', '1001349_info.csv', '1001351.csv', '1001351_info.csv', '1001353.csv']


In [3]:
# Checking for required files
match_files = [f for f in files if not f.endswith("_info.csv")]
print("Total match files:", len(match_files))
print("First 5 match files:", match_files[:5])

Total match files: 3132
First 5 match files: ['1001349.csv', '1001351.csv', '1001353.csv', '1004729.csv', '1007655.csv']


In [4]:
# Checking for columns in a random file
sample_file = match_files[0]
sample_path = os.path.join(folder_path, sample_file)
df_sample = pd.read_csv(sample_path)
print("Rows:", df_sample.shape[0])
print("Columns:", df_sample.shape[1])
df_sample.head()

Rows: 250
Columns: 22


Unnamed: 0,match_id,season,start_date,venue,innings,ball,batting_team,bowling_team,striker,non_striker,...,extras,wides,noballs,byes,legbyes,penalty,wicket_type,player_dismissed,other_wicket_type,other_player_dismissed
0,1001349,2016/17,2017-02-17,Melbourne Cricket Ground,1,0.1,Australia,Sri Lanka,AJ Finch,M Klinger,...,0,,,,,,,,,
1,1001349,2016/17,2017-02-17,Melbourne Cricket Ground,1,0.2,Australia,Sri Lanka,AJ Finch,M Klinger,...,0,,,,,,,,,
2,1001349,2016/17,2017-02-17,Melbourne Cricket Ground,1,0.3,Australia,Sri Lanka,AJ Finch,M Klinger,...,0,,,,,,,,,
3,1001349,2016/17,2017-02-17,Melbourne Cricket Ground,1,0.4,Australia,Sri Lanka,M Klinger,AJ Finch,...,0,,,,,,,,,
4,1001349,2016/17,2017-02-17,Melbourne Cricket Ground,1,0.5,Australia,Sri Lanka,M Klinger,AJ Finch,...,0,,,,,,,,,


In [5]:
df_sample.columns

Index(['match_id', 'season', 'start_date', 'venue', 'innings', 'ball',
       'batting_team', 'bowling_team', 'striker', 'non_striker', 'bowler',
       'runs_off_bat', 'extras', 'wides', 'noballs', 'byes', 'legbyes',
       'penalty', 'wicket_type', 'player_dismissed', 'other_wicket_type',
       'other_player_dismissed'],
      dtype='object')

In [6]:
df_sample.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250 entries, 0 to 249
Data columns (total 22 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   match_id                250 non-null    int64  
 1   season                  250 non-null    object 
 2   start_date              250 non-null    object 
 3   venue                   250 non-null    object 
 4   innings                 250 non-null    int64  
 5   ball                    250 non-null    float64
 6   batting_team            250 non-null    object 
 7   bowling_team            250 non-null    object 
 8   striker                 250 non-null    object 
 9   non_striker             250 non-null    object 
 10  bowler                  250 non-null    object 
 11  runs_off_bat            250 non-null    int64  
 12  extras                  250 non-null    int64  
 13  wides                   9 non-null      float64
 14  noballs                 1 non-null      fl

In [7]:
# Checking 5 sample files now
import random
sample_check_files = random.sample(match_files, 5)
for file in sample_check_files:
    df_temp = pd.read_csv(os.path.join(folder_path, file))
    print(file, "->", len(df_temp.columns))

1223953.csv -> 22
1321465.csv -> 22
355997.csv -> 22
1513091.csv -> 22
1423439.csv -> 22


In [8]:
# Checking for their columns
for file in sample_check_files:
    df_temp = pd.read_csv(os.path.join(folder_path, file))
    print(file)
    print(df_temp.columns.tolist())
    print("-----")

1223953.csv
['match_id', 'season', 'start_date', 'venue', 'innings', 'ball', 'batting_team', 'bowling_team', 'striker', 'non_striker', 'bowler', 'runs_off_bat', 'extras', 'wides', 'noballs', 'byes', 'legbyes', 'penalty', 'wicket_type', 'player_dismissed', 'other_wicket_type', 'other_player_dismissed']
-----
1321465.csv
['match_id', 'season', 'start_date', 'venue', 'innings', 'ball', 'batting_team', 'bowling_team', 'striker', 'non_striker', 'bowler', 'runs_off_bat', 'extras', 'wides', 'noballs', 'byes', 'legbyes', 'penalty', 'wicket_type', 'player_dismissed', 'other_wicket_type', 'other_player_dismissed']
-----
355997.csv
['match_id', 'season', 'start_date', 'venue', 'innings', 'ball', 'batting_team', 'bowling_team', 'striker', 'non_striker', 'bowler', 'runs_off_bat', 'extras', 'wides', 'noballs', 'byes', 'legbyes', 'penalty', 'wicket_type', 'player_dismissed', 'other_wicket_type', 'other_player_dismissed']
-----
1513091.csv
['match_id', 'season', 'start_date', 'venue', 'innings', 'ball

In [9]:
# Count of required files
files = os.listdir(folder_path)
match_files = [f for f in files if f.endswith(".csv") and not f.endswith("_info.csv")]
print("Total match files:", len(match_files))

Total match files: 3132


In [10]:
# Merge match files until reaching approximately 50,000 rows
all_data = []
row_count = 0
target_rows = 50000

for file in match_files:
    file_path = os.path.join(folder_path, file)
    df = pd.read_csv(file_path)
    
    all_data.append(df)
    row_count += len(df)
    
    if row_count >= target_rows:
        break

combined_df = pd.concat(all_data, ignore_index=True)

print("Final Shape:", combined_df.shape)

Final Shape: (50186, 22)


In [11]:
# Check missing values in dataset
combined_df.isnull().sum()

match_id                      0
season                        0
start_date                    0
venue                         0
innings                       0
ball                          0
batting_team                  0
bowling_team                  0
striker                       0
non_striker                   0
bowler                        0
runs_off_bat                  0
extras                        0
wides                     48537
noballs                   49991
byes                      50049
legbyes                   49413
penalty                   50186
wicket_type               47477
player_dismissed          47477
other_wicket_type         50186
other_player_dismissed    50186
dtype: int64

In [12]:
# Fill extra-related columns with 0 where null
extra_cols = ["wides", "noballs", "byes", "legbyes", "penalty"]
for col in extra_cols:
    combined_df[col] = combined_df[col].fillna(0)
combined_df[extra_cols].isnull().sum()

wides      0
noballs    0
byes       0
legbyes    0
penalty    0
dtype: int64

In [13]:
# Create binary wicket indicator column
combined_df["wicket"] = combined_df["wicket_type"].notna().astype(int)
combined_df[["wicket_type", "wicket"]].head(10)

Unnamed: 0,wicket_type,wicket
0,,0
1,,0
2,,0
3,,0
4,,0
5,,0
6,,0
7,,0
8,,0
9,,0


In [14]:
# Extract over number and ball number from ball column
combined_df["over_number"] = combined_df["ball"].astype(str).str.split(".").str[0].astype(int)
combined_df["ball_number"] = combined_df["ball"].astype(str).str.split(".").str[1].astype(int)
combined_df[["ball", "over_number", "ball_number"]].head()

Unnamed: 0,ball,over_number,ball_number
0,0.1,0,1
1,0.2,0,2
2,0.3,0,3
3,0.4,0,4
4,0.5,0,5


In [15]:
# Create continuous ball index inside innings
combined_df["ball_index"] = combined_df["over_number"] * 6 + combined_df["ball_number"]
combined_df[["ball", "ball_index"]].head()

Unnamed: 0,ball,ball_index
0,0.1,1
1,0.2,2
2,0.3,3
3,0.4,4
4,0.5,5


In [16]:
# Create total_runs column by adding runs_off_bat and extras
combined_df["total_runs"] = combined_df["runs_off_bat"] + combined_df["extras"]
combined_df[["runs_off_bat", "extras", "total_runs"]].head()

Unnamed: 0,runs_off_bat,extras,total_runs
0,0,0,0
1,0,0,0
2,1,0,1
3,2,0,2
4,0,0,0


In [17]:
print("total_runs" in combined_df.columns)

True


In [18]:
# Sort dataset by match, innings, and ball_index
combined_df = combined_df.sort_values(
    by=["match_id", "innings", "ball_index"]
).reset_index(drop=True)
combined_df.head()

Unnamed: 0,match_id,season,start_date,venue,innings,ball,batting_team,bowling_team,striker,non_striker,...,penalty,wicket_type,player_dismissed,other_wicket_type,other_player_dismissed,wicket,over_number,ball_number,ball_index,total_runs
0,1001349,2016/17,2017-02-17,Melbourne Cricket Ground,1,0.1,Australia,Sri Lanka,AJ Finch,M Klinger,...,0.0,,,,,0,0,1,1,0
1,1001349,2016/17,2017-02-17,Melbourne Cricket Ground,1,0.2,Australia,Sri Lanka,AJ Finch,M Klinger,...,0.0,,,,,0,0,2,2,0
2,1001349,2016/17,2017-02-17,Melbourne Cricket Ground,1,0.3,Australia,Sri Lanka,AJ Finch,M Klinger,...,0.0,,,,,0,0,3,3,1
3,1001349,2016/17,2017-02-17,Melbourne Cricket Ground,1,0.4,Australia,Sri Lanka,M Klinger,AJ Finch,...,0.0,,,,,0,0,4,4,2
4,1001349,2016/17,2017-02-17,Melbourne Cricket Ground,1,0.5,Australia,Sri Lanka,M Klinger,AJ Finch,...,0.0,,,,,0,0,5,5,0


In [19]:
# Create cumulative runs per match per innings
combined_df["cumulative_runs"] = combined_df.groupby(
    ["match_id", "innings"]
)["total_runs"].cumsum()
combined_df[[
    "match_id",
    "innings",
    "ball",
    "total_runs",
    "cumulative_runs"
]].head(10)

Unnamed: 0,match_id,innings,ball,total_runs,cumulative_runs
0,1001349,1,0.1,0,0
1,1001349,1,0.2,0,0
2,1001349,1,0.3,1,1
3,1001349,1,0.4,2,3
4,1001349,1,0.5,0,3
5,1001349,1,0.6,3,6
6,1001349,1,1.1,0,6
7,1001349,1,1.2,1,7
8,1001349,1,1.3,0,7
9,1001349,1,1.4,0,7


In [20]:
# Create cumulative wickets per match per innings
combined_df["cumulative_wickets"] = combined_df.groupby(
    ["match_id", "innings"]
)["wicket"].cumsum()
combined_df[[
    "match_id",
    "innings",
    "ball",
    "wicket",
    "cumulative_wickets"
]].head(15)

Unnamed: 0,match_id,innings,ball,wicket,cumulative_wickets
0,1001349,1,0.1,0,0
1,1001349,1,0.2,0,0
2,1001349,1,0.3,0,0
3,1001349,1,0.4,0,0
4,1001349,1,0.5,0,0
5,1001349,1,0.6,0,0
6,1001349,1,1.1,0,0
7,1001349,1,1.2,0,0
8,1001349,1,1.3,0,0
9,1001349,1,1.4,0,0


In [21]:
combined_df.columns

Index(['match_id', 'season', 'start_date', 'venue', 'innings', 'ball',
       'batting_team', 'bowling_team', 'striker', 'non_striker', 'bowler',
       'runs_off_bat', 'extras', 'wides', 'noballs', 'byes', 'legbyes',
       'penalty', 'wicket_type', 'player_dismissed', 'other_wicket_type',
       'other_player_dismissed', 'wicket', 'over_number', 'ball_number',
       'ball_index', 'total_runs', 'cumulative_runs', 'cumulative_wickets'],
      dtype='object')

In [22]:
combined_df = combined_df.sort_values(
    ["match_id", "innings", "ball"]
)

In [23]:
combined_df = combined_df.sort_values(
    ["match_id", "innings", "ball_index"]
).reset_index(drop=True)

In [24]:
combined_df["runs_last_6"] = combined_df.groupby(
    ["match_id", "innings"]
)["total_runs"].rolling(window=6, min_periods=1).sum().reset_index(level=[0,1], drop=True)

In [25]:
combined_df["wickets_last_6"] = combined_df.groupby(
    ["match_id", "innings"]
)["wicket"].rolling(window=6, min_periods=1).sum().reset_index(level=[0,1], drop=True)

In [26]:
combined_df[[
    "over_number",
    "ball_number",
    "total_runs",
    "runs_last_6",
    "wicket",
    "wickets_last_6"
]].head(15)


Unnamed: 0,over_number,ball_number,total_runs,runs_last_6,wicket,wickets_last_6
0,0,1,0,0.0,0,0.0
1,0,2,0,0.0,0,0.0
2,0,3,1,1.0,0,0.0
3,0,4,2,3.0,0,0.0
4,0,5,0,3.0,0,0.0
5,0,6,3,6.0,0,0.0
6,1,1,0,6.0,0,0.0
7,1,2,1,7.0,0,0.0
8,1,3,0,6.0,0,0.0
9,1,4,0,4.0,0,0.0


In [27]:
combined_df.columns

Index(['match_id', 'season', 'start_date', 'venue', 'innings', 'ball',
       'batting_team', 'bowling_team', 'striker', 'non_striker', 'bowler',
       'runs_off_bat', 'extras', 'wides', 'noballs', 'byes', 'legbyes',
       'penalty', 'wicket_type', 'player_dismissed', 'other_wicket_type',
       'other_player_dismissed', 'wicket', 'over_number', 'ball_number',
       'ball_index', 'total_runs', 'cumulative_runs', 'cumulative_wickets',
       'runs_last_6', 'wickets_last_6'],
      dtype='object')

In [28]:
combined_df["turning_point"] = (
    (combined_df["wickets_last_6"] >= 2) |
    (combined_df["runs_last_6"] >= 15)
).astype(int)

In [29]:
combined_df["turning_point"].value_counts(normalize=True)

turning_point
0    0.892819
1    0.107181
Name: proportion, dtype: float64

# Logistic Regression and Decision Tree

In [30]:
features = [
    "runs_last_6",
    "wickets_last_6",
    "over_number",
    "ball_number"
]
X = combined_df[features]
y = combined_df["turning_point"]

In [31]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [32]:
from sklearn.linear_model import LogisticRegression
lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(X_train, y_train)
lr_pred = lr_model.predict(X_test)

In [33]:
from sklearn.tree import DecisionTreeClassifier
dt_model = DecisionTreeClassifier(max_depth=5, random_state=42)
dt_model.fit(X_train, y_train)
dt_pred = dt_model.predict(X_test)

In [34]:
from sklearn.metrics import classification_report, accuracy_score
print("Logistic Regression")
print("Accuracy:", accuracy_score(y_test, lr_pred))
print(classification_report(y_test, lr_pred))
print("\nDecision Tree")
print("Accuracy:", accuracy_score(y_test, dt_pred))
print(classification_report(y_test, dt_pred))

Logistic Regression
Accuracy: 0.93335325762104
              precision    recall  f1-score   support

           0       0.95      0.98      0.96      8962
           1       0.75      0.57      0.65      1076

    accuracy                           0.93     10038
   macro avg       0.85      0.77      0.81     10038
weighted avg       0.93      0.93      0.93     10038


Decision Tree
Accuracy: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      8962
           1       1.00      1.00      1.00      1076

    accuracy                           1.00     10038
   macro avg       1.00      1.00      1.00     10038
weighted avg       1.00      1.00      1.00     10038



In [35]:
combined_df["balls_bowled"] = combined_df["ball_index"] + 1
combined_df["current_run_rate"] = (
    combined_df["cumulative_runs"] / (combined_df["balls_bowled"] / 6)
)
combined_df["wickets_in_hand"] = 10 - combined_df["cumulative_wickets"]

In [36]:
features = [
    "over_number",
    "ball_number",
    "cumulative_runs",
    "cumulative_wickets",
    "current_run_rate",
    "wickets_in_hand"
]

In [37]:
X = combined_df[features]
y = combined_df["turning_point"]

In [38]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

In [39]:
from sklearn.linear_model import LogisticRegression
lr_model = LogisticRegression(max_iter=1000, class_weight='balanced')
lr_model.fit(X_train, y_train)
lr_pred = lr_model.predict(X_test)

In [40]:
from sklearn.tree import DecisionTreeClassifier
dt_model = DecisionTreeClassifier(max_depth=5, random_state=42, class_weight='balanced')
dt_model.fit(X_train, y_train)
dt_pred = dt_model.predict(X_test)

In [41]:
from sklearn.metrics import classification_report, accuracy_score
print("Logistic Regression")
print("Accuracy:", accuracy_score(y_test, lr_pred))
print(classification_report(y_test, lr_pred))
print("\nDecision Tree")
print("Accuracy:", accuracy_score(y_test, dt_pred))
print(classification_report(y_test, dt_pred))

Logistic Regression
Accuracy: 0.6699541741382745
              precision    recall  f1-score   support

           0       0.95      0.67      0.78      8962
           1       0.20      0.68      0.31      1076

    accuracy                           0.67     10038
   macro avg       0.57      0.67      0.54     10038
weighted avg       0.87      0.67      0.73     10038


Decision Tree
Accuracy: 0.6707511456465431
              precision    recall  f1-score   support

           0       0.95      0.67      0.78      8962
           1       0.20      0.71      0.32      1076

    accuracy                           0.67     10038
   macro avg       0.58      0.69      0.55     10038
weighted avg       0.87      0.67      0.73     10038



In [42]:
import pickle
pickle.dump(dt_model, open("model.pkl", "wb"))