In [2]:
import numpy as np
import pandas as pd
import pickle
import json


In [3]:
df = pickle.load(open('dataset_lvl2.pkl', 'rb'))
df

Unnamed: 0,match_id,batting_team,bowling_team,ball,runs,player_dismissed,city,venue
0,2,Australia,Sri Lanka,0.1,0,0,,Melbourne Cricket Ground
1,2,Australia,Sri Lanka,0.2,0,0,,Melbourne Cricket Ground
2,2,Australia,Sri Lanka,0.3,1,0,,Melbourne Cricket Ground
3,2,Australia,Sri Lanka,0.4,2,0,,Melbourne Cricket Ground
4,2,Australia,Sri Lanka,0.5,0,0,,Melbourne Cricket Ground
...,...,...,...,...,...,...,...,...
173713,1433,Sri Lanka,Australia,19.3,1,0,Colombo,R Premadasa Stadium
173714,1433,Sri Lanka,Australia,19.4,0,0,Colombo,R Premadasa Stadium
173715,1433,Sri Lanka,Australia,19.5,0,DM de Silva,Colombo,R Premadasa Stadium
173716,1433,Sri Lanka,Australia,19.6,2,0,Colombo,R Premadasa Stadium


Building features
batting_team, bowling_team, city, current score, ball_left, wickets_left, current_rr, last_five_overs


In [4]:
df.isnull().sum()

match_id                0
batting_team            0
bowling_team            0
ball                    0
runs                    0
player_dismissed        0
city                21126
venue                   0
dtype: int64

In [5]:
df[df['city'].isnull()]['venue'].value_counts()

venue
Dubai International Cricket Stadium        4659
Sylhet Stadium                             3194
Harare Sports Club                         2852
Sharjah Cricket Stadium                    2111
Pallekele International Cricket Stadium    2066
Melbourne Cricket Ground                   1933
Galle International Stadium                1419
Sydney Cricket Ground                       871
Adelaide Oval                               621
Guanggong International Cricket Stadium     483
Rawalpindi Cricket Stadium                  368
Colombo Cricket Club Ground                 235
Sylhet International Cricket Stadium        128
Arundel Castle Cricket Club Ground          122
Carrara Oval                                 64
Name: count, dtype: int64

In [6]:
# removing null values

if df['city'].isnull().any():
    df['city'] = df['city'].fillna(df['venue'].apply(lambda x: x.split()[0]))
df

Unnamed: 0,match_id,batting_team,bowling_team,ball,runs,player_dismissed,city,venue
0,2,Australia,Sri Lanka,0.1,0,0,Melbourne,Melbourne Cricket Ground
1,2,Australia,Sri Lanka,0.2,0,0,Melbourne,Melbourne Cricket Ground
2,2,Australia,Sri Lanka,0.3,1,0,Melbourne,Melbourne Cricket Ground
3,2,Australia,Sri Lanka,0.4,2,0,Melbourne,Melbourne Cricket Ground
4,2,Australia,Sri Lanka,0.5,0,0,Melbourne,Melbourne Cricket Ground
...,...,...,...,...,...,...,...,...
173713,1433,Sri Lanka,Australia,19.3,1,0,Colombo,R Premadasa Stadium
173714,1433,Sri Lanka,Australia,19.4,0,0,Colombo,R Premadasa Stadium
173715,1433,Sri Lanka,Australia,19.5,0,DM de Silva,Colombo,R Premadasa Stadium
173716,1433,Sri Lanka,Australia,19.6,2,0,Colombo,R Premadasa Stadium


In [7]:
df.isnull().sum()

match_id            0
batting_team        0
bowling_team        0
ball                0
runs                0
player_dismissed    0
city                0
venue               0
dtype: int64

In [8]:
df.drop('venue', axis=1, inplace=True)
df

Unnamed: 0,match_id,batting_team,bowling_team,ball,runs,player_dismissed,city
0,2,Australia,Sri Lanka,0.1,0,0,Melbourne
1,2,Australia,Sri Lanka,0.2,0,0,Melbourne
2,2,Australia,Sri Lanka,0.3,1,0,Melbourne
3,2,Australia,Sri Lanka,0.4,2,0,Melbourne
4,2,Australia,Sri Lanka,0.5,0,0,Melbourne
...,...,...,...,...,...,...,...
173713,1433,Sri Lanka,Australia,19.3,1,0,Colombo
173714,1433,Sri Lanka,Australia,19.4,0,0,Colombo
173715,1433,Sri Lanka,Australia,19.5,0,DM de Silva,Colombo
173716,1433,Sri Lanka,Australia,19.6,2,0,Colombo


In [9]:
# calculating current score according to match_id

df['runs'] = pd.to_numeric(df['runs'], errors='coerce')
df['current_score'] = df.groupby('match_id')['runs'].cumsum()

In [10]:
df

Unnamed: 0,match_id,batting_team,bowling_team,ball,runs,player_dismissed,city,current_score
0,2,Australia,Sri Lanka,0.1,0,0,Melbourne,0
1,2,Australia,Sri Lanka,0.2,0,0,Melbourne,0
2,2,Australia,Sri Lanka,0.3,1,0,Melbourne,1
3,2,Australia,Sri Lanka,0.4,2,0,Melbourne,3
4,2,Australia,Sri Lanka,0.5,0,0,Melbourne,3
...,...,...,...,...,...,...,...,...
173713,1433,Sri Lanka,Australia,19.3,1,0,Colombo,125
173714,1433,Sri Lanka,Australia,19.4,0,0,Colombo,125
173715,1433,Sri Lanka,Australia,19.5,0,DM de Silva,Colombo,125
173716,1433,Sri Lanka,Australia,19.6,2,0,Colombo,127


In [11]:
# calculating balls left
# Extract over and ball number in one step
df[['over', 'ball_no']] = df['ball'].astype(str).str.split(".", expand=True)

# Calculate balls bowled
df['balls_bowled'] = df['over'].astype(int) * 6 + df['ball_no'].astype(int)

# Calculate balls left
df['balls_left'] = df['balls_bowled'].apply(lambda x: 0 if 120 - x < 0 else 120 - x)
df

Unnamed: 0,match_id,batting_team,bowling_team,ball,runs,player_dismissed,city,current_score,over,ball_no,balls_bowled,balls_left
0,2,Australia,Sri Lanka,0.1,0,0,Melbourne,0,0,1,1,119
1,2,Australia,Sri Lanka,0.2,0,0,Melbourne,0,0,2,2,118
2,2,Australia,Sri Lanka,0.3,1,0,Melbourne,1,0,3,3,117
3,2,Australia,Sri Lanka,0.4,2,0,Melbourne,3,0,4,4,116
4,2,Australia,Sri Lanka,0.5,0,0,Melbourne,3,0,5,5,115
...,...,...,...,...,...,...,...,...,...,...,...,...
173713,1433,Sri Lanka,Australia,19.3,1,0,Colombo,125,19,3,117,3
173714,1433,Sri Lanka,Australia,19.4,0,0,Colombo,125,19,4,118,2
173715,1433,Sri Lanka,Australia,19.5,0,DM de Silva,Colombo,125,19,5,119,1
173716,1433,Sri Lanka,Australia,19.6,2,0,Colombo,127,19,6,120,0


In [12]:
# Convert 'player_dismissed' to 1 if not '0', otherwise 0 and confirm it integer
df['player_dismissed'] = df['player_dismissed'].apply(lambda x: 0 if x == '0' else 1).astype(int)

# Calculate cumulative dismissals within each 'match_id'
df['player_dismissed'] = df.groupby('match_id')['player_dismissed'].cumsum()

# Now you can calculate the wickets left if needed
df['wickets_left'] = 10 - df['player_dismissed']
df

Unnamed: 0,match_id,batting_team,bowling_team,ball,runs,player_dismissed,city,current_score,over,ball_no,balls_bowled,balls_left,wickets_left
0,2,Australia,Sri Lanka,0.1,0,0,Melbourne,0,0,1,1,119,10
1,2,Australia,Sri Lanka,0.2,0,0,Melbourne,0,0,2,2,118,10
2,2,Australia,Sri Lanka,0.3,1,0,Melbourne,1,0,3,3,117,10
3,2,Australia,Sri Lanka,0.4,2,0,Melbourne,3,0,4,4,116,10
4,2,Australia,Sri Lanka,0.5,0,0,Melbourne,3,0,5,5,115,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...
173713,1433,Sri Lanka,Australia,19.3,1,8,Colombo,125,19,3,117,3,2
173714,1433,Sri Lanka,Australia,19.4,0,8,Colombo,125,19,4,118,2,2
173715,1433,Sri Lanka,Australia,19.5,0,9,Colombo,125,19,5,119,1,1
173716,1433,Sri Lanka,Australia,19.6,2,9,Colombo,127,19,6,120,0,1


In [13]:
# current run rate
df['current_rr'] = (df['current_score']*6)/df['balls_bowled']
df

Unnamed: 0,match_id,batting_team,bowling_team,ball,runs,player_dismissed,city,current_score,over,ball_no,balls_bowled,balls_left,wickets_left,current_rr
0,2,Australia,Sri Lanka,0.1,0,0,Melbourne,0,0,1,1,119,10,0.000000
1,2,Australia,Sri Lanka,0.2,0,0,Melbourne,0,0,2,2,118,10,0.000000
2,2,Australia,Sri Lanka,0.3,1,0,Melbourne,1,0,3,3,117,10,2.000000
3,2,Australia,Sri Lanka,0.4,2,0,Melbourne,3,0,4,4,116,10,4.500000
4,2,Australia,Sri Lanka,0.5,0,0,Melbourne,3,0,5,5,115,10,3.600000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
173713,1433,Sri Lanka,Australia,19.3,1,8,Colombo,125,19,3,117,3,2,6.410256
173714,1433,Sri Lanka,Australia,19.4,0,8,Colombo,125,19,4,118,2,2,6.355932
173715,1433,Sri Lanka,Australia,19.5,0,9,Colombo,125,19,5,119,1,1,6.302521
173716,1433,Sri Lanka,Australia,19.6,2,9,Colombo,127,19,6,120,0,1,6.350000


In [14]:
# Calculate the rolling sum of runs over the last 30 balls for each match
df['last_five'] = df.groupby('match_id')['runs'].rolling(window=30).sum().reset_index(drop=True).tolist()
df

Unnamed: 0,match_id,batting_team,bowling_team,ball,runs,player_dismissed,city,current_score,over,ball_no,balls_bowled,balls_left,wickets_left,current_rr,last_five
0,2,Australia,Sri Lanka,0.1,0,0,Melbourne,0,0,1,1,119,10,0.000000,
1,2,Australia,Sri Lanka,0.2,0,0,Melbourne,0,0,2,2,118,10,0.000000,
2,2,Australia,Sri Lanka,0.3,1,0,Melbourne,1,0,3,3,117,10,2.000000,
3,2,Australia,Sri Lanka,0.4,2,0,Melbourne,3,0,4,4,116,10,4.500000,
4,2,Australia,Sri Lanka,0.5,0,0,Melbourne,3,0,5,5,115,10,3.600000,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
173713,1433,Sri Lanka,Australia,19.3,1,8,Colombo,125,19,3,117,3,2,6.410256,32.0
173714,1433,Sri Lanka,Australia,19.4,0,8,Colombo,125,19,4,118,2,2,6.355932,32.0
173715,1433,Sri Lanka,Australia,19.5,0,9,Colombo,125,19,5,119,1,1,6.302521,32.0
173716,1433,Sri Lanka,Australia,19.6,2,9,Colombo,127,19,6,120,0,1,6.350000,33.0


In [15]:
# calculate total sum in one match
final_df = df.groupby('match_id').sum()['runs'].reset_index().merge(df,on='match_id')

In [16]:
final_df = final_df[['batting_team','bowling_team','city','current_score','balls_left','wickets_left','current_rr','last_five','runs_x']]
final_df

Unnamed: 0,batting_team,bowling_team,city,current_score,balls_left,wickets_left,current_rr,last_five,runs_x
0,Australia,Sri Lanka,Melbourne,0,119,10,0.000000,,168
1,Australia,Sri Lanka,Melbourne,0,118,10,0.000000,,168
2,Australia,Sri Lanka,Melbourne,1,117,10,2.000000,,168
3,Australia,Sri Lanka,Melbourne,3,116,10,4.500000,,168
4,Australia,Sri Lanka,Melbourne,3,115,10,3.600000,,168
...,...,...,...,...,...,...,...,...,...
128008,Sri Lanka,Australia,Colombo,125,3,2,6.410256,32.0,128
128009,Sri Lanka,Australia,Colombo,125,2,2,6.355932,32.0,128
128010,Sri Lanka,Australia,Colombo,125,1,1,6.302521,32.0,128
128011,Sri Lanka,Australia,Colombo,127,0,1,6.350000,33.0,128


In [17]:
final_df.dropna(inplace=True)

In [18]:
final_df.isnull().sum()

batting_team     0
bowling_team     0
city             0
current_score    0
balls_left       0
wickets_left     0
current_rr       0
last_five        0
runs_x           0
dtype: int64

In [19]:
# shuffle the values
final_df = final_df.sample(frac=1).reset_index(drop=True)
final_df

Unnamed: 0,batting_team,bowling_team,city,current_score,balls_left,wickets_left,current_rr,last_five,runs_x
0,India,Australia,St Lucia,27,82,9,4.263158,21.0,119
1,New Zealand,Ireland,Chandigarh,64,75,9,8.533333,42.0,177
2,Australia,Bangladesh,Canberra,127,31,10,8.561798,33.0,189
3,New Zealand,Australia,Victoria,41,83,9,6.648649,32.0,101
4,Sri Lanka,Pakistan,Lahore,80,53,6,7.164179,36.0,147
...,...,...,...,...,...,...,...,...,...
97453,England,Pakistan,Kuala Lumpur,70,65,9,7.636364,44.0,154
97454,Ireland,Australia,Guyana,35,60,7,3.500000,10.0,93
97455,South Africa,New Zealand,Port Elizabeth,95,41,8,7.215190,45.0,179
97456,India,New Zealand,Delhi,139,32,10,9.477273,61.0,202


In [20]:
unique_cities = final_df['city'].unique().tolist()
playing_teams = final_df['batting_team'].unique().tolist()

In [21]:
cities_data = {"cities": unique_cities}
teams_data = {"playing_teams": playing_teams}

In [22]:
cities_file_path = r'C:\Users\soham\OneDrive\Desktop\Projects\Score predictor\Frontend\cities.json'
teams_file_path = r'C:\Users\soham\OneDrive\Desktop\Projects\Score predictor\Frontend\playing_teams.json'

# Save to JSON files
with open(cities_file_path, 'w') as cities_file:
    json.dump(cities_data, cities_file, indent=4)

with open(teams_file_path, 'w') as teams_file:
    json.dump(teams_data, teams_file, indent=4)

FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\soham\\OneDrive\\Desktop\\Projects\\Score predictor\\Frontend\\cities.json'

Train test split

In [23]:
X = final_df.drop(columns=['runs_x'])
y = final_df['runs_x']
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=1)

In [24]:
X_train

Unnamed: 0,batting_team,bowling_team,city,current_score,balls_left,wickets_left,current_rr,last_five
36956,Sri Lanka,Pakistan,Abu Dhabi,89,39,9,6.592593,36.0
47888,Sri Lanka,Pakistan,Abu Dhabi,34,90,8,6.800000,34.0
29665,Pakistan,Scotland,Durban,39,83,8,6.324324,30.0
75143,Australia,England,Melbourne,27,90,8,5.400000,27.0
93355,India,Bangladesh,Nottingham,90,51,9,7.826087,35.0
...,...,...,...,...,...,...,...,...
21440,Pakistan,South Africa,St Lucia,115,27,6,7.419355,58.0
73349,Sri Lanka,India,Mirpur,35,72,7,4.375000,21.0
50057,Netherlands,United Arab Emirates,Amstelveen,96,36,6,6.857143,33.0
5192,Netherlands,Ireland,Al Amarat,53,85,9,9.085714,42.0


In [25]:
X_test

Unnamed: 0,batting_team,bowling_team,city,current_score,balls_left,wickets_left,current_rr,last_five
53293,New Zealand,Australia,Auckland,154,45,9,12.320000,73.0
45776,Sri Lanka,South Africa,Cape Town,28,78,9,4.000000,20.0
79520,Pakistan,England,Cardiff,169,1,4,8.521008,37.0
56565,Zimbabwe,Afghanistan,Bulawayo,100,47,10,8.219178,36.0
40309,England,Pakistan,Loughborough,99,35,8,6.988235,26.0
...,...,...,...,...,...,...,...,...
17785,Pakistan,Netherlands,London,109,37,7,7.879518,34.0
48670,Netherlands,United Arab Emirates,Amstelveen,52,74,8,6.782609,33.0
33697,India,Pakistan,Delhi,63,26,6,4.021277,30.0
32840,Bangladesh,United Arab Emirates,Mirpur,102,22,6,6.244898,25.0


In [26]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_absolute_error

# Column transformer to handle categorical data
trf = ColumnTransformer([
    ('trf', OneHotEncoder(sparse_output=False, drop='first'), ['batting_team', 'bowling_team', 'city'])
], remainder='passthrough')

# Pipeline with RandomForestRegressor
pipe = Pipeline(steps=[
    ('step1', trf),
    ('step2', StandardScaler()),
    ('step3', RandomForestRegressor(n_estimators=100, random_state=1))
])

# Fit the pipeline on the training data
pipe.fit(X_train, y_train)

# Predict on the test data
y_pred = pipe.predict(X_test)

# Print evaluation metrics
print(f"R-squared: {r2_score(y_test, y_pred)}")
print(f"Mean Absolute Error: {mean_absolute_error(y_test, y_pred)}")


R-squared: 0.9781594480929451
Mean Absolute Error: 2.5199088392796067


In [27]:
pickle.dump(pipe,open('model.pkl','wb'))

In [29]:
pipe = pickle.load(open('model.pkl', 'rb'))


In [50]:
batting_team = 'India'
bowling_team = 'England'
city = 'Nagpur'
current_score = 108
overs = 15
wickets_left = 7
last_five = 36  

In [51]:
over, ball_no = divmod(overs * 10, 10)
balls_bowled = int(over) * 6 + int(ball_no)
balls_left = 120 - balls_bowled

In [52]:
current_rr = (current_score * 6) / balls_bowled

In [53]:
# Create a DataFrame with the input
input_df = pd.DataFrame({
    'batting_team': [batting_team],
    'bowling_team': [bowling_team],
    'city': [city],
    'current_score': [current_score],
    'balls_left': [balls_left],
    'wickets_left': [wickets_left],
    'current_rr': [current_rr],
    'last_five': [last_five]
})

# Use the model to predict
predicted_runs = pipe.predict(input_df)

# Output the prediction
print(f"Predicted final score: {predicted_runs[0]:.2f}")

Predicted final score: 144.63
