In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score

# Load Data
df = pd.read_csv("IPLStats2.csv")

# Preprocess - One-Hot Encoding
encoder = OneHotEncoder()
encoded_features = encoder.fit_transform(df[['Ground', 'Team A', 'Team B']]).toarray()
feature_names = encoder.get_feature_names_out(['Ground', 'Team A', 'Team B'])
X = pd.DataFrame(encoded_features, columns=feature_names)

# Target Variables
y_A = df['Six by A'] > 0  # Binary classification (Hit Six or Not)
y_B = df['Six by B'] > 0

# Train-Test Split
X_train, X_test, y_A_train, y_A_test = train_test_split(X, y_A, test_size=0.2, random_state=42)
X_train, X_test, y_B_train, y_B_test = train_test_split(X, y_B, test_size=0.2, random_state=42)

# Model Training
model_A = RandomForestClassifier(n_estimators=100, random_state=42)
model_B = RandomForestClassifier(n_estimators=100, random_state=42)
model_A.fit(X_train, y_A_train)
model_B.fit(X_train, y_B_train)

# Predictions
pred_A = model_A.predict_proba(X_test)[:,1]  # Probability of hitting a six
pred_B = model_B.predict_proba(X_test)[:,1]

# Evaluate Model
print("Accuracy for Team A:", accuracy_score(y_A_test, model_A.predict(X_test)))
print("Accuracy for Team B:", accuracy_score(y_B_test, model_B.predict(X_test)))


Accuracy for Team A: 0.5172413793103449
Accuracy for Team B: 0.6896551724137931


In [4]:
# Function to predict probability for new data
def predict_six_probability(ground, team_A, team_B):
    # Convert input into encoded form
    input_data = pd.DataFrame([[ground, team_A, team_B]], columns=['Ground', 'Team A', 'Team B'])
    input_encoded = encoder.transform(input_data).toarray()
    
    # Predict probabilities
    prob_A = model_A.predict_proba(input_encoded)[:, 1][0]
    prob_B = model_B.predict_proba(input_encoded)[:, 1][0]
    
    return prob_A, prob_B

# Custom Input
ground = "Vishakhapatnam"
team_A = "SRH"
team_B = "DC"

# Predict
prob_A, prob_B = predict_six_probability(ground, team_A, team_B)
print(f"Probability of {team_A} hitting sixes: {prob_A:.2f}")
print(f"Probability of {team_B} hitting sixes: {prob_B:.2f}")


Probability of GT hitting sixes: 0.03
Probability of MI hitting sixes: 0.05




In [21]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Load the dataset
df = pd.read_csv("IPLStats3.csv")  # Replace with actual dataset path

# Encode categorical features
encoder = OneHotEncoder(handle_unknown='ignore')
encoded_features = encoder.fit_transform(df[['Ground', 'Team A', 'Team B']])

# Prepare target variables
target_six_A = df['Six by A'] > 0  # Convert to binary: 1 if A hit sixes, else 0
target_six_B = df['Six by B'] > 0  # Convert to binary: 1 if B hit sixes, else 0
target_win_A = df['Result'] == df['Team A']  # 1 if Team A wins, else 0

# Split into train and test
X_train, X_test, y_train_A, y_test_A = train_test_split(encoded_features, target_six_A, test_size=0.2, random_state=42)
X_train, X_test, y_train_B, y_test_B = train_test_split(encoded_features, target_six_B, test_size=0.2, random_state=42)
X_train, X_test, y_train_Win, y_test_Win = train_test_split(encoded_features, target_win_A, test_size=0.2, random_state=42)

# Train models
model_A = RandomForestClassifier(n_estimators=100, random_state=42)
model_A.fit(X_train, y_train_A)

model_B = RandomForestClassifier(n_estimators=100, random_state=42)
model_B.fit(X_train, y_train_B)

model_Win = RandomForestClassifier(n_estimators=100, random_state=42)
model_Win.fit(X_train, y_train_Win)

# Function to predict probabilities
def predict_probabilities(ground, team_A, team_B):
    input_data = pd.DataFrame([[ground, team_A, team_B]], columns=['Ground', 'Team A', 'Team B'])
    input_encoded = encoder.transform(input_data).toarray()
    
    prob_A_six = model_A.predict_proba(input_encoded)[:, 1][0]
    prob_B_six = model_B.predict_proba(input_encoded)[:, 1][0]
    prob_A_win = model_Win.predict_proba(input_encoded)[:, 1][0]  # Probability of Team A winning
    prob_B_win = 1 - prob_A_win  # Probability of Team B winning

    return prob_A_six, prob_B_six, prob_A_win, prob_B_win

# Custom Input
ground = "Vishakhapatnam"
team_A = "DC"
team_B = "SRH"

# Predict
prob_A_six, prob_B_six, prob_A_win, prob_B_win = predict_probabilities(ground, team_A, team_B)
print(f"Probability of {team_A} hitting sixes: {prob_A_six:.2f}")
print(f"Probability of {team_B} hitting sixes: {prob_B_six:.2f}")
print(f"Probability of {team_A} winning: {prob_A_win:.2f}")
print(f"Probability of {team_B} winning: {prob_B_win:.2f}")


Probability of DC hitting sixes: 0.13
Probability of SRH hitting sixes: 0.20
Probability of DC winning: 0.86
Probability of SRH winning: 0.14


In [15]:
import pandas as pd
import numpy as np

# Load dataset (skip irrelevant rows, keep header)
df = pd.read_csv("deliveries.csv", skiprows=range(1, 193618))

# Add 'is_four' and 'is_six' columns
df['is_four'] = (df['batsman_runs'] == 4).astype(int)
df['is_six'] = (df['batsman_runs'] == 6).astype(int)

# BATSMAN ANALYSIS
batsman_stats = df.groupby('batter').agg({
    'batsman_runs': ['sum', 'mean', 'max'],  # Total runs, Average runs, Highest score
    'ball': 'count',  # Balls faced
    'match_id': 'nunique',  # Matches played
    'is_four': 'sum',  # Total 4s
    'is_six': 'sum'  # Total 6s
}).reset_index()

batsman_stats.columns = ['batsman', 'total_runs', 'avg_runs', 'high_score', 'balls_faced', 'matches', 'fours', 'sixes']
batsman_stats['strike_rate'] = (batsman_stats['total_runs'] / batsman_stats['balls_faced']) * 100

# Save Stats
batsman_stats.to_csv("batsman_stats.csv", index=False)

# 🏏 Search for a specific batsman's stats
batsman_name = input("Enter Batsman Name: ").strip()

# Filter the stats for the entered batsman
player_data = batsman_stats[batsman_stats['batsman'].str.lower() == batsman_name.lower()]

if not player_data.empty:
    print("\n📊 Batsman Stats:")
    print(player_data.to_string(index=False))
else:
    print("\n❌ Batsman not found in the dataset!")


Enter Batsman Name:  RG Sharma



📊 Batsman Stats:
  batsman  total_runs  avg_runs  high_score  balls_faced  matches  fours  sixes  strike_rate
RG Sharma        1398   1.28022           6         1092       57    141     67   128.021978


In [3]:
import pandas as pd
import numpy as np

# Load dataset
# Ensure your CSV has all necessary columns (match_id, bowler, total_runs, is_wicket, over, ball)
df = pd.read_csv("deliveries.csv", skiprows=range(1, 193618))

# Filter out extras (only runs conceded off the bat count)
df['bowler_runs'] = df['total_runs'] - df['extra_runs']

# Count sixes in first two overs
first_two_overs = df[df['over'] < 2]
bowler_sixes = first_two_overs[first_two_overs['batsman_runs'] == 6].groupby('bowler')['batsman_runs'].count().reset_index()
bowler_sixes.columns = ['bowler', 'sixes_in_first_two_overs']

# Aggregate bowler statistics
bowler_stats = df.groupby('bowler').agg(
    matches=('match_id', 'nunique'),  # Matches played
    innings=('match_id', 'count'),    # Innings bowled
    balls=('ball', 'count'),          # Balls bowled
    runs=('bowler_runs', 'sum'),      # Runs conceded
    wickets=('is_wicket', 'sum')      # Wickets taken
).reset_index()

# Calculate bowling averages, economy rates, and strike rates
bowler_stats['avg'] = bowler_stats['runs'] / bowler_stats['wickets']
bowler_stats['econ'] = (bowler_stats['runs'] / bowler_stats['balls']) * 6
bowler_stats['sr'] = bowler_stats['balls'] / bowler_stats['wickets']

# Best Bowling in an Innings (BBI) and Best Bowling in a Match (BBM)
bbi = df.groupby(['bowler', 'match_id']).agg(wickets=('is_wicket', 'sum')).reset_index()
bbi = bbi.groupby('bowler')['wickets'].max().reset_index()
bbi.columns = ['bowler', 'BBI']

bbm = df.groupby(['bowler', 'match_id']).agg(wickets=('is_wicket', 'sum')).groupby('bowler')['wickets'].max().reset_index()
bbm.columns = ['bowler', 'BBM']

# Merge all statistics
final_stats = bowler_stats.merge(bbi, on='bowler', how='left').merge(bbm, on='bowler', how='left').merge(bowler_sixes, on='bowler', how='left')

# Fill NaN values for cases where bowlers have no sixes conceded in first two overs
final_stats['sixes_in_first_two_overs'].fillna(0, inplace=True)

# Save Stats
final_stats.to_csv("bowler_stats.csv", index=False)

# 🏏 Search for a specific bowler's stats
bowler_name = input("Enter Bowler Name: ").strip()

# Filter the stats for the entered bowler
player_data = final_stats[final_stats['bowler'].str.lower() == bowler_name.lower()]

if not player_data.empty:
    print("\n📊 Bowler Stats:")
    print(player_data.to_string(index=False))
else:
    print("\n❌ Bowler not found in the dataset!")


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  final_stats['sixes_in_first_two_overs'].fillna(0, inplace=True)


Enter Bowler Name:  Sandeep Sharma



📊 Bowler Stats:
        bowler  matches  innings  balls  runs  wickets       avg     econ        sr  BBI  BBM  sixes_in_first_two_overs
Sandeep Sharma       34      797    797  1030       36 28.611111 7.754078 22.138889    5    5                       4.0


In [11]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Select Features & Target
X_bat = batsman_stats[['balls_faced', 'matches', 'strike_rate', 'fours', 'sixes']]
y_bat = batsman_stats['total_runs']

# Train Test Split
X_train, X_test, y_train, y_test = train_test_split(X_bat, y_bat, test_size=0.2, random_state=42)

# Train Model
bat_model = RandomForestRegressor(n_estimators=100, random_state=42)
bat_model.fit(X_train, y_train)

# Predictions
y_pred = bat_model.predict(X_test)

# Model Evaluation
mse = mean_squared_error(y_test, y_pred)
print(f"Batsman Prediction Model MSE: {mse}")

# Predict Runs for New Player
new_batsman = np.array([[50, 10, 130, 8, 4]])  # Example: 50 balls, 10 matches, 130 SR, 8 fours, 4 sixes
predicted_runs = bat_model.predict(new_batsman)
print(f"Predicted Runs: {predicted_runs[0]}")


NameError: name 'batsman_stats' is not defined

In [12]:
# Select Features & Target
X_bowl = bowler_stats[['balls_bowled', 'economy', 'bowling_avg']]
y_bowl = bowler_stats['wickets']

# Train Test Split
X_train, X_test, y_train, y_test = train_test_split(X_bowl, y_bowl, test_size=0.2, random_state=42)

# Train Model
bowl_model = RandomForestRegressor(n_estimators=100, random_state=42)
bowl_model.fit(X_train, y_train)

# Predictions
y_pred = bowl_model.predict(X_test)

# Model Evaluation
mse = mean_squared_error(y_test, y_pred)
print(f"Bowler Prediction Model MSE: {mse}")

# Predict Wickets for New Bowler
new_bowler = np.array([[60, 6.5, 25]])  # Example: 60 balls, 6.5 economy, 25 avg
predicted_wickets = bowl_model.predict(new_bowler)
print(f"Predicted Wickets: {predicted_wickets[0]}")


NameError: name 'bowler_stats' is not defined

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Load dataset
df = pd.read_csv("IPLStats3updcsv1.csv")  # Replace with actual dataset path

# Assigning weights based on row index
df['weight'] = 1.0  # Default weight
df.loc[75:145, 'weight'] = 1.5  # Medium importance
df.loc[146:, 'weight'] = 2.0  # Highest importance

# Encode categorical features
encoder = OneHotEncoder(handle_unknown='ignore')
encoded_features = encoder.fit_transform(df[['Ground', 'Team A', 'Team B']])

# Prepare target variables
target_six_A = df['Six by A'] > 0  # Binary: 1 if A hit sixes, else 0
target_six_B = df['Six by B'] > 0  # Binary: 1 if B hit sixes, else 0
target_win_A = df['Result'] == df['Team A']  # 1 if Team A wins, else 0

# Splitting into train and test sets (keeping weights)
X_train, X_test, y_train_A, y_test_A, w_train_A, w_test_A = train_test_split(
    encoded_features, target_six_A, df['weight'], test_size=0.2, random_state=42
)
X_train, X_test, y_train_B, y_test_B, w_train_B, w_test_B = train_test_split(
    encoded_features, target_six_B, df['weight'], test_size=0.2, random_state=42
)
X_train, X_test, y_train_Win, y_test_Win, w_train_Win, w_test_Win = train_test_split(
    encoded_features, target_win_A, df['weight'], test_size=0.2, random_state=42
)

# Train models using sample weights
model_A = RandomForestClassifier(n_estimators=100, random_state=42)
model_A.fit(X_train, y_train_A, sample_weight=w_train_A)

model_B = RandomForestClassifier(n_estimators=100, random_state=42)
model_B.fit(X_train, y_train_B, sample_weight=w_train_B)

model_Win = RandomForestClassifier(n_estimators=100, random_state=42)
model_Win.fit(X_train, y_train_Win, sample_weight=w_train_Win)

# Function to predict probabilities
def predict_probabilities(ground, team_A, team_B):
    input_data = pd.DataFrame([[ground, team_A, team_B]], columns=['Ground', 'Team A', 'Team B'])
    input_encoded = encoder.transform(input_data).toarray()
    
    prob_A_six = model_A.predict_proba(input_encoded)[:, 1][0]
    prob_B_six = model_B.predict_proba(input_encoded)[:, 1][0]
    prob_A_win = model_Win.predict_proba(input_encoded)[:, 1][0]  # Probability of Team A winning
    prob_B_win = 1 - prob_A_win  # Probability of Team B winning

    return prob_A_six, prob_B_six, prob_A_win, prob_B_win

# Custom Input
ground = "Lucknow"
team_A = "LSG"
team_B = "MI"

# Predict
prob_A_six, prob_B_six, prob_A_win, prob_B_win = predict_probabilities(ground, team_A, team_B)
print(f"Probability of {team_A} hitting sixes: {prob_A_six:.2f}")
print(f"Probability of {team_B} hitting sixes: {prob_B_six:.2f}")
print(f"Probability of {team_A} winning: {prob_A_win:.2f}")
print(f"Probability of {team_B} winning: {prob_B_win:.2f}")


Probability of LSG hitting sixes: 0.71
Probability of MI hitting sixes: 0.80
Probability of LSG winning: 0.78
Probability of MI winning: 0.22
