In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder

# Load the datasets
matches = pd.read_csv('IPL_Matches_2008_2022.csv')
ball_by_ball = pd.read_csv('IPL_Ball_by_Ball_2008_2022.csv')

# Aggregate player statistics for batters and bowlers
batter_stats = ball_by_ball.groupby('batter').agg({
    'batsman_run': 'sum',
    'isWicketDelivery': 'sum',
    'ballnumber': 'count'
}).reset_index()
batter_stats.columns = ['player', 'total_runs', 'total_wickets', 'balls_faced']

bowler_stats = ball_by_ball.groupby('bowler').agg({
    'isWicketDelivery': 'sum',
    'ballnumber': 'count',
    'total_run': 'sum'
}).reset_index()
bowler_stats.columns = ['player', 'total_wickets', 'balls_bowled', 'runs_conceded']

# Merge batter and bowler stats
player_stats = pd.merge(batter_stats, bowler_stats, on='player', how='outer').fillna(0)

# Add additional statistics if available, such as strike rate, economy rate, etc.
player_stats['strike_rate'] = player_stats['total_runs'] / player_stats['balls_faced'] * 100
player_stats['economy_rate'] = player_stats['runs_conceded'] / player_stats['balls_bowled'] * 6

# Prepare the comparison dataset
comparison_data = []
for idx, row in matches.iterrows():
    team1_players = row['Team1Players'].strip('][').split(', ')
    team2_players = row['Team2Players'].strip('][').split(', ')
    Venue = row['Venue']
    opposing_team = row['Team2'] if row['Team1'] == row['Team1'] else row['Team1']  # Determine opposing team based on team1 and team2 columns

    for player1 in team1_players:
        for player2 in team2_players:
            player1_stats = player_stats[player_stats['player'] == player1]
            player2_stats = player_stats[player_stats['player'] == player2]
            
            if not player1_stats.empty and not player2_stats.empty:
                comparison_data.append({
                    'player1': player1,
                    'player2': player2,
                    'Venue': Venue,
                    'opposing_team': opposing_team,
                    'player1_total_runs': player1_stats['total_runs'].values[0],
                    'player2_total_runs': player2_stats['total_runs'].values[0],
                    'player1_total_wickets': player1_stats['total_wickets'].values[0],
                    'player2_total_wickets': player2_stats['total_wickets'].values[0],
                    'player1_balls_faced': player1_stats['balls_faced'].values[0],
                    'player2_balls_faced': player2_stats['balls_faced'].values[0],
                    'player1_balls_bowled': player1_stats['balls_bowled'].values[0],
                    'player2_balls_bowled': player2_stats['balls_bowled'].values[0],
                    'target': 1 if player1_stats['total_runs'].values[0] > player2_stats['total_runs'].values[0] else 0  # Hypothetical target
                })

comparison_df = pd.DataFrame(comparison_data)

# One-hot encode categorical features
encoder = OneHotEncoder(drop='first', sparse_output=False)
encoded_features = encoder.fit_transform(comparison_df[['Venue', 'opposing_team']])
encoded_df = pd.DataFrame(encoded_features, columns=encoder.get_feature_names_out(['Venue', 'opposing_team']))

# Combine the encoded features with the rest of the dataset
comparison_df = comparison_df.drop(columns=['Venue', 'opposing_team'])
comparison_df = pd.concat([comparison_df, encoded_df], axis=1)

# Prepare features and labels
X = comparison_df.drop(columns=['player1', 'player2', 'target'])
y = comparison_df['target']

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Evaluate the model
y_pred = model.predict(X_test)
print(f'Accuracy: {accuracy_score(y_test, y_pred)}')
print(classification_report(y_test, y_pred))


#predicting function for team comparision
def compare_teams(team1, team2, Venue, opposing_team):
    # Create feature DataFrame for all players
    comparison_features = []
    for player in team1 + team2:
        player_stats = player_stats[player_stats['player'] == player]
        if not player_stats.empty:
            comparison_features.append({
                'player_total_runs': player_stats['total_runs'].values[0],
                'player_total_wickets': player_stats['total_wickets'].values[0],
                'player_balls_faced': player_stats['balls_faced'].values[0],
                'player_balls_bowled': player_stats['balls_bowled'].values[0],
                'Venue': Venue,
                'opposing_team': opposing_team
            })

    comparison_df = pd.DataFrame(comparison_features)

    # One-hot encode Venue and opposing team
    encoded_features = encoder.transform(comparison_df[['Venue', 'opposing_team']])
    encoded_df = pd.DataFrame(encoded_features, columns=encoder.get_feature_names_out(['Venue', 'opposing_team']))

    # Combine the encoded features with the rest of the features
    comparison_df = comparison_df.drop(columns=['Venue', 'opposing_team'])
    comparison_df = pd.concat([comparison_df, encoded_df], axis=1)

    # Ensure columns match those used in training
    for col in X_train.columns:
        if col not in comparison_df.columns:
            comparison_df[col] = 0

    # Make predictions
    predictions = model.predict_proba(comparison_df)[:, 1]  # Probability of performing better
    comparison_df['player'] = team1 + team2
    comparison_df['performance_score'] = predictions

    # Rank players based on performance score
    ranked_players = comparison_df.sort_values(by='performance_score', ascending=False)

    # Select best 11 players
    best_11 = ranked_players.head(11)

    return best_11[['player', 'performance_score']]

# Example usage
team1 = ['Ishan Kishan', 'RG Sharma', 'SA Yadav', 'Tilak Varma', 'KA Pollard', 'TH David', 'DR Sams', 'M Ashwin', 'K Kartikeya', 'JJ Bumrah', 'RP Meredith']
team2 = ['PK Garg', 'Abhishek Sharma', 'RA Tripathi', 'AK Markram', 'N Pooran', 'Washington Sundar', 'R Shepherd', 'J Suchith', 'B Kumar', 'Umran Malik', 'Fazalhaq Farooqi']
Venue = 'MA Chidambaram Stadium, Chepauk'
opposing_team = 'Chennai Super Kings'

best_11_players = compare_teams(team1, team2, Venue, opposing_team)
print(best_11_players)


In [6]:
matches = pd.read_csv('IPL_Matches_2008_2022.csv')
ball_by_ball = pd.read_csv('IPL_Ball_by_Ball_2008_2022.csv')



Unnamed: 0,ID,innings,overs,ballnumber,batter,bowler,non-striker,extra_type,batsman_run,extras_run,total_run,non_boundary,isWicketDelivery,player_out,kind,fielders_involved,BattingTeam
0,1312200,1,0,1,YBK Jaiswal,Mohammed Shami,JC Buttler,,0,0,0,0,0,,,,Rajasthan Royals
1,1312200,1,0,2,YBK Jaiswal,Mohammed Shami,JC Buttler,legbyes,0,1,1,0,0,,,,Rajasthan Royals
2,1312200,1,0,3,JC Buttler,Mohammed Shami,YBK Jaiswal,,1,0,1,0,0,,,,Rajasthan Royals
3,1312200,1,0,4,YBK Jaiswal,Mohammed Shami,JC Buttler,,0,0,0,0,0,,,,Rajasthan Royals
4,1312200,1,0,5,YBK Jaiswal,Mohammed Shami,JC Buttler,,0,0,0,0,0,,,,Rajasthan Royals


In [10]:
list(matches.columns)

['ID',
 'City',
 'Date',
 'Season',
 'MatchNumber',
 'Team1',
 'Team2',
 'Venue',
 'TossWinner',
 'TossDecision',
 'SuperOver',
 'WinningTeam',
 'WonBy',
 'Margin',
 'method',
 'Player_of_Match',
 'Team1Players',
 'Team2Players',
 'Umpire1',
 'Umpire2']

In [7]:
matches.head()

Unnamed: 0,ID,City,Date,Season,MatchNumber,Team1,Team2,Venue,TossWinner,TossDecision,SuperOver,WinningTeam,WonBy,Margin,method,Player_of_Match,Team1Players,Team2Players,Umpire1,Umpire2
0,1312200,Ahmedabad,2022-05-29,2022,Final,Rajasthan Royals,Gujarat Titans,"Narendra Modi Stadium, Ahmedabad",Rajasthan Royals,bat,N,Gujarat Titans,Wickets,7.0,,HH Pandya,"['YBK Jaiswal', 'JC Buttler', 'SV Samson', 'D ...","['WP Saha', 'Shubman Gill', 'MS Wade', 'HH Pan...",CB Gaffaney,Nitin Menon
1,1312199,Ahmedabad,2022-05-27,2022,Qualifier 2,Royal Challengers Bangalore,Rajasthan Royals,"Narendra Modi Stadium, Ahmedabad",Rajasthan Royals,field,N,Rajasthan Royals,Wickets,7.0,,JC Buttler,"['V Kohli', 'F du Plessis', 'RM Patidar', 'GJ ...","['YBK Jaiswal', 'JC Buttler', 'SV Samson', 'D ...",CB Gaffaney,Nitin Menon
2,1312198,Kolkata,2022-05-25,2022,Eliminator,Royal Challengers Bangalore,Lucknow Super Giants,"Eden Gardens, Kolkata",Lucknow Super Giants,field,N,Royal Challengers Bangalore,Runs,14.0,,RM Patidar,"['V Kohli', 'F du Plessis', 'RM Patidar', 'GJ ...","['Q de Kock', 'KL Rahul', 'M Vohra', 'DJ Hooda...",J Madanagopal,MA Gough
3,1312197,Kolkata,2022-05-24,2022,Qualifier 1,Rajasthan Royals,Gujarat Titans,"Eden Gardens, Kolkata",Gujarat Titans,field,N,Gujarat Titans,Wickets,7.0,,DA Miller,"['YBK Jaiswal', 'JC Buttler', 'SV Samson', 'D ...","['WP Saha', 'Shubman Gill', 'MS Wade', 'HH Pan...",BNJ Oxenford,VK Sharma
4,1304116,Mumbai,2022-05-22,2022,70,Sunrisers Hyderabad,Punjab Kings,"Wankhede Stadium, Mumbai",Sunrisers Hyderabad,bat,N,Punjab Kings,Wickets,5.0,,Harpreet Brar,"['PK Garg', 'Abhishek Sharma', 'RA Tripathi', ...","['JM Bairstow', 'S Dhawan', 'M Shahrukh Khan',...",AK Chaudhary,NA Patwardhan


In [15]:
import pandas as pd

# Load the datasets
matches = pd.read_csv('IPL_Matches_2008_2022.csv')
ball_by_ball = pd.read_csv('IPL_Ball_by_Ball_2008_2022.csv')

# Select required columns from ball_by_ball dataset
ball_by_ball_filtered = ball_by_ball[['ID', 'innings', 'overs', 'batter', 'bowler', 'batsman_run', 'total_run', 'non_boundary', 'isWicketDelivery', 'player_out']]

# Select required columns from matches dataset
matches_filtered = matches[['ID', 'Season', 'Team1', 'Team2', 'Player_of_Match', 'Team1Players', 'Team2Players']]

# Merge the datasets on 'ID'
merged_dataset = pd.merge(ball_by_ball_filtered, matches_filtered, on='ID')

# Save the new dataset to a CSV file
merged_dataset.to_csv('new_dataset.csv', index=False)

print("New dataset created and saved as 'new_dataset.csv'")


New dataset created and saved as 'new_dataset.csv'


In [16]:
perf = pd.read_csv('new_dataset.csv')
perf.head()

  perf = pd.read_csv('new_dataset.csv')


Unnamed: 0,ID,innings,overs,batter,bowler,batsman_run,total_run,non_boundary,isWicketDelivery,player_out,Season,Team1,Team2,Player_of_Match,Team1Players,Team2Players
0,1312200,1,0,YBK Jaiswal,Mohammed Shami,0,0,0,0,,2022,Rajasthan Royals,Gujarat Titans,HH Pandya,"['YBK Jaiswal', 'JC Buttler', 'SV Samson', 'D ...","['WP Saha', 'Shubman Gill', 'MS Wade', 'HH Pan..."
1,1312200,1,0,YBK Jaiswal,Mohammed Shami,0,1,0,0,,2022,Rajasthan Royals,Gujarat Titans,HH Pandya,"['YBK Jaiswal', 'JC Buttler', 'SV Samson', 'D ...","['WP Saha', 'Shubman Gill', 'MS Wade', 'HH Pan..."
2,1312200,1,0,JC Buttler,Mohammed Shami,1,1,0,0,,2022,Rajasthan Royals,Gujarat Titans,HH Pandya,"['YBK Jaiswal', 'JC Buttler', 'SV Samson', 'D ...","['WP Saha', 'Shubman Gill', 'MS Wade', 'HH Pan..."
3,1312200,1,0,YBK Jaiswal,Mohammed Shami,0,0,0,0,,2022,Rajasthan Royals,Gujarat Titans,HH Pandya,"['YBK Jaiswal', 'JC Buttler', 'SV Samson', 'D ...","['WP Saha', 'Shubman Gill', 'MS Wade', 'HH Pan..."
4,1312200,1,0,YBK Jaiswal,Mohammed Shami,0,0,0,0,,2022,Rajasthan Royals,Gujarat Titans,HH Pandya,"['YBK Jaiswal', 'JC Buttler', 'SV Samson', 'D ...","['WP Saha', 'Shubman Gill', 'MS Wade', 'HH Pan..."


In [18]:
perf.drop('player_out', axis =1, inplace=True)
perf.head()

Unnamed: 0,ID,innings,overs,batter,bowler,batsman_run,total_run,non_boundary,isWicketDelivery,Season,Team1,Team2,Player_of_Match,Team1Players,Team2Players
0,1312200,1,0,YBK Jaiswal,Mohammed Shami,0,0,0,0,2022,Rajasthan Royals,Gujarat Titans,HH Pandya,"['YBK Jaiswal', 'JC Buttler', 'SV Samson', 'D ...","['WP Saha', 'Shubman Gill', 'MS Wade', 'HH Pan..."
1,1312200,1,0,YBK Jaiswal,Mohammed Shami,0,1,0,0,2022,Rajasthan Royals,Gujarat Titans,HH Pandya,"['YBK Jaiswal', 'JC Buttler', 'SV Samson', 'D ...","['WP Saha', 'Shubman Gill', 'MS Wade', 'HH Pan..."
2,1312200,1,0,JC Buttler,Mohammed Shami,1,1,0,0,2022,Rajasthan Royals,Gujarat Titans,HH Pandya,"['YBK Jaiswal', 'JC Buttler', 'SV Samson', 'D ...","['WP Saha', 'Shubman Gill', 'MS Wade', 'HH Pan..."
3,1312200,1,0,YBK Jaiswal,Mohammed Shami,0,0,0,0,2022,Rajasthan Royals,Gujarat Titans,HH Pandya,"['YBK Jaiswal', 'JC Buttler', 'SV Samson', 'D ...","['WP Saha', 'Shubman Gill', 'MS Wade', 'HH Pan..."
4,1312200,1,0,YBK Jaiswal,Mohammed Shami,0,0,0,0,2022,Rajasthan Royals,Gujarat Titans,HH Pandya,"['YBK Jaiswal', 'JC Buttler', 'SV Samson', 'D ...","['WP Saha', 'Shubman Gill', 'MS Wade', 'HH Pan..."


In [23]:
perf.drop('total_run', axis =1, inplace=True)
perf.drop('non_boundary', axis =1, inplace=True)

In [25]:
perf.head()

Unnamed: 0,ID,innings,overs,batter,bowler,batsman_run,isWicketDelivery,Season,Team1,Team2,Player_of_Match,Team1Players,Team2Players
0,1312200,1,0,YBK Jaiswal,Mohammed Shami,0,0,2022,Rajasthan Royals,Gujarat Titans,HH Pandya,"['YBK Jaiswal', 'JC Buttler', 'SV Samson', 'D ...","['WP Saha', 'Shubman Gill', 'MS Wade', 'HH Pan..."
1,1312200,1,0,YBK Jaiswal,Mohammed Shami,0,0,2022,Rajasthan Royals,Gujarat Titans,HH Pandya,"['YBK Jaiswal', 'JC Buttler', 'SV Samson', 'D ...","['WP Saha', 'Shubman Gill', 'MS Wade', 'HH Pan..."
2,1312200,1,0,JC Buttler,Mohammed Shami,1,0,2022,Rajasthan Royals,Gujarat Titans,HH Pandya,"['YBK Jaiswal', 'JC Buttler', 'SV Samson', 'D ...","['WP Saha', 'Shubman Gill', 'MS Wade', 'HH Pan..."
3,1312200,1,0,YBK Jaiswal,Mohammed Shami,0,0,2022,Rajasthan Royals,Gujarat Titans,HH Pandya,"['YBK Jaiswal', 'JC Buttler', 'SV Samson', 'D ...","['WP Saha', 'Shubman Gill', 'MS Wade', 'HH Pan..."
4,1312200,1,0,YBK Jaiswal,Mohammed Shami,0,0,2022,Rajasthan Royals,Gujarat Titans,HH Pandya,"['YBK Jaiswal', 'JC Buttler', 'SV Samson', 'D ...","['WP Saha', 'Shubman Gill', 'MS Wade', 'HH Pan..."


In [26]:
import pandas as pd

def get_batter_stats(dataframe, batter_name):
    # Filter the dataframe for the specified batter
    batter_df = dataframe[dataframe['batter'] == batter_name]
    
    # Calculate the total number of balls played and aggregated sum of runs scored
    total_balls_played = len(batter_df)
    total_runs_scored = batter_df['batsman_run'].sum()
    
    return total_balls_played, total_runs_scored

# Example usage:
# Assuming your final dataset is stored in a variable called 'final_dataset'
total_balls, total_runs = get_batter_stats(perf, 'YBK Jaiswal')
print(f"{total_balls} balls played, {total_runs} runs scored by YBK Jaiswal.")


410 balls played, 547 runs scored by YBK Jaiswal.


In [28]:
import pandas as pd

def get_batter_stats(dataframe, batter_name):
    # Filter the dataframe for the specified batter
    batter_df = dataframe[dataframe['batter'] == batter_name]
    
    # Calculate the total number of balls played, aggregated sum of runs scored, and count of 4s and 6s
    total_balls_played = len(batter_df)
    total_runs_scored = batter_df['batsman_run'].sum()
    fours_count = len(batter_df[batter_df['batsman_run'] == 4])
    sixes_count = len(batter_df[batter_df['batsman_run'] == 6])
    
    return total_balls_played, total_runs_scored, fours_count, sixes_count

# Example usage:
# Assuming your final dataset is stored in a variable called 'final_dataset'
total_balls, total_runs, fours, sixes = get_batter_stats(perf, 'YBK Jaiswal')
print(f"{total_balls} balls played, {total_runs} runs scored, {fours} fours, and {sixes} sixes hit by YBK Jaiswal.")


410 balls played, 547 runs scored, 62 fours, and 22 sixes hit by YBK Jaiswal.


## 4, 6, half and Centuries

In [39]:
import pandas as pd

def get_batter_stats(dataframe, batter_name):
    # Filter the dataframe for the specified batter
    batter_df = dataframe[dataframe['batter'] == batter_name]
    
    # Group the dataframe by match ID and aggregate the runs scored in each match
    match_stats = batter_df.groupby('ID').agg({
        'batsman_run': 'sum',  # Total runs scored in each match
        'isWicketDelivery': 'sum'  # Count of wickets taken by the bowler in each match
    }).reset_index()  # Resetting index to have 'ID' as a column
    
    # Calculate the total number of balls played, aggregated sum of runs scored, count of 4s and 6s, and count of half-centuries and centuries
    total_balls_played = len(batter_df)
    total_runs_scored = batter_df['batsman_run'].sum()
    fours_count = len(batter_df[batter_df['batsman_run'] == 4])
    sixes_count = len(batter_df[batter_df['batsman_run'] == 6])
    
    # Count half-centuries (50s) and centuries (100s) per match
    half_centuries = sum(match_stats['batsman_run'] >= 50)
    centuries = sum(match_stats['batsman_run'] >= 100)
    
    return total_balls_played, total_runs_scored, fours_count, sixes_count, half_centuries, centuries, batter_name

# Example usage:
# Assuming your final dataset is stored in a variable called 'final_dataset'
total_balls, total_runs, fours, sixes, half_centuries, centuries, batter_name = get_batter_stats(perf, 'Shubman Gill')
print(f"{total_balls} balls played, {total_runs} runs scored, {fours} fours, {sixes} sixes, {half_centuries} half-centuries, and {centuries} centuries by {batter_name}.")


1555 balls played, 1900 runs scored, 188 fours, 47 sixes, 14 half-centuries, and 0 centuries by Shubman Gill.


In [35]:
batter_df = perf[perf['batter'] == 'Shubman Gill']
matches_grouped = batter_df.groupby('ID')
matches_grouped.head()

Unnamed: 0,ID,innings,overs,batter,bowler,batsman_run,isWicketDelivery,Season,Team1,Team2,Player_of_Match,Team1Players,Team2Players
123,1312200,2,0,Shubman Gill,TA Boult,0,0,2022,Rajasthan Royals,Gujarat Titans,HH Pandya,"['YBK Jaiswal', 'JC Buttler', 'SV Samson', 'D ...","['WP Saha', 'Shubman Gill', 'MS Wade', 'HH Pan..."
124,1312200,2,0,Shubman Gill,TA Boult,0,0,2022,Rajasthan Royals,Gujarat Titans,HH Pandya,"['YBK Jaiswal', 'JC Buttler', 'SV Samson', 'D ...","['WP Saha', 'Shubman Gill', 'MS Wade', 'HH Pan..."
125,1312200,2,0,Shubman Gill,TA Boult,4,0,2022,Rajasthan Royals,Gujarat Titans,HH Pandya,"['YBK Jaiswal', 'JC Buttler', 'SV Samson', 'D ...","['WP Saha', 'Shubman Gill', 'MS Wade', 'HH Pan..."
132,1312200,2,2,Shubman Gill,TA Boult,0,0,2022,Rajasthan Royals,Gujarat Titans,HH Pandya,"['YBK Jaiswal', 'JC Buttler', 'SV Samson', 'D ...","['WP Saha', 'Shubman Gill', 'MS Wade', 'HH Pan..."
133,1312200,2,2,Shubman Gill,TA Boult,0,0,2022,Rajasthan Royals,Gujarat Titans,HH Pandya,"['YBK Jaiswal', 'JC Buttler', 'SV Samson', 'D ...","['WP Saha', 'Shubman Gill', 'MS Wade', 'HH Pan..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...
73262,1136570,1,13,Shubman Gill,B Stanlake,0,0,2018,Kolkata Knight Riders,Sunrisers Hyderabad,B Stanlake,"['RV Uthappa', 'CA Lynn', 'N Rana', 'SP Narine...","['WP Saha', 'S Dhawan', 'KS Williamson', 'MK P..."
73263,1136570,1,13,Shubman Gill,B Stanlake,0,0,2018,Kolkata Knight Riders,Sunrisers Hyderabad,B Stanlake,"['RV Uthappa', 'CA Lynn', 'N Rana', 'SP Narine...","['WP Saha', 'S Dhawan', 'KS Williamson', 'MK P..."
73268,1136570,1,14,Shubman Gill,S Kaul,1,0,2018,Kolkata Knight Riders,Sunrisers Hyderabad,B Stanlake,"['RV Uthappa', 'CA Lynn', 'N Rana', 'SP Narine...","['WP Saha', 'S Dhawan', 'KS Williamson', 'MK P..."
73271,1136570,1,15,Shubman Gill,Rashid Khan,1,0,2018,Kolkata Knight Riders,Sunrisers Hyderabad,B Stanlake,"['RV Uthappa', 'CA Lynn', 'N Rana', 'SP Narine...","['WP Saha', 'S Dhawan', 'KS Williamson', 'MK P..."


In [27]:
import pandas as pd

def get_bowler_stats(dataframe, bowler_name):
    # Filter the dataframe for the specified bowler
    bowler_df = dataframe[dataframe['bowler'] == bowler_name]
    
    # Calculate the total number of balls bowled and total wickets taken
    total_balls_bowled = len(bowler_df)
    total_wickets_taken = bowler_df['isWicketDelivery'].sum()
    
    return total_balls_bowled, total_wickets_taken

# Example usage:
# Assuming your final dataset is stored in a variable called 'final_dataset'
total_balls_bowled, total_wickets = get_bowler_stats(perf, 'Mohammed Shami')
print(f"{total_balls_bowled} balls bowled, {total_wickets} wickets taken by Mohammed Shami.")


2118 balls bowled, 116 wickets taken by Mohammed Shami.


## 3wickets,4wickets, 5wickets and maiden overs

In [40]:
import pandas as pd

def get_bowler_stats(dataframe, bowler_name):
    # Filter the dataframe for the specified bowler
    bowler_df = dataframe[dataframe['bowler'] == bowler_name]
    
    # Calculate the total number of balls bowled and total wickets taken
    total_balls_bowled = len(bowler_df)
    total_wickets_taken = bowler_df['isWicketDelivery'].sum()
    
    # Group the dataframe by match ID and calculate wickets taken in each match
    match_stats = bowler_df.groupby('ID').agg({
        'isWicketDelivery': 'sum',  # Total wickets taken in each match
        'overs': 'sum'  # Total overs bowled in each match
    }).reset_index()  # Resetting index to have 'ID' as a column
    
    # Count matches with 3, 4, and 5 wickets taken, and count of maiden overs
    matches_3_wickets = sum(match_stats['isWicketDelivery'] == 3)
    matches_4_wickets = sum(match_stats['isWicketDelivery'] == 4)
    matches_5_wickets = sum(match_stats['isWicketDelivery'] == 5)

    # Adjust counts based on matches with 4 and 5 wickets
    matches_3_wickets -= matches_4_wickets
    matches_4_wickets -= matches_5_wickets
    
    # Count maiden overs by grouping by match ID and overs
    maiden_overs = len(bowler_df[bowler_df['batsman_run'] == 0].groupby(['ID', 'overs']))
    
    return total_balls_bowled, total_wickets_taken, matches_3_wickets, matches_4_wickets, matches_5_wickets, maiden_overs

# Example usage:
# Assuming your final dataset is stored in a variable called 'final_dataset'
total_balls, total_wickets, matches_3_wickets, matches_4_wickets, matches_5_wickets, maiden_overs = get_bowler_stats(perf, 'Mohammed Shami')
print(f"{total_balls} balls bowled, {total_wickets} wickets taken.")
print(f"{matches_3_wickets} matches with 3 wickets, {matches_4_wickets} matches with 4 wickets, {matches_5_wickets} matches with 5 wickets.")
print(f"{maiden_overs} maiden overs bowled.")


2118 balls bowled, 116 wickets taken.
9 matches with 3 wickets, 1 matches with 4 wickets, 0 matches with 5 wickets.
320 maiden overs bowled.


## all unique players

In [43]:
import pandas as pd

# Assuming your dataset is loaded into a variable called 'final_dataset'
# Replace 'final_dataset' with the actual variable name if different

# Get all unique players from 'Team1Players' and 'Team2Players' columns
all_players_team1 = perf['Team1Players'].explode().unique()
all_players_team2 = perf['Team2Players'].explode().unique()

# Create a set to store all unique players
all_players_set = set()

# Add players from both teams to the set (ensures no duplicates)
all_players_set.update(all_players_team1)
all_players_set.update(all_players_team2)

# Convert the set back to a list for printing (if needed)
all_players = list(all_players_set)

# Print the list of all players and count of players
print(f"Number of unique players in the dataset: {len(all_players)}")
print("All players in the dataset:")
for player in all_players:
    print(player)


Number of unique players in the dataset: 1845
All players in the dataset:
['SR Watson', 'Vishnu Vinod', 'AB de Villiers', 'KM Jadhav', 'Mandeep Singh', 'STR Binny', 'P Negi', 'Iqbal Abdulla', 'TS Mills', 'B Stanlake', 'YS Chahal']
['YBK Jaiswal', 'JC Buttler', 'SV Samson', 'D Padikkal', 'SO Hetmyer', 'R Ashwin', 'R Parag', 'TA Boult', 'OC McCoy', 'M Prasidh Krishna', 'YS Chahal']
['RV Uthappa', 'G Gambhir', 'MK Pandey', 'SA Yadav', 'YK Pathan', 'AD Russell', 'Shakib Al Hasan', 'PP Chawla', 'SP Narine', 'KC Cariappa', 'M Morkel']
['PK Garg', 'Abhishek Sharma', 'RA Tripathi', 'AK Markram', 'N Pooran', 'Washington Sundar', 'R Shepherd', 'J Suchith', 'B Kumar', 'Umran Malik', 'Fazalhaq Farooqi']
['S Sriram', 'JH Kallis', 'R Dravid', 'RV Uthappa', 'LRPL Taylor', 'CL White', 'V Kohli', 'R Vinay Kumar', 'P Kumar', 'A Kumble', 'DW Steyn']
['CA Lynn', 'Shubman Gill', 'RV Uthappa', 'N Rana', 'KD Karthik', 'AD Russell', 'SP Narine', 'PP Chawla', 'MG Johnson', 'M Prasidh Krishna', 'Kuldeep Yadav']

In [56]:
perf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 225954 entries, 0 to 225953
Data columns (total 13 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   ID                225954 non-null  int64 
 1   innings           225954 non-null  int64 
 2   overs             225954 non-null  int64 
 3   batter            225954 non-null  object
 4   bowler            225954 non-null  object
 5   batsman_run       225954 non-null  int64 
 6   isWicketDelivery  225954 non-null  int64 
 7   Season            225954 non-null  object
 8   Team1             225954 non-null  object
 9   Team2             225954 non-null  object
 10  Player_of_Match   225582 non-null  object
 11  Team1Players      225954 non-null  object
 12  Team2Players      225954 non-null  object
dtypes: int64(5), object(8)
memory usage: 22.4+ MB


## Players dataset

In [57]:
import pandas as pd

# Define the functions to get batter and bowler stats
def get_batter_stats(dataframe, batter_name):
    batter_df = dataframe[dataframe['batter'] == batter_name]
    match_stats = batter_df.groupby('ID').agg({
        'batsman_run': 'sum',
        'isWicketDelivery': 'sum'
    }).reset_index()
    total_balls_played = len(batter_df)
    total_runs_scored = batter_df['batsman_run'].sum()
    fours_count = len(batter_df[batter_df['batsman_run'] == 4])
    sixes_count = len(batter_df[batter_df['batsman_run'] == 6])
    half_centuries = sum(match_stats['batsman_run'] >= 50)
    centuries = sum(match_stats['batsman_run'] >= 100)
    return [batter_name, total_balls_played, total_runs_scored, fours_count, sixes_count, half_centuries, centuries]

def get_bowler_stats(dataframe, bowler_name):
    bowler_df = dataframe[dataframe['bowler'] == bowler_name]
    match_stats = bowler_df.groupby('ID').agg({
        'isWicketDelivery': 'sum',
        'overs': 'sum'
    }).reset_index()
    total_balls_bowled = len(bowler_df)
    total_wickets_taken = bowler_df['isWicketDelivery'].sum()
    matches_3_wickets = sum(match_stats['isWicketDelivery'] == 3)
    matches_4_wickets = sum(match_stats['isWicketDelivery'] == 4)
    matches_5_wickets = sum(match_stats['isWicketDelivery'] == 5)
    matches_3_wickets -= matches_4_wickets
    matches_4_wickets -= matches_5_wickets
    maiden_overs = len(bowler_df[bowler_df['batsman_run'] == 0].groupby(['ID', 'overs']))
    return [bowler_name, total_balls_bowled, total_wickets_taken, matches_3_wickets, matches_4_wickets, matches_5_wickets, maiden_overs]

# Assuming 'perf' is your final dataset DataFrame
# Initialize lists to store player stats
player_stats = []

# Loop through all players and calculate their stats
for player in all_players:
    if player in perf['batter'].unique():
        batter_stats = get_batter_stats(perf, player)
        player_stats.append(batter_stats)
    elif player in perf['bowler'].unique():
        bowler_stats = get_bowler_stats(perf, player)
        player_stats.append(bowler_stats)

# Create a DataFrame from player_stats
columns = ['Player', 'Balls Played', 'Runs Scored', 'Fours', 'Sixes', 'Half Centuries', 'Centuries',
           'Balls Bowled', 'Wickets Taken', '3 Wickets Matches', '4 Wickets Matches', '5 Wickets Matches',
           'Maiden Overs']
final_df = pd.DataFrame(player_stats, columns=columns)

# Save the final dataset to a CSV file
final_df.to_csv('player.csv', index=False)

print("Final dataset created and saved as 'player.csv'")


Final dataset created and saved as 'player.csv'


In [58]:
import pandas as pd
players = pd.read_csv('player.csv')
players.shape

(0, 13)

In [52]:
import ast

# Convert strings to lists of player names
all_playerslist = [ast.literal_eval(player_str) for player_str in all_players]

# Flatten the list of lists into a single list
all_playerslist = [player for sublist in all_playerslist for player in sublist]

# Remove duplicates
all_playerslist = list(set(all_playerslist))

print("All players list:")
print(all_playerslist)
print("Number of unique players:", len(all_playerslist))


All players list:
['Tejas Baroka', 'PP Shaw', 'R Dravid', 'P Chopra', 'AR Patel', 'Z Khan', 'Sandeep Sharma', 'TH David', 'TP Sudhindra', 'P Negi', 'SM Boland', 'JW Hastings', 'M Jansen', 'Shivam Sharma', 'KA Jamieson', 'JL Denly', 'RG More', 'S Kaushik', 'Y Venugopal Rao', 'TL Seifert', 'R Sai Kishore', 'TS Mills', 'SC Kuggeleijn', 'BB Samantray', 'UT Khawaja', 'JA Morkel', 'MA Agarwal', 'BJ Haddin', 'P Amarnath', 'R Rampaul', 'S Sandeep Warrier', 'ML Hayden', 'Joginder Sharma', 'CK Langeveldt', 'CR Brathwaite', 'Avesh Khan', 'JH Kallis', 'WPUJC Vaas', 'N Rana', 'JA Richardson', 'AJ Tye', 'RD Chahar', 'Harbhajan Singh', 'RA Jadeja', 'Akash Deep', 'B Geeves', 'RN ten Doeschate', 'BJ Rohrer', 'Ishan Kishan', 'SN Khan', 'AA Jhunjhunwala', 'RJ Peterson', 'Umran Malik', 'A Manohar', 'GR Napier', 'KH Devdhar', 'S Dube', 'RR Bose', 'Abdur Razzak', 'JDS Neesham', 'SM Katich', 'D Pretorius', 'WP Saha', 'DJ Hooda', 'MR Marsh', 'Azhar Mahmood', 'RK Singh', 'DJM Short', 'RS Sodhi', 'H Das', 'DT C

In [59]:
import pandas as pd

# Assuming 'perf' is your DataFrame containing match and player data
# List of all unique player names
all_players = perf['batter'].unique().tolist()

# Define the function to get batter stats
def get_batter_stats(dataframe, batter_name):
    batter_df = dataframe[dataframe['batter'] == batter_name]
    match_stats = batter_df.groupby('ID').agg({
        'batsman_run': 'sum',
        'isWicketDelivery': 'sum'
    }).reset_index()
    total_balls_played = len(batter_df)
    total_runs_scored = batter_df['batsman_run'].sum()
    fours_count = len(batter_df[batter_df['batsman_run'] == 4])
    sixes_count = len(batter_df[batter_df['batsman_run'] == 6])
    half_centuries = sum(match_stats['batsman_run'] >= 50)
    centuries = sum(match_stats['batsman_run'] >= 100)
    return [batter_name, total_balls_played, total_runs_scored, fours_count, sixes_count, half_centuries, centuries]

# Initialize a list to store player stats
player_stats = []

# Loop through all players and calculate their stats
for player in all_players:
    batter_stats = get_batter_stats(perf, player)
    player_stats.append(batter_stats)

# Create a DataFrame from player_stats
columns = ['Player', 'Balls Played', 'Runs Scored', 'Fours', 'Sixes', 'Half Centuries', 'Centuries']
batter_stats_df = pd.DataFrame(player_stats, columns=columns)

# Save the batter stats dataset to a CSV file
batter_stats_df.to_csv('batter_stats.csv', index=False)

print("Batter stats dataset created and saved as 'batter_stats.csv'")


Batter stats dataset created and saved as 'batter_stats.csv'


## final batter stat

In [60]:
import pandas as pd

# Assuming 'perf' is your DataFrame containing match and player data
# List of all unique player names
all_players = perf['batter'].unique().tolist()

# Define the function to get batter stats
def get_batter_stats(dataframe, batter_name):
    batter_df = dataframe[dataframe['batter'] == batter_name]
    match_stats = batter_df.groupby('ID').agg({
        'batsman_run': 'sum',
        'isWicketDelivery': 'sum'
    }).reset_index()
    total_balls_played = len(batter_df)
    total_runs_scored = batter_df['batsman_run'].sum()
    fours_count = len(batter_df[batter_df['batsman_run'] == 4])
    sixes_count = len(batter_df[batter_df['batsman_run'] == 6])
    
    # Initialize counters for half centuries and centuries
    half_centuries = 0
    centuries = 0
    
    # Iterate through match_stats to count half centuries and centuries
    for index, row in match_stats.iterrows():
        runs_in_match = row['batsman_run']
        if runs_in_match >= 100:
            centuries += 1
        elif runs_in_match >= 50:
            half_centuries += 1

    return [batter_name, total_balls_played, total_runs_scored, fours_count, sixes_count, half_centuries, centuries]

# Initialize a list to store player stats
player_stats = []

# Loop through all players and calculate their stats
for player in all_players:
    updated_batter_stats = get_batter_stats(perf, player)
    player_stats.append(updated_batter_stats)

# Create a DataFrame from player_stats
columns = ['Player', 'Balls Played', 'Runs Scored', 'Fours', 'Sixes', 'Half Centuries', 'Centuries']
updated_batter_stats_df = pd.DataFrame(player_stats, columns=columns)

# Save the batter stats dataset to a CSV file
updated_batter_stats_df.to_csv('updated_batter_stats.csv', index=False)

print("Batter stats dataset created and saved as 'updated_batter_stats.csv'")


Batter stats dataset created and saved as 'updated_batter_stats.csv'


## final bowler stat

In [61]:
import pandas as pd

# Assuming 'perf' is your DataFrame containing match and player data
# List of all unique bowler names
all_bowlers = perf['bowler'].unique().tolist()

# Define the function to get bowler stats
def get_bowler_stats(dataframe, bowler_name):
    bowler_df = dataframe[dataframe['bowler'] == bowler_name]
    total_balls_bowled = len(bowler_df)
    total_wickets_taken = bowler_df['isWicketDelivery'].sum()
    
    # Group the dataframe by match ID and calculate wickets taken in each match
    match_stats = bowler_df.groupby('ID').agg({
        'isWicketDelivery': 'sum',  # Total wickets taken in each match
        'overs': 'sum'  # Total overs bowled in each match
    }).reset_index()  # Resetting index to have 'ID' as a column
    
    # Count matches with 3, 4, and 5 wickets taken, and count of maiden overs
    matches_3_wickets = sum(match_stats['isWicketDelivery'] == 3)
    matches_4_wickets = sum(match_stats['isWicketDelivery'] == 4)
    matches_5_wickets = sum(match_stats['isWicketDelivery'] == 5)

    # Adjust counts based on matches with 4 and 5 wickets
    matches_3_wickets -= matches_4_wickets
    matches_4_wickets -= matches_5_wickets
    
    # Count maiden overs by grouping by match ID and overs
    maiden_overs = len(bowler_df[bowler_df['batsman_run'] == 0].groupby(['ID', 'overs']))
    
    return [bowler_name, total_balls_bowled, total_wickets_taken, matches_3_wickets, matches_4_wickets, matches_5_wickets, maiden_overs]

# Initialize a list to store bowler stats
bowler_stats = []

# Loop through all bowlers and calculate their stats
for bowler in all_bowlers:
    bowler_data = get_bowler_stats(perf, bowler)
    bowler_stats.append(bowler_data)

# Create a DataFrame from bowler_stats
columns = ['Bowler', 'Balls Bowled', 'Wickets Taken', '3 Wickets Matches', '4 Wickets Matches', '5 Wickets Matches', 'Maiden Overs']
bowler_stats_df = pd.DataFrame(bowler_stats, columns=columns)

# Save the bowler stats dataset to a CSV file
bowler_stats_df.to_csv('bowler_stats.csv', index=False)

print("Bowler stats dataset created and saved as 'bowler_stats.csv'")


Bowler stats dataset created and saved as 'bowler_stats.csv'


In [62]:
import pandas as pd

# Load the updated batter stats and bowler stats datasets
batter_stats_df = pd.read_csv('updated_batter_stats.csv')
bowler_stats_df = pd.read_csv('bowler_stats.csv')

# Merge the datasets based on the player's name (outer join to include all players)
merged_stats_df = pd.merge(batter_stats_df, bowler_stats_df, how='outer', left_on='Player', right_on='Bowler')

# Rename the columns to remove duplicate columns for player's name
merged_stats_df = merged_stats_df.rename(columns={'Player': 'Name'})

# Save the merged dataset to a CSV file
merged_stats_df.to_csv('combined_stats.csv', index=False)

print("Combined stats dataset created and saved as 'combined_stats.csv'")


Combined stats dataset created and saved as 'combined_stats.csv'


In [63]:
import pandas as pd

# Load the combined dataset
combined_stats_df = pd.read_csv('combined_stats.csv')

# Replace empty values with NaN (null)
combined_stats_df = combined_stats_df.replace('', pd.NA)

# Fill missing values in the 'Name' column with the bowler's name
combined_stats_df['Name'] = combined_stats_df['Name'].fillna(combined_stats_df['Bowler'])

# Drop the 'Bowler' column as it's redundant now
combined_stats_df = combined_stats_df.drop(columns='Bowler')

# Save the updated dataset to a CSV file
combined_stats_df.to_csv('updated_combined_stats.csv', index=False)

print("Updated combined stats dataset created and saved as 'updated_combined_stats.csv'")


Updated combined stats dataset created and saved as 'updated_combined_stats.csv'


In [64]:
import pandas as pd

# Load the combined dataset
combined_stats_df = pd.read_csv('updated_combined_stats.csv')

# Replace empty values with 0
combined_stats_df = combined_stats_df.replace('', 0)

# Save the updated dataset back to the same CSV file
combined_stats_df.to_csv('updated_combined_stats.csv', index=False)

print("Updated combined stats dataset and saved to 'updated_combined_stats.csv'")


Updated combined stats dataset and saved to 'updated_combined_stats.csv'


In [65]:
uc = pd.read_csv('updated_combined_stats.csv')
uc.head()

Unnamed: 0,Name,Balls Played,Runs Scored,Fours,Sixes,Half Centuries,Centuries,Balls Bowled,Wickets Taken,3 Wickets Matches,4 Wickets Matches,5 Wickets Matches,Maiden Overs
0,A Ashish Reddy,196.0,280.0,16.0,15.0,0.0,0.0,270.0,19.0,1.0,0.0,0.0,40.0
1,A Badoni,139.0,161.0,11.0,7.0,1.0,0.0,12.0,2.0,0.0,0.0,0.0,2.0
2,A Chandila,7.0,4.0,0.0,0.0,0.0,0.0,234.0,11.0,-1.0,1.0,0.0,36.0
3,A Chopra,75.0,53.0,7.0,0.0,0.0,0.0,,,,,,
4,A Choudhary,20.0,25.0,1.0,1.0,0.0,0.0,108.0,5.0,0.0,0.0,0.0,16.0


## performance matrices 

In [66]:
import pandas as pd

# Load the dataset
uc = pd.read_csv('updated_combined_stats.csv')

# Replace empty values with 0
uc = uc.fillna(0)

# Define the batting performance metric
uc['BattingPerformance'] = (uc['Runs Scored'] + 
                             1 * uc['Fours'] + 
                             2 * uc['Sixes'] + 
                             8 * uc['Half Centuries'] + 
                             16 * uc['Centuries'])

# Define the bowling performance metric
uc['BowlingPerformance'] = (uc['Wickets Taken'] * 25 + 
                            4 * uc['3 Wickets Matches'] + 
                            8 * uc['4 Wickets Matches'] + 
                            16 * uc['5 Wickets Matches'] + 
                            12 * uc['Maiden Overs'])

# Save the updated dataset
uc.to_csv('updated_combined_stats.csv', index=False)

print("Updated combined stats dataset with performance metrics saved to 'updated_combined_stats.csv'")


Updated combined stats dataset with performance metrics saved to 'updated_combined_stats.csv'


In [67]:
uc = pd.read_csv('updated_combined_stats.csv')
uc.head()

Unnamed: 0,Name,Balls Played,Runs Scored,Fours,Sixes,Half Centuries,Centuries,Balls Bowled,Wickets Taken,3 Wickets Matches,4 Wickets Matches,5 Wickets Matches,Maiden Overs,BattingPerformance,BowlingPerformance
0,A Ashish Reddy,196.0,280.0,16.0,15.0,0.0,0.0,270.0,19.0,1.0,0.0,0.0,40.0,326.0,959.0
1,A Badoni,139.0,161.0,11.0,7.0,1.0,0.0,12.0,2.0,0.0,0.0,0.0,2.0,194.0,74.0
2,A Chandila,7.0,4.0,0.0,0.0,0.0,0.0,234.0,11.0,-1.0,1.0,0.0,36.0,4.0,711.0
3,A Chopra,75.0,53.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,60.0,0.0
4,A Choudhary,20.0,25.0,1.0,1.0,0.0,0.0,108.0,5.0,0.0,0.0,0.0,16.0,28.0,317.0


In [69]:
import pandas as pd

# Load the dataset
uc = pd.read_csv('updated_combined_stats.csv')

# Replace empty values with 0
uc = uc.fillna(0)

def get_top_players(player_names):
    # Filter the dataset for the given player names
    selected_players = uc[uc['Name'].isin(player_names)]
    
    # Sort players by BattingPerformance and select top 6 batsmen
    top_batsmen = selected_players.sort_values(by='BattingPerformance', ascending=False).head(6)
    
    # Sort players by BowlingPerformance and select top 5 bowlers
    top_bowlers = selected_players.sort_values(by='BowlingPerformance', ascending=False).head(5)
    
    # Extract relevant columns for the output
    top_batsmen = top_batsmen[['Name', 'BattingPerformance', 'BowlingPerformance']]
    top_bowlers = top_bowlers[['Name', 'BattingPerformance', 'BowlingPerformance']]
    
    return top_batsmen, top_bowlers

# Example usage
player_names = ["A Ashish Reddy", "A Badoni", "A Chandila", "A Chopra", "A Choudhary", 
                "A Dananjaya", "A Flintoff", "A Kumble", "A Manohar", "A Mishra", 
                "A Mithun", "A Mukund", "A Nehra", "A Nel", "A Nortje", 
                "A Patel", "A Rahane", "A Roy", "A Sharma", "A Singh", 
                "A Tye", "A Zampa"]

top_batsmen, top_bowlers = get_top_players(player_names)

print("Top 6 Batsmen:")
print(top_batsmen)

print("\nTop 5 Bowlers:")
print(top_bowlers)


Top 6 Batsmen:
             Name  BattingPerformance  BowlingPerformance
9        A Mishra               401.0             10327.0
0  A Ashish Reddy               326.0               959.0
1        A Badoni               194.0                74.0
8       A Manohar               128.0                 0.0
6      A Flintoff                71.0               158.0
3        A Chopra                60.0                 0.0

Top 5 Bowlers:
        Name  BattingPerformance  BowlingPerformance
9   A Mishra               401.0             10327.0
12   A Nehra                46.0              6761.0
7   A Kumble                38.0              3101.0
14  A Nortje                 8.0              2468.0
15   A Singh                 2.0              1711.0


In [71]:
import pandas as pd

# Load the dataset
uc = pd.read_csv('updated_combined_stats.csv')

# Replace empty values with 0
uc = uc.fillna(0)

def get_top_players(player_names):
    # Filter the dataset for the given player names
    selected_players = uc[uc['Name'].isin(player_names)]
    
    # Determine the primary skill (batting or bowling) for each player
    selected_players['PrimarySkill'] = selected_players.apply(
        lambda row: 'Batting' if row['BattingPerformance'] >= row['BowlingPerformance'] else 'Bowling', axis=1
    )
    
    # Separate players based on their primary skill
    top_batsmen_candidates = selected_players[selected_players['PrimarySkill'] == 'Batting']
    top_bowlers_candidates = selected_players[selected_players['PrimarySkill'] == 'Bowling']
    
    # Sort and select top 6 batsmen
    top_batsmen = top_batsmen_candidates.sort_values(by='BattingPerformance', ascending=False).head(6)
    
    # Sort and select top 5 bowlers
    top_bowlers = top_bowlers_candidates.sort_values(by='BowlingPerformance', ascending=False).head(5)
    
    # Extract relevant columns for the output
    top_batsmen = top_batsmen[['Name', 'BattingPerformance', 'BowlingPerformance']]
    top_bowlers = top_bowlers[['Name', 'BattingPerformance', 'BowlingPerformance']]
    
    return top_batsmen, top_bowlers

# Example usage
player_names = ["A Ashish Reddy", "A Badoni", "A Chandila", "A Chopra", "A Choudhary", 
                "A Dananjaya", "A Flintoff", "A Kumble", "A Manohar", "A Mishra", 
                "A Mithun", "A Mukund", "A Nehra", "A Nel", "A Nortje", 
                "A Patel", "A Rahane", "A Roy", "A Sharma", "A Singh", 
                "A Tye", "A Zampa"]

top_batsmen, top_bowlers = get_top_players(player_names)

print("Top 6 Batsmen:")
print(top_batsmen)

print("\nTop 5 Bowlers:")
print(top_bowlers)


Top 6 Batsmen:
         Name  BattingPerformance  BowlingPerformance
1    A Badoni               194.0                74.0
8   A Manohar               128.0                 0.0
3    A Chopra                60.0                 0.0
11   A Mukund                20.0                 0.0

Top 5 Bowlers:
        Name  BattingPerformance  BowlingPerformance
9   A Mishra               401.0             10327.0
12   A Nehra                46.0              6761.0
7   A Kumble                38.0              3101.0
14  A Nortje                 8.0              2468.0
15   A Singh                 2.0              1711.0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  selected_players['PrimarySkill'] = selected_players.apply(


## TelegramBot

In [None]:
import telebot
import pandas as pd

# Your Telegram bot token
BOT_TOKEN = "Place your telegram bot token here"

# Initialize the bot
bot = telebot.TeleBot(BOT_TOKEN)

# Load and preprocess the dataset
uc = pd.read_csv('updated_combined_stats.csv')
uc = uc.fillna(0)

def get_top_players(player_names):
    selected_players = uc[uc['Name'].isin(player_names)]
    selected_players['PrimarySkill'] = selected_players.apply(
        lambda row: 'Batting' if row['BattingPerformance'] >= row['BowlingPerformance'] else 'Bowling', axis=1
    )
    top_batsmen_candidates = selected_players[selected_players['PrimarySkill'] == 'Batting']
    top_bowlers_candidates = selected_players[selected_players['PrimarySkill'] == 'Bowling']
    top_batsmen = top_batsmen_candidates.sort_values(by='BattingPerformance', ascending=False).head(6)
    top_bowlers = top_bowlers_candidates.sort_values(by='BowlingPerformance', ascending=False).head(5)
    top_batsmen = top_batsmen[['Name', 'BattingPerformance']]
    top_bowlers = top_bowlers[['Name', 'BowlingPerformance']]
    return top_batsmen, top_bowlers

@bot.message_handler(commands=['start', 'hello'])
def send_welcome(message):
    bot.reply_to(message, "Radhey Radhey!, Welcome to Tech Nirvana")

@bot.message_handler(commands=['get_players'])
def ask_for_players(message):
    msg = bot.reply_to(message, "Please provide a list of player names separated by commas.")
    bot.register_next_step_handler(msg, process_players)

def process_players(message):
    try:
        player_names = [name.strip() for name in message.text.split(',')]
        top_batsmen, top_bowlers = get_top_players(player_names)
        response = "Top 6 Batsmen:\n" + top_batsmen.to_string(index=False) + "\n\nTop 5 Bowlers:\n" + top_bowlers.to_string(index=False)
        bot.reply_to(message, response)
    except Exception as e:
        bot.reply_to(message, f"An error occurred: {str(e)}")

@bot.message_handler(func=lambda msg: True)
def echo_all(message):
    bot.reply_to(message, "Please use the command /get_players to start.")

bot.infinity_polling()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  selected_players['PrimarySkill'] = selected_players.apply(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  selected_players['PrimarySkill'] = selected_players.apply(
