Step 1: Load and Explore the Data

In [1]:
import pandas as pd

# Load the primary train data
primary_train_path = './train_data_with_samplefeatures.csv'  # Replace with the actual file path
primary_train_df = pd.read_csv(primary_train_path)

# Load the batsman-level data
batsman_data_path = './batsman_level_scorecard.csv'  # Replace with the actual file path
batsman_df = pd.read_csv(batsman_data_path)

# Display the first few rows of each DataFrame
print("Primary Train Data:")
print(primary_train_df.head())

print("\nBatsman Data:")
print(batsman_df.head())


Primary Train Data:
   match id     team1  team1_id  \
0   9331181        Ba     11283   
1   8797060        Ed        20   
2   9433269        We     10576   
3   9587073  Ga An Ws     36084   
4   9516457     Pb Ks     30407   

                                    team1_roster_ids  team2  team2_id  \
0  9373356.0:7857520.0:4232164.0:4566540.0:329940...  Hl Ph     12634   
1  2089079.0:6139370.0:2076192.0:62432.0:2083409....  Wt Is        41   
2  3298427.0:2288789.0:7773338.0:3519011.0:368195...     Ne      8987   
3  8127230.0:4690328.0:4069666.0:7960847.0:469018...  Bs Rs     36070   
4  8127181.0:197658.0:4239038.0:2398346.0:5053082...  Gt Ts     48341   

                                    team2_roster_ids    winner  winner_id  \
0  3500958.0:4231751.0:2735081.0:2035102.0:369833...     Hl Ph      12634   
1  4690258.0:4069666.0:4230127.0:1942317.0:161392...        Ed         20   
2  4003390.0:1749075.0:1626526.0:4172447.0:551672...        We      10576   
3  3462080.0:2436405.0

Step 2: Data Cleaning

In [4]:
# Check for missing values in primary train data
print("Missing values in Primary Train Data:")
print(primary_train_df.isnull().sum())

# Check for missing values in batsman-level data
print("\nMissing values in Batsman Data:")
print(batsman_df.isnull().sum())

# Handling missing values (example: filling missing numeric values with mean)
# primary_train_df.fillna(primary_train_df.mean(), inplace=True)
# batsman_df.fillna(batsman_df.mean(), inplace=True)

# Convert date columns to datetime
primary_train_df['match_dt'] = pd.to_datetime(primary_train_df['match_dt'])
batsman_df['match_dt'] = pd.to_datetime(batsman_df['match_dt'])


Missing values in Primary Train Data:
match id                      0
team1                         0
team1_id                      0
team1_roster_ids              0
team2                         0
team2_id                      0
team2_roster_ids              0
winner                        0
winner_id                     0
toss winner                   0
toss decision                 0
venue                         0
city                          0
match_dt                      0
lighting                      0
series_name                   0
season                        0
ground_id                     0
team_count_50runs_last15      0
team_winp_last5               0
team1only_avg_runs_last15    21
team1_winp_team2_last15       0
ground_avg_runs_last15       53
dtype: int64

Missing values in Batsman Data:
match id                  0
batsman                   0
batsman_id                0
batsman_details           0
is_batsman_captain        0
is_batsman_keeper         0
inning      

Step 3: Feature Engineering

In [5]:
# Create additional features from batsman-level data
# Example: Calculate total runs, strike rate, and number of boundaries (fours and sixes) for each batsman in each match

# Group by match_id and batsman_id to calculate aggregated stats
batsman_aggregates = batsman_df.groupby(['match id', 'batsman_id']).agg(
    total_runs=('runs', 'sum'),
    total_balls_faced=('balls_faced', 'sum'),
    total_fours=('Fours', 'sum'),
    total_sixes=('Sixes', 'sum')
).reset_index()

# Calculate additional stats
batsman_aggregates['strike_rate'] = (batsman_aggregates['total_runs'] / batsman_aggregates['total_balls_faced']) * 100

# Merge these features into the primary train data
primary_train_df = primary_train_df.merge(batsman_aggregates, how='left', left_on='match id', right_on='match id')

# Display the merged DataFrame
print("\nMerged Primary Train Data:")
print(primary_train_df.head())



Merged Primary Train Data:
   match id team1  team1_id  \
0   9331181    Ba     11283   
1   9331181    Ba     11283   
2   9331181    Ba     11283   
3   9331181    Ba     11283   
4   9331181    Ba     11283   

                                    team1_roster_ids  team2  team2_id  \
0  9373356.0:7857520.0:4232164.0:4566540.0:329940...  Hl Ph     12634   
1  9373356.0:7857520.0:4232164.0:4566540.0:329940...  Hl Ph     12634   
2  9373356.0:7857520.0:4232164.0:4566540.0:329940...  Hl Ph     12634   
3  9373356.0:7857520.0:4232164.0:4566540.0:329940...  Hl Ph     12634   
4  9373356.0:7857520.0:4232164.0:4566540.0:329940...  Hl Ph     12634   

                                    team2_roster_ids winner  winner_id  \
0  3500958.0:4231751.0:2735081.0:2035102.0:369833...  Hl Ph      12634   
1  3500958.0:4231751.0:2735081.0:2035102.0:369833...  Hl Ph      12634   
2  3500958.0:4231751.0:2735081.0:2035102.0:369833...  Hl Ph      12634   
3  3500958.0:4231751.0:2735081.0:2035102.0:369833.

Step 4: Splitting the Data

In [6]:
from sklearn.model_selection import train_test_split

# Define the target variable
target = 'winner_id'

# Define features (dropping non-numeric and target columns for simplicity)
features = primary_train_df.drop(columns=['match id', 'team1', 'team2', 'winner', target])

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(features, primary_train_df[target], test_size=0.2, random_state=42)

print("\nTraining Data Shape:", X_train.shape)
print("Validation Data Shape:", X_val.shape)



Training Data Shape: (11564, 24)
Validation Data Shape: (2891, 24)


Step 5: Model Training

In [None]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

# Train an XGBoost model
model = XGBClassifier()
model.fit(X_train, y_train)

# Make predictions on the validation set
y_pred = model.predict(X_val)

# Evaluate the model
accuracy = accuracy_score(y_val, y_pred)
print("\nValidation Accuracy:", accuracy)
