In [12]:
import pandas as pd

# Load all provided datasets
players_df = pd.read_csv('dim_players.csv')
match_results = pd.read_csv('dm_match_result.csv')
batting_summary = pd.read_csv('fact_bating_summary.csv')
bowling_summary = pd.read_csv('fact_bowling_summary.csv')

# Display the first few rows of each dataset for understanding
players_head = players_df.head()
matches_head = match_results.head()
batting_head = batting_summary.head()
bowling_head = bowling_summary.head()

(players_head, matches_head, batting_head, bowling_head)


(                    name        team image    battingStyle  \
 0  Najmul Hossain Shanto  Bangladesh   NaN   Left hand Bat   
 1          Soumya Sarkar  Bangladesh   NaN   Left hand Bat   
 2             Litton Das  Bangladesh   NaN  Right hand Bat   
 3     Shakib Al Hasan(c)  Bangladesh   NaN   Left hand Bat   
 4           Afif Hossain  Bangladesh   NaN   Left hand Bat   
 
              bowlingStyle          playingRole  \
 0      Right arm Offbreak     Top order Batter   
 1   Right arm Medium fast  Middle order Batter   
 2                     NaN  Wicketkeeper Batter   
 3  Slow Left arm Orthodox           Allrounder   
 4      Right arm Offbreak           Allrounder   
 
                                          description  
 0  Nazmul Hossain Shanto emerged from an unusual ...  
 1  A rarity among Bangladesh allrounders, top-ord...  
 2  Liton Das is the first wicketkeeper-batsman in...  
 3  When the annals of Bangladesh cricket are sift...  
 4  Bangladesh left-hander Afif 

In [13]:
## Merge batting summary, bowling summary, and match results using 'match_id'
batting_with_results = pd.merge(batting_summary, match_results, on='match_id', how='inner')
bowling_with_results = pd.merge(bowling_summary, match_results, on='match_id', how='inner')

# Combine batting and bowling for overall player performance (joining on match, player names)
batting_bowling_df = pd.merge(batting_with_results, bowling_with_results,
                              left_on=['match', 'batsmanName'],
                              right_on=['match', 'bowlerName'],
                              how='outer')

# Display the shape and first rows of the combined dataset
batting_bowling_df.shape, batting_bowling_df.head()


((914, 36),
                     match  teamInnings  battingPos         batsmanName  \
 0  Afghanistan Vs England          NaN         NaN                 NaN   
 1  Afghanistan Vs England      England         2.0          Alex Hales   
 2  Afghanistan Vs England  Afghanistan         7.0  Azmatullah Omarzai   
 3  Afghanistan Vs England      England         4.0          Ben Stokes   
 4  Afghanistan Vs England          NaN         NaN                 NaN   
 
    runs_x  balls  4s_x  6s_x      SR out/not_out  ... 6s_y wides noBalls  \
 0     NaN    NaN   NaN   NaN     NaN         NaN  ...  1.0   0.0     0.0   
 1    19.0   20.0   0.0   1.0   95.00         out  ...  NaN   NaN     NaN   
 2     8.0    6.0   1.0   0.0  133.33         out  ...  0.0   3.0     0.0   
 3     2.0    4.0   0.0   0.0   50.00         out  ...  0.0   1.0     0.0   
 4     NaN    NaN   NaN   NaN     NaN         NaN  ...  1.0   1.0     0.0   
 
     match_id_y      team1_y  team2_y winner_y   margin_y ground_y  \
 0

In [14]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.utils import resample

In [15]:
# Merge batting summary and match results using 'match_id'
batting_with_results = pd.merge(batting_summary, match_results, on='match_id', how='inner')

# Aggregate team-level features: total runs, wickets, and economy
match_features = batting_with_results.groupby(['match', 'winner']).agg({
    'runs': 'sum',
    'batsmanName': 'count'  # Proxy for wickets
}).reset_index()

# Rename columns for clarity
match_features.rename(columns={'runs': 'total_runs', 'batsmanName': 'total_wickets'}, inplace=True)

# Encode the winner column as target
match_features['winner_encoded'] = match_features['winner'].astype('category').cat.codes

# Features and target
X = match_features[['total_runs', 'total_wickets']]
y = match_features['winner_encoded']

# Balance classes manually
class_counts = y.value_counts()
majority_class = class_counts.idxmax()

# Resample minority classes to match the majority class
resampled_data = []
for class_label in y.unique():
    class_data = match_features[y == class_label]
    if class_label == majority_class:
        resampled_data.append(class_data)
    else:
        resampled_data.append(resample(class_data, replace=True, n_samples=class_counts.max(), random_state=42))

# Combine resampled data
balanced_data = pd.concat(resampled_data)
X_balanced = balanced_data[['total_runs', 'total_wickets']]
y_balanced = balanced_data['winner_encoded']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_balanced, y_balanced, test_size=0.2, random_state=42)

# Train Random Forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Predictions and evaluation
y_pred = rf_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

accuracy, report


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


(0.75,
 '              precision    recall  f1-score   support\n\n           0       1.00      1.00      1.00         2\n           1       0.50      1.00      0.67         1\n           2       0.00      0.00      0.00         2\n           3       1.00      1.00      1.00         1\n           4       0.50      1.00      0.67         1\n           5       1.00      1.00      1.00         2\n           6       0.00      0.00      0.00         0\n           8       0.00      0.00      0.00         1\n          10       1.00      0.67      0.80         3\n          12       0.67      1.00      0.80         2\n          13       1.00      1.00      1.00         1\n\n    accuracy                           0.75        16\n   macro avg       0.61      0.70      0.63        16\nweighted avg       0.71      0.75      0.71        16\n')

In [16]:
# Merge bowling summary and match results using 'match_id'
bowling_with_results = pd.merge(bowling_summary, match_results, on='match_id', how='inner')

# Aggregate bowling features: total wickets, total runs conceded, and average economy rate
bowling_features = bowling_with_results.groupby(['match', 'winner']).agg({
    'wickets': 'sum',    # Total wickets taken
    'runs': 'sum',       # Total runs conceded
    'economy': 'mean'    # Average economy rate
}).reset_index()

# Rename columns for clarity
bowling_features.rename(columns={'wickets': 'total_wickets_taken',
                                 'runs': 'total_runs_conceded',
                                 'economy': 'average_economy'}, inplace=True)

# Merge batting features and bowling features
match_combined_features = pd.merge(match_features, bowling_features, on=['match', 'winner'], how='inner')

# Update features and target
X_combined = match_combined_features[['total_runs', 'total_wickets',
                                      'total_wickets_taken', 'total_runs_conceded', 'average_economy']]
y_combined = match_combined_features['winner_encoded']

# Balance classes
resampled_data = []
for class_label in y_combined.unique():
    class_data = match_combined_features[y_combined == class_label]
    if class_label == y_combined.value_counts().idxmax():
        resampled_data.append(class_data)
    else:
        resampled_data.append(resample(class_data, replace=True, n_samples=y_combined.value_counts().max(), random_state=42))

balanced_combined_data = pd.concat(resampled_data)

# Split into features and target
X_balanced_combined = balanced_combined_data[['total_runs', 'total_wickets',
                                              'total_wickets_taken', 'total_runs_conceded', 'average_economy']]
y_balanced_combined = balanced_combined_data['winner_encoded']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_balanced_combined, y_balanced_combined, test_size=0.2, random_state=42)

# Train Random Forest model
rf_model_combined = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model_combined.fit(X_train, y_train)

# Predictions and evaluation
y_pred_combined = rf_model_combined.predict(X_test)
accuracy_combined = accuracy_score(y_test, y_pred_combined)
report_combined = classification_report(y_test, y_pred_combined)

accuracy_combined, report_combined


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


(0.75,
 '              precision    recall  f1-score   support\n\n           0       1.00      1.00      1.00         2\n           1       0.50      1.00      0.67         1\n           2       0.00      0.00      0.00         2\n           3       0.50      1.00      0.67         1\n           4       1.00      1.00      1.00         1\n           5       1.00      1.00      1.00         2\n           6       0.00      0.00      0.00         0\n           8       0.00      0.00      0.00         1\n          10       1.00      0.67      0.80         3\n          12       1.00      1.00      1.00         2\n          13       1.00      1.00      1.00         1\n          14       0.00      0.00      0.00         0\n\n    accuracy                           0.75        16\n   macro avg       0.58      0.64      0.59        16\nweighted avg       0.75      0.75      0.73        16\n')