# In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import ttest_ind

# Load datasets
# Each CSV is read from the current working directory with pandas' default
# parsing options, in the order listed; the frames are bound to the names on
# the left in that same order.
batting_df, bowling_df, players_df, matches_df = (
    pd.read_csv(csv_name)
    for csv_name in (
        "fact_batting_summary.csv",
        "fact_bowling_summary.csv",
        "dim_players.csv",
        "dim_match_summary.csv",
    )
)


# Data Cleaning
# Drop players that occur in fewer than MIN_APPEARANCES rows of their summary
# table (presumably one row per innings / bowling spell — TODO confirm).
MIN_APPEARANCES = 3

# Batting: count rows per batsman, then keep rows of batsmen at/above the cut.
batting_counts = batting_df['batsmanName'].value_counts()
regular_batsmen = batting_counts.index[batting_counts.to_numpy() >= MIN_APPEARANCES]
batting_df = batting_df.loc[batting_df['batsmanName'].isin(regular_batsmen)]

# Bowling: same rule, keyed on the bowler's name.
bowling_counts = bowling_df['bowlerName'].value_counts()
regular_bowlers = bowling_counts.index[bowling_counts.to_numpy() >= MIN_APPEARANCES]
bowling_df = bowling_df.loc[bowling_df['bowlerName'].isin(regular_bowlers)]

# Exploratory Data Analysis (EDA)
# Two standalone 10x5 figures, each a 30-bin histogram with a KDE overlay.

# Batting strike rates (after the cleaning filter above).
sr_fig, sr_ax = plt.subplots(figsize=(10, 5))
sns.histplot(batting_df['SR'], bins=30, kde=True, ax=sr_ax)
sr_ax.set_title("Distribution of Strike Rates")
plt.show()

# Bowling economy rates (after the cleaning filter above).
econ_fig, econ_ax = plt.subplots(figsize=(10, 5))
sns.histplot(bowling_df['economy'], bins=30, kde=True, ax=econ_ax)
econ_ax.set_title("Distribution of Economy Rates")
plt.show()

# Hypothesis Testing: Compare Openers vs. Other Batsmen based on Strike Rate
# Openers are the rows with battingPos 1 or 2; every row with a higher
# battingPos forms the comparison group.
openers = batting_df.loc[batting_df['battingPos'] <= 2, 'SR']
other_batsmen = batting_df.loc[batting_df['battingPos'] > 2, 'SR']

# ttest_ind propagates NaN by default, so a single missing strike rate would
# turn both the statistic and the p-value into NaN. Missing values are dropped
# for the test only; `openers` / `other_batsmen` themselves are left untouched.
# NOTE(review): this is the default equal-variance (Student) two-sample test;
# the two groups are unlikely to be the same size, so consider
# equal_var=False (Welch) — confirm with whoever owns the analysis.
t_stat, p_val = ttest_ind(openers.dropna(), other_batsmen.dropna())
print(f"T-Test Results: t-statistic={t_stat}, p-value={p_val}")

# Compare Bowlers' Economy Rate Between Teams
# Economy rates of the two bowling sides being compared (exact match on the
# 'bowlingTeam' value).
team1_bowlers = bowling_df.loc[bowling_df['bowlingTeam'] == 'Namibia', 'economy']
team2_bowlers = bowling_df.loc[bowling_df['bowlingTeam'] == 'Sri Lanka', 'economy']

# ttest_ind propagates NaN by default, so one missing economy value would make
# both printed numbers NaN. Missing values are dropped for the test only.
# NOTE(review): default equal-variance (Student) test; consider
# equal_var=False (Welch) if the teams' spreads differ — confirm.
# t_stat / p_val are rebound here, replacing any earlier test's values.
t_stat, p_val = ttest_ind(team1_bowlers.dropna(), team2_bowlers.dropna())
print(f"Bowling Economy Rate Comparison: t-statistic={t_stat}, p-value={p_val}")

# Visualization - Correlation Heatmap
# One 8x6 figure showing the pairwise correlation matrix of the batting
# columns below, with each cell annotated and the 'coolwarm' colour map.
heatmap_fig, heatmap_ax = plt.subplots(figsize=(8, 6))

batting_metrics = ['runs', 'balls', '4s', '6s', 'SR']
batting_corr = batting_df[batting_metrics].corr()  # DataFrame.corr defaults

sns.heatmap(batting_corr, annot=True, cmap='coolwarm', ax=heatmap_ax)
heatmap_ax.set_title("Correlation Heatmap - Batting")
plt.show()
