# In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

def load_and_preprocess_data(df):
    """
    Return a cleaned copy of ``df`` ready for the analysis functions.

    The input frame is never modified. On the copy:

    * ``Height`` is overwritten with synthetic values drawn uniformly from
      [150, 180) cm; whatever the column held before is discarded. The draw
      uses a private generator seeded with 42, so the result is reproducible
      and the process-wide NumPy random state is left untouched.
    * ``Salary`` and ``Age`` are converted with ``pd.to_numeric``. Entries
      that cannot be parsed as numbers become NaN. Nothing is stripped
      first, so a value such as ``"$1,000"`` ends up as NaN.

    Parameters:
        df: DataFrame containing at least ``Salary`` and ``Age`` columns.

    Returns:
        A new DataFrame with the columns described above replaced.

    Raises:
        KeyError: if ``Salary`` or ``Age`` is missing from ``df``.
    """
    # Work on a copy so the caller's frame is left as it was
    df = df.copy()

    # A dedicated legacy RandomState yields the same sequence that
    # np.random.seed(42) + np.random.uniform would, without reseeding the
    # global generator that other code in the session may rely on.
    rng = np.random.RandomState(42)
    df['Height'] = rng.uniform(150, 180, len(df))

    # Unparseable entries are coerced to NaN rather than raising
    df['Salary'] = pd.to_numeric(df['Salary'], errors='coerce')
    df['Age'] = pd.to_numeric(df['Age'], errors='coerce')

    return df

def analyze_team_distribution(df):
    """
    Compute each team's share of the rows in ``df`` and chart the first ten.

    Shares are expressed as percentages of ``len(df)``, rounded to two
    decimals, in the order produced by ``value_counts`` (most frequent
    first). A pie chart of the first ten entries is shown as a side effect.

    Returns the full percentage Series, indexed by team.
    """
    row_total = len(df)
    per_team_counts = df['Team'].value_counts()
    team_percentages = per_team_counts / row_total * 100
    team_percentages = team_percentages.round(2)

    # Only the leading ten entries go on the chart; the return value is complete
    leading = team_percentages[:10]

    plt.figure(figsize=(12, 8))
    plt.pie(leading, labels=leading.index, autopct='%1.1f%%')
    plt.title('Top 10 Teams by Employee Distribution')
    plt.axis('equal')
    plt.show()

    return team_percentages

def analyze_positions(df):
    """
    Count rows per position and show the counts as a bar chart.

    Returns the ``value_counts`` Series for the ``Position`` column.
    """
    counts_by_position = df['Position'].value_counts()
    labels = counts_by_position.index
    heights = counts_by_position.values

    plt.figure(figsize=(10, 6))
    sns.barplot(x=labels, y=heights)
    plt.title('Distribution of Positions')
    # Tilt the tick labels so they do not overlap
    plt.xticks(rotation=45)
    plt.ylabel('Number of Players')
    plt.tight_layout()
    plt.show()

    return counts_by_position

def analyze_age_distribution(df):
    """
    Show a 20-bin histogram of the ``Age`` column and summarise it.

    Returns the result of ``describe()`` on the ``Age`` column.
    """
    # Histogram of ages
    plt.figure(figsize=(10, 6))
    sns.histplot(data=df, x='Age', bins=20)
    plt.title('Age Distribution of Players')
    plt.xlabel('Age')
    plt.ylabel('Count')
    plt.show()

    # Summary statistics for the same column
    return df['Age'].describe()

def analyze_salary_by_team_position(df):
    """
    Compute mean salary per team and per position, and chart both.

    Each result is a Series of group means sorted from highest to lowest.
    Two bar charts are shown as a side effect: the first ten teams, then
    every position.

    Returns a ``(team_salary, pos_salary)`` tuple.
    """
    def _mean_salary_by(column):
        # Mean salary for each group, highest first
        grouped_mean = df.groupby(column)['Salary'].mean()
        return grouped_mean.sort_values(ascending=False)

    team_salary = _mean_salary_by('Team')
    pos_salary = _mean_salary_by('Position')

    # (figure size, x labels, bar heights, title) for each chart, in display order
    charts = (
        ((12, 6), team_salary.index[:10], team_salary.values[:10],
         'Average Salary by Team (Top 10)'),
        ((8, 6), pos_salary.index, pos_salary.values,
         'Average Salary by Position'),
    )

    for size, labels, heights, title in charts:
        plt.figure(figsize=size)
        sns.barplot(x=labels, y=heights)
        plt.title(title)
        plt.xticks(rotation=45)
        plt.ylabel('Average Salary ($)')
        plt.tight_layout()
        plt.show()

    return team_salary, pos_salary

def analyze_age_salary_correlation(df):
    """
    Measure how ``Age`` and ``Salary`` move together and plot one against the other.

    The coefficient comes from ``Series.corr`` with its default settings and
    is shown, to two decimals, in the scatter plot's title.

    Returns the correlation coefficient.
    """
    ages = df['Age']
    salaries = df['Salary']
    correlation = ages.corr(salaries)

    chart_title = 'Age vs Salary (Correlation: {:.2f})'.format(correlation)

    plt.figure(figsize=(10, 6))
    sns.scatterplot(data=df, x='Age', y='Salary')
    plt.title(chart_title)
    plt.xlabel('Age')
    plt.ylabel('Salary ($)')
    plt.show()

    return correlation

def generate_insights(df):
    """
    Collect headline figures and notable rows from the dataset.

    Parameters:
        df: DataFrame with ``Age``, ``Salary``, ``Team`` and ``Position``
            columns.

    Returns:
        dict with these keys:
            total_players        number of rows in ``df``
            avg_age, avg_salary  column means
            num_teams            distinct non-null ``Team`` values
            num_positions        distinct non-null ``Position`` values
            highest_paid_player  row with the largest ``Salary``
            youngest_player      row with the smallest ``Age``
            oldest_player        row with the largest ``Age``
        Each row entry is a single Series (the first match on ties), or
        ``None`` when the column has no non-missing value to rank by.

    Raises:
        KeyError: if a required column is missing.
    """
    def _row_at_extreme(column, largest):
        # Return the first row holding the max/min of `column`, or None.
        values = df[column]
        if not values.notna().any():
            # Empty or all-NaN column: idxmax/idxmin has nothing to pick
            return None
        # Rank by position rather than by label so that duplicate index
        # labels still resolve to exactly one row.
        by_position = values.reset_index(drop=True)
        pos = by_position.idxmax() if largest else by_position.idxmin()
        return df.iloc[pos]

    insights = {
        'total_players': len(df),
        'avg_age': df['Age'].mean(),
        'avg_salary': df['Salary'].mean(),
        'num_teams': df['Team'].nunique(),
        'num_positions': df['Position'].nunique(),
        'highest_paid_player': _row_at_extreme('Salary', largest=True),
        'youngest_player': _row_at_extreme('Age', largest=False),
        'oldest_player': _row_at_extreme('Age', largest=True),
    }

    return insights

# Main analysis function
def run_analysis(df):
    """
    Run every analysis step on ``df`` and gather the results.

    The frame is preprocessed first; each analysis then runs on the
    preprocessed copy in a fixed order, showing its charts as it goes.

    Returns a dict keyed by analysis name holding each step's result.
    """
    prepared = load_and_preprocess_data(df)

    results = {}
    results['team_distribution'] = analyze_team_distribution(prepared)
    results['position_distribution'] = analyze_positions(prepared)
    results['age_statistics'] = analyze_age_distribution(prepared)

    # This step yields two Series: per-team and per-position means
    by_team, by_position = analyze_salary_by_team_position(prepared)
    results['team_salary'] = by_team
    results['position_salary'] = by_position

    results['age_salary_correlation'] = analyze_age_salary_correlation(prepared)
    results['insights'] = generate_insights(prepared)

    return results