# List of Functions for EDA

In [None]:
import pandas as pd 
import numpy as np
import os
import matplotlib.pyplot as plt

## Scatter Plot 

In [None]:
#the scatter_plot function takes in a dataframe and two features (column names as strings) as arguments, and outputs a scatter plot representing
#the relationship between these two features as well as the correlation between them. 

def scatter_plot(df, feature1, feature2):
    """Show a scatter plot of two columns, annotated with their correlation.

    Args:
        df: DataFrame containing both columns.
        feature1: Name of the column drawn on the x-axis.
        feature2: Name of the column drawn on the y-axis.

    The correlation is whatever ``Series.corr`` returns with its default
    settings. Nothing is returned; the figure is displayed with ``plt.show()``.
    """
    x_values = df[feature1]
    y_values = df[feature2]
    # Computed before a figure is opened, so a failure here leaves no
    # half-built figure behind.
    corr_value = x_values.corr(y_values)

    plt.figure()
    # Low alpha so that dense regions of overlapping points remain readable.
    plt.scatter(x_values, y_values, alpha=0.1)

    axes = plt.gca()
    axes.set_title(f'Scatter Plot of {feature1} vs {feature2}')
    axes.set_xlabel(feature1)
    axes.set_ylabel(feature2)

    # Axes-fraction coordinates pin the label to the upper-left corner
    # regardless of the data range.
    label_box = dict(facecolor='white', alpha=0.5, edgecolor='black')
    axes.text(
        0.1,
        0.9,
        f'Correlation: {corr_value:.2f}',
        transform=axes.transAxes,
        fontsize=12,
        bbox=label_box,
    )

    plt.show()

## Box Plots

In [None]:
#the grouped_boxplot function allows us to construct a box plot which compares the distribution of some feature (argument 2) when grouping the 
#observations by some other feature (group_feature - argument 3). For example, we would use this function to construct a boxplot which allows 
#us to compare the distribution of xG when grouping the observations by the number of goals scored. This is useful because it allows us 
#to identify whether or not there is any significant relationship between the feature of interest and goals/assists or any other target variable.

def grouped_boxplot(df, feature, group_feature):
    """Show box plots of ``feature``, one box per distinct ``group_feature`` value.

    Args:
        df: DataFrame containing both columns.
        feature: Name of the column whose distribution is plotted (y-axis).
        group_feature: Name of the column used to group the rows (x-axis).

    Nothing is returned; the figure is displayed with ``plt.show()``.
    """
    # Create the figure and axes once and hand the axes to pandas. Without
    # ``ax=``, ``DataFrame.boxplot(by=...)`` opens a figure of its own, which
    # would leave an extra, empty figure behind on every call.
    fig, ax = plt.subplots()
    df.boxplot(column=feature, by=group_feature, grid=False, showmeans=True, ax=ax)

    ax.set_title(f'Box Plot of {feature} vs {group_feature}')
    # pandas adds an automatic "Boxplot grouped by ..." super-title; blank it
    # so only the title set above is shown.
    fig.suptitle('')
    ax.set_xlabel(group_feature)
    ax.set_ylabel(feature)
    plt.show()

In [None]:
#category_boxplot is a slightly different function which groups observations based on whether group_feature is zero or non-zero. For example, we
#may group observations that have 0 goals together, and observations that have at least 1 goal together. This function also prints the mean/median
#of the feature of interest. The text_diff argument specifies the vertical distance between the text which prints the median and the text which prints
#the mean (we may want to change this value if the text is overlapping).

def category_boxplot(df, feature, group_feature, text_diff=2):
    """Show box plots of ``feature`` for zero vs. non-zero ``group_feature`` rows.

    Rows are split into two groups: those where ``group_feature == 0`` and
    those where ``group_feature != 0``. Each box is annotated with the
    group's median and mean.

    Args:
        df: DataFrame containing both columns.
        feature: Name of the column whose distribution is plotted (y-axis).
        group_feature: Name of the column used to split rows into the
            zero / non-zero groups.
        text_diff: Vertical offset, in data units, added to the mean label's
            position; adjust it when the mean and median labels overlap.

    Missing values in ``feature`` are dropped before plotting, and a group
    with no remaining observations is drawn without annotations.
    Nothing is returned; the figure is displayed with ``plt.show()``.
    """
    # Create the figure.
    plt.figure()

    # Split into the two categories. NaNs are dropped because a single
    # missing value would otherwise turn the box statistics, the median and
    # the mean of the whole group into NaN.
    groups = [
        df[df[group_feature] == 0][feature].dropna(),   # group_feature == 0
        df[df[group_feature] != 0][feature].dropna(),   # group_feature != 0
    ]
    tick_labels = [f'0 {group_feature}', f'Non-zero {group_feature}']

    # Draw the boxes, then label the ticks separately: the ``labels=`` keyword
    # of ``boxplot`` is deprecated in recent matplotlib releases, whereas
    # ``xticks`` works on old and new versions alike. Boxes sit at 1..N.
    plt.boxplot(groups, showmeans=True)
    plt.xticks(range(1, len(groups) + 1), tick_labels)
    plt.title(f'Box Plot of {feature} vs {group_feature} Category')
    plt.xlabel(f'{group_feature} Category')
    plt.ylabel(feature)

    # Annotate each box with its median and mean.
    for position, values in enumerate(groups, start=1):
        if values.empty:
            # No data: the statistics would be NaN, so skip the labels.
            continue

        # Median (the horizontal line in the box).
        median = np.median(values)
        plt.text(position + 0.2, median, f'Median: {median:.2f}',
                 ha='left', va='bottom', color='blue')

        # Mean (the green triangle in the boxplot).
        mean = np.mean(values)
        plt.text(position + 0.2, mean + text_diff, f'Mean: {mean:.2f}',
                 ha='left', va='top', color='green')

    plt.show()