# List of Functions for EDA

In [None]:
import pandas as pd 
import numpy as np
import os
import matplotlib.pyplot as plt

## Scatter Plot 

In [None]:
#the scatter_plot function takes in a dataframe and two features (column names as strings) as arguments, and outputs a scatter plot representing
#the relationship between these two features as well as the correlation between them. 

def scatter_plot(df, feature1, feature2):
    """Show a scatter plot of two columns annotated with their correlation.

    Args:
        df: DataFrame holding both columns.
        feature1: Name of the column drawn on the x-axis.
        feature2: Name of the column drawn on the y-axis.

    The correlation is computed with ``Series.corr`` using its default
    settings and printed, rounded to two decimals, in a boxed label near the
    top-left corner of the axes. The figure is displayed with ``plt.show()``;
    nothing is returned.
    """
    x_values = df[feature1]
    y_values = df[feature2]
    annotation = f'Correlation: {x_values.corr(y_values):.2f}'
    box_style = {'facecolor': 'white', 'alpha': 0.5, 'edgecolor': 'black'}

    plt.figure()
    # Heavy transparency so that dense regions of overlapping points stand out.
    plt.scatter(x_values, y_values, alpha=0.1)

    axes = plt.gca()
    axes.set_title(f'Scatter Plot of {feature1} vs {feature2}')
    axes.set_xlabel(feature1)
    axes.set_ylabel(feature2)

    # Position is given in axes-relative coordinates (0-1), so the label stays
    # in the same corner whatever the data range is.
    axes.text(
        0.1,
        0.9,
        annotation,
        transform=axes.transAxes,
        fontsize=12,
        bbox=box_style,
    )

    plt.show()

## Box Plots

In [None]:
#the grouped_boxplot function allows us to construct a box plot which compares the distribution of some feature (argument 2) when grouping the
#observations by some other feature (group_feature - argument 3). for example, we would use this function to construct a boxplot which allows
#us to compare the distribution of xG when grouping the observations by the number of goals scored. this is useful because it allows us
#to identify whether or not there is any significant relationship between the feature of interest and goals/assists or any other target variable

def grouped_boxplot(df, feature, group_feature):
    """Show box plots of ``feature``, one box per value of ``group_feature``.

    Args:
        df: DataFrame holding both columns.
        feature: Name of the column whose distribution is plotted (y-axis).
        group_feature: Name of the column used to group the rows (x-axis).

    The mean of each group is marked on its box (``showmeans=True``). The
    figure is displayed with ``plt.show()``; nothing is returned.
    """
    # Create the axes explicitly and hand them to pandas. When ``by`` is used
    # without ``ax``, pandas opens a figure of its own, so a bare
    # ``plt.figure()`` beforehand would be left behind as an empty figure.
    fig, ax = plt.subplots()
    df.boxplot(column=feature, by=group_feature, grid=False, showmeans=True, ax=ax)

    ax.set_title(f'Box Plot of {feature} vs {group_feature}')
    # pandas adds its own "Boxplot grouped by ..." super-title; blank it out so
    # only the title set above is shown.
    fig.suptitle('')
    ax.set_xlabel(group_feature)
    ax.set_ylabel(feature)
    plt.show()

In [None]:
#category_boxplot is a slightly different function which groups observations based on whether or not group_feature is zero/non-zero. For example, we
#may group observations that have 0 goals together, and observations that have at least 1 goal together. This function also prints the mean/median
#of the feature of interest. The text_diff argument specifies the vertical distance between the text which prints the median and the text which prints
#the mean (we may want to change this value if the text is overlapping)

def category_boxplot(df, feature, group_feature, text_diff=2, dropna=False):
    """Show two box plots of ``feature``: rows where ``group_feature`` is 0 vs. not 0.

    Args:
        df: DataFrame holding both columns.
        feature: Name of the column whose distribution is plotted.
        group_feature: Name of the column used to split the rows into a
            "zero" group (``== 0``) and a "non-zero" group (``!= 0``).
        text_diff: Vertical offset, in data units of ``feature``, added to the
            y-position of the "Mean" label. Adjust it when the mean and median
            labels overlap.
        dropna: When true, missing values of ``feature`` are removed from each
            group before plotting and before computing the statistics.

    The median and mean of each group are written next to its box. The figure
    is displayed with ``plt.show()``; nothing is returned.
    """
    plt.figure()

    # Split the feature of interest into the two categories.
    data = [
        df.loc[df[group_feature] == 0, feature],
        df.loc[df[group_feature] != 0, feature],
    ]
    if dropna:
        data = [values.dropna() for values in data]

    plt.boxplot(data, showmeans=True)
    # Boxes are drawn at x = 1, 2. The tick labels are set here rather than via
    # the ``labels`` keyword of ``boxplot``, which newer matplotlib releases
    # deprecate in favour of ``tick_labels``.
    plt.xticks([1, 2], [f'0 {group_feature}', f'Non-zero {group_feature}'])
    plt.title(f'Box Plot of {feature} vs {group_feature} Category')
    plt.xlabel(f'{group_feature} Category')
    plt.ylabel(feature)

    # Annotate each box slightly to its right with its median and mean.
    for position, values in enumerate(data, start=1):
        label_x = position + 0.2

        # np.median / np.mean give NaN for an empty group or one that still
        # contains missing values; a label cannot be placed at NaN, so skip it.
        median = np.median(values)
        if np.isfinite(median):
            plt.text(label_x, median, f'Median: {median:.2f}',
                     ha='left', va='bottom', color='blue')

        mean = np.mean(values)
        if np.isfinite(mean):
            plt.text(label_x, mean + text_diff, f'Mean: {mean:.2f}',
                     ha='left', va='top', color='green')

    plt.show()

## Frequency Plots (Histograms)

In [None]:
#freq_plot allows us to construct a histogram/bar plot showing the frequency of the variable we wish to visualise. If we wish to plot the
#frequency of a feature in a dataframe, then we need to specify the dataframe that the feature comes from (df argument). If not (i.e. if we want
# to visualise the frequency of a series of values), then ignore the df argument, as this is initialised to a default value of None. In the case
#that we are constructing a histogram/bar plot of a series of values, we need to specify the plot_title argument (usually this will just be
# the name of the series of values e.g. Carries per 90)

def freq_plot(feature, bin_number, plot_title=None, df=None, font_size=7, text_rotate=0):
    """Show a histogram of a DataFrame column or of a standalone series of values.

    Args:
        feature: Column name when ``df`` is given; otherwise the values to
            plot themselves.
        bin_number: Passed to ``plt.hist`` as ``bins``.
        plot_title: Name used in the title and x-axis label when plotting a
            standalone series (``df`` is None). Not used when ``df`` is given.
            If omitted, the series' ``name`` attribute is used when it has
            one, else the generic label ``'Values'``.
        df: DataFrame that ``feature`` belongs to, or None when ``feature``
            already holds the values.
        font_size: Font size of the x-axis tick labels.
        text_rotate: Rotation, in degrees, of the x-axis tick labels.

    Exactly one histogram is drawn and displayed with ``plt.show()``; nothing
    is returned.
    """
    if df is not None:
        values = df[feature]
        axis_label = feature
        title = f'Histogram of {feature} Values'
        # rwidth is the bar width *relative to the bin width*, so the bars keep
        # a gap between them whatever the bin size is (a plain ``width`` would
        # fix the bar width in data units instead).
        hist_options = {'rwidth': 0.8, 'align': 'mid'}
    else:
        values = feature
        axis_label = plot_title
        if axis_label is None:
            axis_label = getattr(feature, 'name', None)
        if axis_label is None:
            axis_label = 'Values'
        title = f'Histogram of {axis_label}'
        hist_options = {}

    plt.figure()
    plt.hist(values, bins=bin_number, **hist_options)
    plt.title(title)
    plt.xlabel(axis_label)
    plt.ylabel('Frequency')
    plt.xticks(rotation=text_rotate, fontsize=font_size)
    plt.show()

In [5]:
#discrete_freq_plot should be used instead of freq_plot if we want to visualise the frequency of a variable that has discrete values (i.e. we don't
# need to bin the values). the xticks argument controls whether or not the x-axis labels need to be spaced out. if the discrete values in the variable
# are quite limited (i.e. they only range from 0 to 10), then we don't need to mess with the x-axis labels. However, if the x-axis labels span quite
# a large range (i.e. the Touches feature has min value 0 and max value 193), if we just leave it alone, then the x-axis labels will be bunched up.
#Therefore, we need to set this argument to True to make sure that the x-axis labels have intervals between them. The diff argument controls the
#spacing between consecutive x-axis labels (if setting diff = 5, then we will have x-axis labels going from 0 to 5 to 10...)

def discrete_freq_plot(feature, df, font_size=7, text_rotate=0, xticks=False, diff=1):
    """Show a bar plot of how often each distinct value of ``df[feature]`` occurs.

    Args:
        feature: Name of the column to count.
        df: DataFrame holding the column.
        font_size: Font size of the x-axis tick labels.
        text_rotate: Rotation, in degrees, of the x-axis tick labels.
        xticks: When true, only every ``diff``-th value between the column's
            minimum and maximum is labelled on the x-axis, and every integer
            in that range gets a bar (height 0 if it never occurs). This path
            requires integer values, since it builds a ``range`` from the
            minimum and maximum.
        diff: Step between labelled values when ``xticks`` is true.

    The figure is displayed with ``plt.show()``; nothing is returned.
    """
    # Frequency of each distinct value, ordered by value.
    counts = df[feature].value_counts().sort_index()

    tick_values = None
    if xticks:
        range_min = df[feature].min()
        range_max = df[feature].max()
        # A pandas bar plot puts the bars at positions 0..n-1, not at the
        # values themselves. Giving every integer in [min, max] its own bar
        # makes "position = value - range_min" hold, so the thinned-out labels
        # below line up with the right bars even if some values never occur.
        counts = counts.reindex(range(range_min, range_max + 1), fill_value=0)
        tick_values = range(range_min, range_max + 1, diff)

    _, ax = plt.subplots()
    counts.plot(kind='bar', alpha=0.7, ax=ax)
    ax.set_xlabel(feature)
    ax.set_ylabel('Frequency')
    ax.set_title(f'Frequency of {feature}')

    if tick_values is not None:
        ax.set_xticks([value - range_min for value in tick_values])
        ax.set_xticklabels([str(value) for value in tick_values])

    plt.xticks(rotation=text_rotate, fontsize=font_size)
    plt.show()
