In [1]:
from google.colab import drive
drive.mount('/content/gdrive')


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import percentileofscore
from tabulate import tabulate
from scipy.stats import norm
import seaborn as sns
from scipy.stats import ttest_ind

# Both CSVs were produced by the script "Comparing Premier League table to top goal scorer":
# df holds each club's top scorer per season, df2 holds the historical league table.
df = pd.read_csv('/content/gdrive/My Drive/Colab Notebooks/combined_top_scorer.csv')
df2 = pd.read_csv('/content/gdrive/My Drive/Colab Notebooks/combined_data.csv')

# Earlier joins did not line up correctly, so the join keys are harmonised first.
# Step 1: give the top-scorer frame the same key column names as the league table.
df = df.rename(columns={'Squad': 'Team', 'Season': 'Year'})

# Step 2: apply the same clean-up to the keys of both frames - use one spelling
# for West Ham and strip surrounding whitespace so equal keys compare equal.
for frame in (df, df2):
    frame['Team'] = frame['Team'].str.replace("West Ham United", "West Ham", case=False).str.strip()
    frame['Year'] = frame['Year'].str.strip()

# Columns carried forward from the joined frame.
selected_columns = [
    'Year', 'Position', 'Team', 'Points', 'GP', 'GD', 'GF',
    'GA', 'HGF', 'HGA', 'AGF', 'AGA', 'PPG', 'Relegation', 'Highest Scorer Goal Count'
]

# Left join: every top-scorer row is kept and league-table columns are attached
# wherever (Year, Team) matches.
result_df = df.merge(df2, on=['Year', 'Team'], how='left')[selected_columns]




# Some Premier League seasons had more than 38 games, so every counting stat is
# rescaled to a 38-game season: (stat / games played) * 38, rounded.
per_38_sources = {
    'Goals For 38 Games': 'GF',
    'Goals Against 38 Games': 'GA',
    'Highest Scorer 38 Games': 'Highest Scorer Goal Count',
    'Goal Difference 38 Games': 'GD',
}
for scaled_name, raw_name in per_38_sources.items():
    result_df[scaled_name] = (result_df[raw_name] / result_df['GP'] * 38).round()

# PPG is multiplied straight up to a 38-game total (left unrounded).
result_df['PPG 38 Games'] = 38 * result_df['PPG']

# Rows whose 'Relegation' label marks the team as relegated
relegated_teams = result_df[result_df['Relegation'] == 'Relegated']

# Per season, the best 38-game points total among the relegated teams. This is the
# reference used to decide who else was in jeopardy of being relegated.
highest_relegation_points = (
    relegated_teams.groupby('Year')['PPG 38 Games'].max()
    .reset_index()
    .rename(columns={'PPG 38 Games': 'Highest Relegated Teams Points'})
)

results_df = pd.merge(result_df, highest_relegation_points, on=['Year'], how='left')

# A surviving team within 6 points (i.e. 2 wins) of the highest relegated team is
# treated as being in fear of relegation.
# The comparison uses 'PPG 38 Games' rather than raw 'Points' because the threshold
# column is itself derived from 'PPG 38 Games'; raw points would be on a different
# scale for seasons that did not have 38 games.
conditions = [
    (results_df['Relegation'] == 'Relegated'),
    (results_df['PPG 38 Games'] <= results_df['Highest Relegated Teams Points'] + 6)
    # Add more conditions as needed
    ]

choices = ['Relegated', 'Close to being Relegated']  # Corresponding labels for each condition

# np.select takes the first matching condition, so relegated teams keep 'Relegated'
results_df['Relegation'] = np.select(conditions, choices, default='Not Relegated')

#Adding Percentiles
# Only teams that were not relegated are split into position quartiles.
no_relegation_df = results_df[results_df['Relegation'] != 'Relegated']

# Calculate the maximum position, top quartile, middle quartile, and third quartile.
# Another way to compare teams and why they did not get relegated.
# The per-season maximum is computed once and reused for every cut-off.
max_position_by_year = no_relegation_df.groupby('Year')['Position'].max()
top_quartile = max_position_by_year * 1/4
middle_quartile = max_position_by_year * 2/4
third_quartile = max_position_by_year * 3/4

# Convert the results to DataFrames
max_position_df = max_position_by_year.reset_index(name='Max Position')
top_quartile_df = top_quartile.reset_index(name='Top Quartile')
middle_quartile_df = middle_quartile.reset_index(name='Middle Quartile')
third_quartile_df = third_quartile.reset_index(name='Third Quartile')

# Merge the per-season cut-offs onto no_relegation_df
for cutoff_df in (max_position_df, top_quartile_df, middle_quartile_df, third_quartile_df):
    no_relegation_df = pd.merge(no_relegation_df, cutoff_df, on='Year', how='left')

# Decide whether a team is a top, middle, third or bottom quartile team. I'll use this
# as another way to compare how teams survived. np.select takes the first matching
# condition, so each team gets the best quartile its position qualifies for.
conditions2 = [
    (no_relegation_df['Position'] <= no_relegation_df['Top Quartile']),
    (no_relegation_df['Position'] <= no_relegation_df['Middle Quartile']),
    (no_relegation_df['Position'] <= no_relegation_df['Third Quartile'])
    ]

choices2 = ['NR Top Quartile', 'NR Middle Quartile','NR Third Quartile']  # Corresponding labels for each condition

no_relegation_df['Position Grouping'] = np.select(conditions2, choices2, default='NR Bottom Quartile')

# Keep only the join keys and the new label
selected_columns_results = [
    'Year', 'Team', 'Position Grouping'
]

no_relegation_df = no_relegation_df[selected_columns_results]

# Merge results_df with no_relegation_df to attach 'Position Grouping' to every team.
# Relegated teams have no match in no_relegation_df, so they get NaN here.
merged_df = pd.merge(results_df, no_relegation_df, on=['Year', 'Team'], how='left')

# Keep only the columns that are needed
selected_columns_results = [
    'Year', 'Position', 'Team', 'Points', 'GP', 'GD', 'GF',
    'GA', 'HGF', 'HGA', 'AGF', 'AGA', 'PPG', 'Relegation', 'Highest Scorer Goal Count',
     'Goals For 38 Games', 'Goals Against 38 Games','Goal Difference 38 Games',
    'Highest Scorer 38 Games', 'PPG 38 Games', 'Position Grouping','Highest Relegated Teams Points'
]

# .copy() makes merged_df an independent frame so the column assignments below
# write to it directly instead of to a selection of the previous frame.
merged_df = merged_df[selected_columns_results].copy()

# Fill NaN values in 'Position Grouping' with values from the 'Relegation' column.
# Now I will have 5 groups that will allow me to compare teams in a different way.
# Assigned back explicitly: calling fillna(..., inplace=True) on a selected column
# is deprecated and does not update the frame under pandas Copy-on-Write.
merged_df['Position Grouping'] = merged_df['Position Grouping'].fillna(merged_df['Relegation'])

conditions3 = [
    (merged_df['Position Grouping'] == 'NR Top Quartile'),
    (merged_df['Position Grouping'] == 'NR Middle Quartile'),
    (merged_df['Position Grouping'] == 'NR Third Quartile'),
    (merged_df['Position Grouping'] == 'NR Bottom Quartile')
    ]

choices3 = [1, 2,3,4]  # Corresponding labels for each condition

# Numeric version of the grouping so it can be used in correlations (not possible
# with the string labels). Anything that is not one of the four NR quartiles gets 5.
merged_df['Position Grouping Number'] = np.select(conditions3, choices3, default=5)


merged_df.to_csv('/content/gdrive/My Drive/Colab Notebooks/combining_all_data.csv', index=False)

#I am finding the highest relegated team of each season and will compare them to the close to relegated teams
#to get a better understanding of how GF, GA, GD, and Top Goal Scorer matter.
# The points-equality test applies only to relegated rows. Previously it was ANDed with both
# statuses, which dropped every close-to-relegated team that was not exactly tied on points
# with the highest relegated team.
is_highest_relegated = (
    (merged_df['Relegation'] == 'Relegated')
    & (merged_df['PPG 38 Games'] == merged_df['Highest Relegated Teams Points'])
)
is_close_to_relegated = merged_df['Relegation'] == 'Close to being Relegated'
relegation_condition = is_highest_relegated | is_close_to_relegated
highest_relegated_teams_vs_close_to_being_relegated_df = merged_df[relegation_condition]






#Do you need the magical 40 Points in order to stay in the Premier League?

#First thing I need to do to understand this is get the points of the highest relegated team in each season.
#I created this field already but I need a unique list of Year and Highest Relegated Teams Points
survival = relegated_teams.groupby('Year')['PPG 38 Games'].max().reset_index()

# To figure out how many points would help you survive, I am rounding the points and then adding 1 point,
# i.e. one more point than the best relegated team had
survival['PPG 38 Games'] = survival['PPG 38 Games'].round() + 1

# Plotting the histogram
plt.figure(figsize=(10, 6))
plt.hist(survival['PPG 38 Games'], bins=10, color='blue', alpha=0.7)
plt.title('Amount of Points Needed to Survive')
plt.xlabel('Points')
plt.ylabel('Frequency')
plt.grid(True)


# Calculate mean, 75th percentile, maximum, and the percentile rank of 40 points
mean_points = survival['PPG 38 Games'].mean().round()
percentile_75 = np.percentile(survival['PPG 38 Games'], 75)
max_points = survival['PPG 38 Games'].max()
percentile_40 = percentileofscore(survival['PPG 38 Games'], 40)
print(percentile_40)


# Add lines for mean, 75th percentile, max, and the 40-point mark
plt.axvline(mean_points, color='red', linestyle='dashed', linewidth=2, label='Mean')
plt.axvline(percentile_75, color='orange', linestyle='dashed', linewidth=2, label='75th Percentile')
plt.axvline(max_points, color='purple', linestyle='dashed', linewidth=2, label='Max')
# The label is built from the computed percentile so it cannot drift from the data
# (it used to be hard-coded as '84th Percentile').
plt.axvline(40, color='blue', linestyle='dashed', linewidth=2, label=f'40 Points (percentile {percentile_40:.0f})')

# Set x-axis ticks at regular intervals
plt.xticks(np.arange(min(survival['PPG 38 Games']), max(survival['PPG 38 Games']) + 1, 1))

print(survival)

# Add legend
plt.legend()

plt.show()


#The mean shows how many points are needed on average. The 75th percentile, which is 39 points, is the total that would have kept a team up in 75% of seasons.
#The golden total of 40 points would keep you up 84% of the time. I would define staying up comfortably as 75% of the time, which is 39 points. However, if you want
#to stay up 100% of the time, then 45 points would be needed.

#Question 2: Do you need to have someone score over 10 goals in order to stay in the premier league?

#So I can see in my results which part is answering what
print('\033[1mDo you need to have someone score over 10 goals in order to stay in the premier league?\033[0m')

# Top-scorer goal counts (scaled to 38 games) for each group compared below
close_to_being_relegated_goals = merged_df[merged_df['Relegation'] == 'Close to being Relegated']['Highest Scorer 38 Games']
relegated_goals = merged_df[merged_df['Relegation'] == 'Relegated']['Highest Scorer 38 Games']
close_to_being_goals_bottom_quartile_not_relegated = merged_df[merged_df['Position Grouping'] == 'NR Bottom Quartile']['Highest Scorer 38 Games']

# Boxplot to compare distributions across the position groupings
plt.figure(figsize=(10, 6))
sns.boxplot(x='Position Grouping', y='Highest Scorer 38 Games', data=merged_df)
plt.title('Distribution of Goals by Top Scorer Based on Position Grouping')
plt.show()

# t-test: bottom quartile of not-relegated teams vs. relegated teams
t_stat, p_value = ttest_ind(close_to_being_goals_bottom_quartile_not_relegated, relegated_goals)
print(f'T-test results: t-statistic = {t_stat}, p-value = {p_value}')

# Boxplot by relegation status. It plots the 38-game column so the picture matches
# the t-test that follows (it used to plot the raw 'Highest Scorer Goal Count').
plt.figure(figsize=(10, 6))
sns.boxplot(x='Relegation', y='Highest Scorer 38 Games', data=merged_df)
plt.title('Distribution of Goals by Top Scorer for Relegated and Non-Relegated Teams')
plt.show()

# t-test: close-to-relegated teams vs. relegated teams
t_stat, p_value = ttest_ind(close_to_being_relegated_goals, relegated_goals)
print(f'T-test results: t-statistic = {t_stat}, p-value = {p_value}')


#Boxplot comparing the highest Point Relegation team and the close to relegated teams for their highest scorer Goal Count
plt.figure(figsize=(10, 6))
sns.boxplot(x='Relegation', y='Highest Scorer 38 Games', data=highest_relegated_teams_vs_close_to_being_relegated_df)
plt.title('Distribution of Goals by Top Scorer for Highest Positioned Relegated Team')
plt.show()

# Same 38-game column as close_to_being_relegated_goals so both samples in the
# t-test are on the same scale (it used to read the raw 'Highest Scorer Goal Count').
max_relegated_goals = highest_relegated_teams_vs_close_to_being_relegated_df[highest_relegated_teams_vs_close_to_being_relegated_df['Relegation'] == 'Relegated']['Highest Scorer 38 Games']

# t-test: close-to-relegated teams vs. the highest-placed relegated teams
t_stat, p_value = ttest_ind(close_to_being_relegated_goals, max_relegated_goals)
print(f'T-test results: t-statistic = {t_stat}, p-value = {p_value}')



#When comparing the Bottom Quartile position group with the Relegated teams, you can see it is statistically significant and when you compare teams close to relegation,
#you also get that it is significant with a t value close to 2 and a p value of .05.  Therefore, it seems that it is something that is statistically significant



#Question 3: Goals Against is more important than Goals For (will also look at goal difference)
#So I can see in my results which part is answering what
print('\033[1mGoals Against is more Important than Goals For\033[0m')

# Shorter alias for the frame holding each season's highest relegated team plus the close-to-relegated teams
_top_relegated_vs_close_df = highest_relegated_teams_vs_close_to_being_relegated_df

# Goals For samples: close to relegated, relegated, bottom quartile of not-relegated, highest relegated
close_to_being_relegated_goals_for = merged_df[merged_df['Relegation'] == 'Close to being Relegated']['Goals For 38 Games']
relegated_goals_for = merged_df[merged_df['Relegation'] == 'Relegated']['Goals For 38 Games']
bottom_quartile_not_relegated_goals_for =  merged_df[merged_df['Position Grouping'] == 'NR Bottom Quartile']['Goals For 38 Games']
max_relegated_gf = _top_relegated_vs_close_df[_top_relegated_vs_close_df['Relegation'] == 'Relegated']['Goals For 38 Games']

# Goals Against samples for the same four groups
close_to_being_relegated_goals_against = merged_df[merged_df['Relegation'] == 'Close to being Relegated']['Goals Against 38 Games']
relegated_goals_against = merged_df[merged_df['Relegation'] == 'Relegated']['Goals Against 38 Games']
bottom_quartile_not_relegated_goals_against =  merged_df[merged_df['Position Grouping'] == 'NR Bottom Quartile']['Goals Against 38 Games']
max_relegated_ga = _top_relegated_vs_close_df[_top_relegated_vs_close_df['Relegation'] == 'Relegated']['Goals Against 38 Games']

# Goal Difference samples for the same four groups
close_to_being_relegated_goals_difference = merged_df[merged_df['Relegation'] == 'Close to being Relegated']['Goal Difference 38 Games']
relegated_goals_difference = merged_df[merged_df['Relegation'] == 'Relegated']['Goal Difference 38 Games']
bottom_quartile_not_relegated_goals_difference =  merged_df[merged_df['Position Grouping'] == 'NR Bottom Quartile']['Goal Difference 38 Games']
max_relegated_gd = _top_relegated_vs_close_df[_top_relegated_vs_close_df['Relegation'] == 'Relegated']['Goal Difference 38 Games']


def _boxplot_then_ttest(x_col, y_col, data, title, sample_a, sample_b):
    """Show one boxplot, then run and print an independent two-sample t-test.

    Parameters:
        x_col: column of `data` used for the boxplot categories.
        y_col: column of `data` whose distribution is plotted.
        data: DataFrame passed to seaborn.
        title: figure title.
        sample_a, sample_b: the two samples handed to scipy's ttest_ind.

    Returns:
        (t_stat, p_value) as returned by ttest_ind.
    """
    plt.figure(figsize=(10, 6))
    sns.boxplot(x=x_col, y=y_col, data=data)
    plt.title(title)
    plt.show()

    t_stat, p_value = ttest_ind(sample_a, sample_b)
    print(f'T-test results: t-statistic = {t_stat}, p-value = {p_value}')
    return t_stat, p_value


# For each metric (Goals Against, Goals For, Goal Difference), three comparisons in order:
#   1. boxplot by Position Grouping; t-test bottom quartile of not-relegated vs. relegated
#   2. boxplot by Relegation status;  t-test close to relegated vs. relegated
#   3. boxplot on the highest-relegated/close frame; t-test close to relegated vs. highest relegated
# Each row: (x column, y column, plot data, title, first t-test sample, second t-test sample)
_comparisons = [
    ('Position Grouping', 'Goals Against 38 Games', merged_df,
     'Distribution of Goals Against based on Position Grouping',
     bottom_quartile_not_relegated_goals_against, relegated_goals_against),
    ('Relegation', 'Goals Against 38 Games', merged_df,
     'Distribution of Goals Against for Relegated, Close to Relegated, and Non-Relegated Teams',
     close_to_being_relegated_goals_against, relegated_goals_against),
    ('Relegation', 'Goals Against 38 Games', _top_relegated_vs_close_df,
     'Goals Against for Highest Positioned Relegated Team vs. Close to Relegated Teams',
     close_to_being_relegated_goals_against, max_relegated_ga),

    ('Position Grouping', 'Goals For 38 Games', merged_df,
     'Distribution of Goals For based on Position Grouping',
     bottom_quartile_not_relegated_goals_for, relegated_goals_for),
    ('Relegation', 'Goals For 38 Games', merged_df,
     'Distribution of Goals For for Relegated, Close to Relegated, and Non-Relegated Teams',
     close_to_being_relegated_goals_for, relegated_goals_for),
    ('Relegation', 'Goals For 38 Games', _top_relegated_vs_close_df,
     'Goals for Highest Positioned Relegated Team vs. Close to Relegated Teams',
     close_to_being_relegated_goals_for, max_relegated_gf),

    ('Position Grouping', 'Goal Difference 38 Games', merged_df,
     'Distribution of Goal Difference based on Position Grouping',
     bottom_quartile_not_relegated_goals_difference, relegated_goals_difference),
    ('Relegation', 'Goal Difference 38 Games', merged_df,
     'Distribution of Goals Difference for Relegated, Close to Relegated, and Non-Relegated Teams',
     close_to_being_relegated_goals_difference, relegated_goals_difference),
    ('Relegation', 'Goal Difference 38 Games', _top_relegated_vs_close_df,
     'Goal Difference for Highest Positioned Relegated Team vs. Close to Relegated Teams',
     close_to_being_relegated_goals_difference, max_relegated_gd),
]

# t_stat / p_value stay bound at module level and end up holding the last comparison's result
for _x_col, _y_col, _plot_data, _title, _sample_a, _sample_b in _comparisons:
    t_stat, p_value = _boxplot_then_ttest(_x_col, _y_col, _plot_data, _title, _sample_a, _sample_b)

# Filter for Relegated and Close to being Relegated
close_to_being_relegated = merged_df[merged_df['Relegation'] == 'Close to being Relegated']
relegated = merged_df[merged_df['Relegation'] == 'Relegated']

# Perform t-test for 'Goals For': Close to being Relegated vs. Relegated
t_stat_goals_for, p_value_goals_for = ttest_ind(close_to_being_relegated['Goals For 38 Games'], relegated['Goals For 38 Games'])

# Perform t-test for 'Goals Against': Close to being Relegated vs. Relegated
t_stat_goals_against, p_value_goals_against = ttest_ind(close_to_being_relegated['Goals Against 38 Games'], relegated['Goals Against 38 Games'])

# Perform t-test for 'Goal Difference': Close to being Relegated vs. Relegated
t_stat_goal_difference, p_value_goal_difference = ttest_ind(close_to_being_relegated['Goal Difference 38 Games'], relegated['Goal Difference 38 Games'])

# Perform t-test for 'Highest Scorer 38 Games': Close to being Relegated vs. Relegated
t_stat_top_goal_scorer, p_value_top_goal_scorer = ttest_ind(close_to_being_relegated['Highest Scorer 38 Games'], relegated['Highest Scorer 38 Games'])

# Create a DataFrame to store the t-test results: Close to being Relegated vs. Relegated
t_test_results = pd.DataFrame({
    'Metric': ['Goals For', 'Goals Against', 'Goal Difference','Highest Scorer 38 Games' ],
    'T-Test Value': [t_stat_goals_for, t_stat_goals_against, t_stat_goal_difference,t_stat_top_goal_scorer],
    'P-Value': [p_value_goals_for, p_value_goals_against, p_value_goal_difference,p_value_top_goal_scorer]
})

# Display the table
print(t_test_results)

# Only the relegated rows of the highest-relegated/close frame
max_relegated = highest_relegated_teams_vs_close_to_being_relegated_df[highest_relegated_teams_vs_close_to_being_relegated_df['Relegation'] == 'Relegated']

# Perform t-test for 'Goals For': Close to being Relegated vs. Max Relegated
t_stat_goals_for2, p_value_goals_for2 = ttest_ind(close_to_being_relegated['Goals For 38 Games'], max_relegated['Goals For 38 Games'])

# Perform t-test for 'Goals Against': Close to being Relegated vs. Max Relegated
t_stat_goals_against2, p_value_goals_against2 = ttest_ind(close_to_being_relegated['Goals Against 38 Games'], max_relegated['Goals Against 38 Games'])

# Perform t-test for 'Goal Difference': Close to being Relegated vs. Max Relegated
t_stat_goal_difference2, p_value_goal_difference2 = ttest_ind(close_to_being_relegated['Goal Difference 38 Games'], max_relegated['Goal Difference 38 Games'])

# Perform t-test for 'Highest Scorer 38 Games': Close to being Relegated vs. Max Relegated
t_stat_top_goal_scorer2, p_value_top_goal_scorer2 = ttest_ind(close_to_being_relegated['Highest Scorer 38 Games'], max_relegated['Highest Scorer 38 Games'])

# Create a DataFrame to store the t-test results: Close to being Relegated vs. Max Relegated
t_test_results = pd.DataFrame({
    'Metric': ['Goals For', 'Goals Against', 'Goal Difference','Highest Scorer 38 Games' ],
    'T-Test Value': [t_stat_goals_for2, t_stat_goals_against2, t_stat_goal_difference2,t_stat_top_goal_scorer2],
    'P-Value': [p_value_goals_for2, p_value_goals_against2, p_value_goal_difference2,p_value_top_goal_scorer2]
})

# Display the second table as well (it used to be built but never shown)
print(t_test_results)

# Selecting relevant columns
columns_of_interest = ['Goals Against 38 Games', 'Goals For 38 Games', 'Goal Difference 38 Games', 'Relegation']
scatter_data = merged_df[columns_of_interest]

# Filter out 'Not Relegated' so only 'Relegated' and 'Close to being Relegated' remain
scatter_data = scatter_data[scatter_data['Relegation'] != 'Not Relegated']

# Set the figure size
plt.figure(figsize=(18, 8))

# One panel per metric. A 1x3 grid gives Goal Difference its own axes; it used to be
# drawn on subplot(1, 2, 2), the slot already holding the Goals For panel.
metric_columns = ['Goals Against 38 Games', 'Goals For 38 Games', 'Goal Difference 38 Games']
for i, column in enumerate(metric_columns):
    plt.subplot(1, len(metric_columns), i + 1)
    # NOTE(review): errorbar='sd' draws standard-deviation bars, while the title below
    # says "Confidence Intervals" - confirm which one is intended.
    sns.barplot(x='Relegation', y=column, data=scatter_data, errorbar='sd', capsize=0.1, errwidth=1.5,
                palette={'Close to being Relegated': 'orange', 'Relegated': 'blue'})
    plt.xlabel('Relegation')
    plt.ylabel(column)
    plt.title(f'Confidence Intervals for {column}')

    # Set y-axis ticks covering the range from the minimum to maximum value with increments of 5
    min_value = scatter_data[column].min()
    max_value = scatter_data[column].max()
    plt.yticks(range(int(min_value), int(max_value) + 5, 5))

# Adjust layout
plt.tight_layout()
plt.show()









#What this data shows is that all 3 metrics are significant, with goal difference being the most significant. (TODO: finish this interpretation.)





KeyboardInterrupt: 

Unnamed: 0,Squad,Top Team Scorer,Season,Highest Scorer Goal Count,Highest Goal Scorer
0,Manchester United,Eric Cantona - 18,1993-94,18,Eric Cantona
1,Blackburn,Alan Shearer - 31,1993-94,31,Alan Shearer
2,Newcastle United,Andy Cole - 34,1993-94,34,Andy Cole
3,Arsenal,Ian Wright - 23,1993-94,23,Ian Wright
4,Leeds United,Rod Wallace - 17,1993-94,17,Rod Wallace
