In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found beneath the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the player table, using the first CSV column as the index.
hockey_df = pd.read_csv(
    "/kaggle/input/elite-prospects-hockey-stats-player-data/player_dim.csv",
    index_col=0,
    engine="python",
)

# Summarise the columns as loaded, before anything is added or dropped.
hockey_df.info()

# Build a full-name column, keep only the three fields used below, and
# discard every row that has a missing value in any of them.
hockey_df = (
    hockey_df
    .assign(PLAYER_NAME=lambda frame: frame["FIRST_NAME"] + " " + frame["LAST_NAME"])
    .loc[:, ["PLAYER_NAME", "HEIGHT_CM", "WEIGHT_KG"]]
    .dropna()
)


In [None]:
# Scatter weight against height for every remaining row, drawn on the current axes.
ax = plt.gca()
ax.scatter("HEIGHT_CM", "WEIGHT_KG", data=hockey_df)
ax.set_xlabel("HEIGHT(cm)")
ax.set_ylabel("WEIGHT(kg)")
ax.set_title("Hockey : WEIGHT / HEIGHT")
plt.show()

In [None]:
# Drop implausible measurements: keep only rows with a weight below 800 kg
# and a height above 120 cm.
weight_ok = hockey_df["WEIGHT_KG"].lt(800)
height_ok = hockey_df["HEIGHT_CM"].gt(120)
hockey_df = hockey_df.loc[weight_ok & height_ok]

# Redraw the same scatter plot on the filtered data.
ax = plt.gca()
ax.scatter("HEIGHT_CM", "WEIGHT_KG", data=hockey_df)
ax.set_xlabel("HEIGHT(cm)")
ax.set_ylabel("WEIGHT(kg)")
ax.set_title("Hockey : WEIGHT / HEIGHT")
plt.show()

In [None]:
# Mean, population variance and population standard deviation of height and
# weight, computed step by step through helper columns stored on hockey_df.

def _add_spread_columns(frame, value_col, prefix):
    """Add deviation columns for ``value_col`` to ``frame`` and return its summary.

    Two columns are written to ``frame`` in place:
    ``<prefix>_CM_distance_from_mean`` (value minus the column mean) and
    ``<prefix>_VARIANCE`` (that difference squared, per row).

    Returns a ``(mean, variance, standard_deviation)`` tuple, where the variance
    is the sum of the squared differences divided by their non-null count.
    """
    centre = frame[value_col].mean()
    deviation_col = prefix + "_CM_distance_from_mean"
    squared_col = prefix + "_VARIANCE"

    frame[deviation_col] = frame[value_col] - centre
    frame[squared_col] = np.square(frame[deviation_col])

    squared = frame[squared_col]
    variance = squared.sum() / squared.count()
    return centre, variance, np.sqrt(variance)


mean_height_hockey, hh_variance, hh_standard_deviation = _add_spread_columns(
    hockey_df, "HEIGHT_CM", "HEIGHT"
)
# NOTE(review): the weight helper column carries a "_CM_" name although it is
# derived from WEIGHT_KG; the name is kept unchanged here.
mean_weight_hockey, hw_variance, hw_standard_deviation = _add_spread_columns(
    hockey_df, "WEIGHT_KG", "WEIGHT"
)

print("Hockey data set results")
for heading, centre, variance, deviation in (
    ("Height", mean_height_hockey, hh_variance, hh_standard_deviation),
    ("Weight", mean_weight_hockey, hw_variance, hw_standard_deviation),
):
    print("===============================")
    print(heading)
    print("Mean: ", round(centre, 2))
    print("Variance: ", round(variance, 2))
    print("Standard deviation: ", round(deviation, 2))


In [None]:
# Express each player's height and weight in standard units: the distance
# from the column mean divided by the column's standard deviation.
height_offset = hockey_df["HEIGHT_CM"].sub(mean_height_hockey)
weight_offset = hockey_df["WEIGHT_KG"].sub(mean_weight_hockey)

hockey_df["std_HEIGHT"] = height_offset.div(hh_standard_deviation)
hockey_df["std_WEIGHT"] = weight_offset.div(hw_standard_deviation)
hockey_df.head()

In [None]:
# Empirical-rule check: what percentage of players lies within 1, 2 and 3
# standard deviations of the mean, first for height and then for weight.

def _within_std(std_units, k):
    """Return the entries of ``std_units`` whose value lies in [-k, k], both ends included."""
    return std_units.loc[std_units.between(-k, k)]


def _percent_of(subset, whole):
    """Return ``subset``'s non-null count as a percentage of ``whole``'s non-null count.

    The ratio is scaled to a percentage *before* rounding, and every band is
    rounded to the same single decimal place, so the printed figures share one
    precision and carry no floating-point residue from a late ``* 100``.
    Returns NaN when ``whole`` has no non-null entries.
    """
    total = int(whole.count())
    if total == 0:
        return float("nan")
    return round(int(subset.count()) / total * 100, 1)


text1 = " of observations fall within "
text2 = " standard deviations of the mean "

# Hockey height standard deviation
hh_one_std, hh_two_std, hh_three_std = (
    _within_std(hockey_df["std_HEIGHT"], k) for k in (1, 2, 3)
)

hh_one_std_per = _percent_of(hh_one_std, hockey_df["std_HEIGHT"])
hh_two_std_per = _percent_of(hh_two_std, hockey_df["std_HEIGHT"])
hh_three_std_per = _percent_of(hh_three_std, hockey_df["std_HEIGHT"])

print("Findings for hockey heights")
for band, share in enumerate((hh_one_std_per, hh_two_std_per, hh_three_std_per), start=1):
    print(f"{share}%{text1}{band}{text2}")


# Hockey weight standard deviation
hw_one_std, hw_two_std, hw_three_std = (
    _within_std(hockey_df["std_WEIGHT"], k) for k in (1, 2, 3)
)

hw_one_std_per = _percent_of(hw_one_std, hockey_df["std_WEIGHT"])
hw_two_std_per = _percent_of(hw_two_std, hockey_df["std_WEIGHT"])
hw_three_std_per = _percent_of(hw_three_std, hockey_df["std_WEIGHT"])

print("\nFindings for hockey weights")
for band, share in enumerate((hw_one_std_per, hw_two_std_per, hw_three_std_per), start=1):
    print(f"{share}%{text1}{band}{text2}")


In [None]:
# Histogram of the standardised height and weight columns, one figure each.
for column, heading in (
    ("std_HEIGHT", "Hockey height normal distribution"),
    ("std_WEIGHT", "Hockey weight normal distribution"),
):
    hockey_df.hist(column)
    plt.title(heading)

plt.show()