In [1]:
import pandas as pd
import numpy as np
from scipy import stats
from scipy.integrate import quad
from scipy.interpolate import interp1d

In [13]:
class NBADataAnalysis:
    """Regular-season shooting analysis over a per-player, per-season stats CSV.

    Intended order of use: ``filter_regular_season`` -> ``player_most_seasons``
    -> ``get_player_data`` -> ``calculate_three_point_accuracy`` -> the
    regression / integration / interpolation methods.  Every step caches its
    result on the instance, and a step whose prerequisite has not been run yet
    runs that prerequisite itself, so the methods do not depend on the order
    in which notebook cells happened to be executed.

    Attributes:
        file_path: Path the dataset was read from.
        data: Full dataset with FGM, FGA, 3PM and 3PA coerced to numeric.
        regular_season_data: Rows whose ``Stage`` equals ``'Regular_Season'``
            (``None`` until ``filter_regular_season`` has run).
        most_seasons_player: Player with the most distinct regular seasons
            (``None`` until ``player_most_seasons`` has run).
        player_data: That player's regular-season rows, sorted by ``Season``
            (``None`` until ``get_player_data`` has run).
    """

    def __init__(self, file_path):
        """Read the CSV at ``file_path`` and coerce the shooting columns to numbers.

        Values in FGM, FGA, 3PM and 3PA that cannot be parsed as numbers
        become NaN (``errors='coerce'``) instead of raising.
        """
        self.file_path = file_path
        self.data = pd.read_csv(file_path)

        # Ensure numeric columns are numeric
        numeric_columns = ['FGM', 'FGA', '3PM', '3PA']
        for col in numeric_columns:
            self.data[col] = pd.to_numeric(self.data[col], errors='coerce')

        self.regular_season_data = None
        self.most_seasons_player = None
        self.player_data = None

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------

    def _ensure_accuracy(self):
        """Make sure ``player_data`` exists and has a ``ThreePointAccuracy`` column."""
        if (
            self.player_data is None
            or 'ThreePointAccuracy' not in self.player_data.columns
        ):
            self.calculate_three_point_accuracy()

    def _year_accuracy_frame(self):
        """Return a ``Year`` / ``Accuracy`` frame for the player, without NaN rows.

        ``Year`` is the integer before the ``-`` in the season label
        (e.g. ``"1999 - 2000"`` -> ``1999``).
        """
        self._ensure_accuracy()

        season_labels = self.player_data['Season'].astype(str)
        start_years = [int(label.split('-')[0].strip()) for label in season_labels]

        return pd.DataFrame({
            'Year': start_years,
            'Accuracy': self.player_data['ThreePointAccuracy'].to_numpy(),
        }).dropna()

    @staticmethod
    def _summarize(values):
        """Return mean, variance, skew and kurtosis of ``values`` as a dict.

        Variance is ``np.var`` with its default ``ddof=0``; skew and kurtosis
        use the ``scipy.stats`` defaults.
        """
        return {
            "mean": np.mean(values),
            "variance": np.var(values),
            "skew": stats.skew(values),
            "kurtosis": stats.kurtosis(values),
        }

    # ------------------------------------------------------------------
    # Pipeline steps
    # ------------------------------------------------------------------

    def filter_regular_season(self):
        """Keep only rows whose ``Stage`` is ``'Regular_Season'``.

        Returns:
            The filtered DataFrame (also stored as ``regular_season_data``).

        Raises:
            ValueError: If no row has ``Stage == 'Regular_Season'``.
        """
        is_regular = self.data['Stage'] == 'Regular_Season'
        # Copy so later steps never operate on a slice of ``self.data``.
        self.regular_season_data = self.data[is_regular].copy()

        if len(self.regular_season_data) == 0:
            raise ValueError("No Regular_Season data found. Check Stage values.")

        return self.regular_season_data

    def player_most_seasons(self):
        """Find the player with the most distinct regular seasons.

        Returns:
            The ``Player`` value with the highest count of unique ``Season``
            values (also stored as ``most_seasons_player``).

        Raises:
            ValueError: If there are no players to count.
        """
        if self.regular_season_data is None:
            self.filter_regular_season()

        season_counts = self.regular_season_data.groupby('Player')['Season'].nunique()

        if len(season_counts) == 0:
            raise ValueError("No season data available after filtering.")

        self.most_seasons_player = season_counts.idxmax()
        return self.most_seasons_player

    def get_player_data(self):
        """Collect the regular-season rows of ``most_seasons_player``.

        Returns:
            Those rows sorted by ``Season`` (also stored as ``player_data``).
        """
        if self.most_seasons_player is None:
            self.player_most_seasons()

        rows = self.regular_season_data
        self.player_data = rows[
            rows['Player'] == self.most_seasons_player
        ].sort_values('Season')

        return self.player_data

    def calculate_three_point_accuracy(self):
        """Add ``ThreePointAccuracy`` (3PM / 3PA) to ``player_data``.

        Rows where 3PA is not greater than zero are removed from
        ``player_data`` first, which avoids division by zero.

        Returns:
            A DataFrame with the ``Season`` and ``ThreePointAccuracy`` columns.
        """
        if self.player_data is None:
            self.get_player_data()

        attempted = self.player_data[self.player_data['3PA'] > 0]
        # ``assign`` builds a new frame, so the new column is never written
        # onto a filtered slice (the pattern behind SettingWithCopyWarning).
        self.player_data = attempted.assign(
            ThreePointAccuracy=attempted['3PM'] / attempted['3PA']
        )

        return self.player_data[['Season', 'ThreePointAccuracy']]

    def linear_regression_accuracy(self):
        """Fit a least-squares line of three-point accuracy against season start year.

        Returns:
            ``(years, slope, intercept, r_value, p_value)`` where ``years`` is
            the list of start years that were used in the fit.

        Raises:
            ValueError: If fewer than two distinct years have a usable accuracy.
        """
        clean_data = self._year_accuracy_frame()

        if clean_data['Year'].nunique() < 2:
            raise ValueError(
                "At least two distinct seasons with a three-point accuracy "
                "are required to fit a regression line."
            )

        slope, intercept, r_value, p_value, _ = stats.linregress(
            clean_data['Year'], clean_data['Accuracy']
        )

        return list(clean_data['Year']), slope, intercept, r_value, p_value

    def integrate_average_accuracy(self, years, slope, intercept):
        """Average the regression line over the span of ``years``.

        The line ``slope * x + intercept`` is integrated from ``min(years)`` to
        ``max(years)`` and divided by the width of that interval.  When the
        interval has zero width (a single year) the line's value at that year
        is returned instead of dividing by zero.

        Args:
            years: Season start years covered by the fit.
            slope: Slope of the regression line.
            intercept: Intercept of the regression line.

        Returns:
            ``(average_fit_accuracy, actual_average)`` where ``actual_average``
            is the mean of the ``ThreePointAccuracy`` column of ``player_data``.

        Raises:
            ValueError: If ``years`` is empty.
        """
        years = list(years)
        if not years:
            raise ValueError("years must contain at least one season.")

        self._ensure_accuracy()

        def regression_line(x):
            return slope * x + intercept

        earliest = min(years)
        latest = max(years)

        if latest == earliest:
            # Zero-width interval: the mean of the line is its value there.
            average_fit_accuracy = float(regression_line(earliest))
        else:
            integral, _ = quad(regression_line, earliest, latest)
            average_fit_accuracy = integral / (latest - earliest)

        actual_average = np.mean(self.player_data['ThreePointAccuracy'])

        return average_fit_accuracy, actual_average

    def interpolate_missing_seasons(self, missing_years=None):
        """Estimate three-point accuracy for seasons absent from ``player_data``.

        A linear interpolant is built over the available (year, accuracy)
        points; years outside the observed range are linearly extrapolated.

        Args:
            missing_years: Iterable of season start years to estimate.
                Defaults to ``(2002, 2015)``, i.e. the 2002-2003 and
                2015-2016 seasons.

        Returns:
            Dict mapping each requested year to its estimated accuracy.
        """
        if missing_years is None:
            missing_years = (2002, 2015)

        clean_data = self._year_accuracy_frame()

        interpolation_function = interp1d(
            clean_data['Year'],
            clean_data['Accuracy'],
            kind='linear',
            fill_value="extrapolate"
        )

        return {
            year: float(interpolation_function(year))
            for year in missing_years
        }

    def descriptive_statistics(self):
        """Compute mean, variance, skew and kurtosis of regular-season FGM and FGA.

        Each column is summarized over its own non-missing values.

        Returns:
            ``{"FGM": {...}, "FGA": {...}}`` with the keys ``mean``,
            ``variance``, ``skew`` and ``kurtosis`` in each inner dict.
        """
        if self.regular_season_data is None:
            self.filter_regular_season()

        return {
            column: self._summarize(self.regular_season_data[column].dropna())
            for column in ("FGM", "FGA")
        }

    def perform_t_tests(self):
        """Run a paired and an independent t-test of FGM against FGA.

        The paired test uses only rows where both FGM and FGA are present, so
        every pair comes from the same row.  The independent test uses each
        column's own non-missing values.

        Returns:
            ``(paired_test, independent_test)`` as returned by
            ``scipy.stats.ttest_rel`` and ``scipy.stats.ttest_ind``.
        """
        if self.regular_season_data is None:
            self.filter_regular_season()

        shots = self.regular_season_data[['FGM', 'FGA']]

        # Drop incomplete rows jointly; dropping per column would misalign pairs.
        complete_rows = shots.dropna()
        paired_test = stats.ttest_rel(complete_rows['FGM'], complete_rows['FGA'])

        independent_test = stats.ttest_ind(
            shots['FGM'].dropna(), shots['FGA'].dropna()
        )

        return paired_test, independent_test

In [15]:
# Load the dataset and narrow it down to regular-season rows.
file_path = "players_stats_by_season_full_details.csv"
analysis = NBADataAnalysis(file_path=file_path)
analysis.filter_regular_season()

# Pick the player with the most regular seasons and pull that player's rows.
player = analysis.player_most_seasons()
print("Player with most seasons:", player)
analysis.get_player_data()

# Per-season three-point accuracy for that player.
accuracy_data = analysis.calculate_three_point_accuracy()
print(accuracy_data)

# Fit the trend line, then compare its mean value with the observed mean.
(years, slope, intercept,
 r_value, p_value) = analysis.linear_regression_accuracy()
avg_fit, actual_avg = analysis.integrate_average_accuracy(
    years=years, slope=slope, intercept=intercept
)
print("Average accuracy from regression integration:", avg_fit)
print("Actual average accuracy:", actual_avg)

print("Line of best fit:")
print("Accuracy =", slope, "* Year +", intercept)
print("R-value:", r_value)
print("P-value:", p_value)

# Estimate accuracy for the seasons that are absent from the player's rows.
missing_estimates = analysis.interpolate_missing_seasons()
print("Interpolated missing season accuracies:", missing_estimates)

# League-wide FGM / FGA summary statistics and t-tests.
stats_results = analysis.descriptive_statistics()
print("Descriptive Statistics:", stats_results)

paired_test, independent_test = analysis.perform_t_tests()
print("Paired t-test:", paired_test)
print("Independent t-test:", independent_test)

Player with most seasons: Vince Carter
            Season  ThreePointAccuracy
1      1999 - 2000            0.402542
509    2000 - 2001            0.408060
1047   2001 - 2002            0.386581
2761   2003 - 2004            0.382716
4114   2004 - 2005            0.405751
5333   2005 - 2006            0.340599
6548   2006 - 2007            0.356979
7961   2007 - 2008            0.358974
9267   2008 - 2009            0.385204
10608  2009 - 2010            0.367284
11920  2010 - 2011            0.361371
14813  2011 - 2012            0.360976
18772  2012 - 2013            0.406015
22921  2013 - 2014            0.393531
27000  2014 - 2015            0.297414
35443  2016 - 2017            0.378378
40190  2017 - 2018            0.345455
45213  2018 - 2019            0.389241
51949  2019 - 2020            0.301980
Average accuracy from regression integration: 0.3700826101829928
Actual average accuracy: 0.3699501146202911
Line of best fit:
Accuracy = -0.0025174156913259226 * Year + 5.427570734

In [17]:
#Average Three-Point Accuracy (Integration vs Actual Average):
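#Because the two values differ only in the fourth decimal place, the linear regression model matches the data well on average.
#Note: a least-squares line always passes through the point (mean year, mean accuracy), so this agreement is largely expected; the R-value and P-value are the better indicators of how well the line fits individual seasons.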
#The trend line provides a very accurate estimate of Vince Carter's average three-point accuracy.
#The regression model is a good representation of his long-term shooting performance.

#FGM Statistics Compared to FGA:
#FGA has higher mean and variance.
#Both have similar shape (skew and kurtosis).
#Attempts vary more than makes.

#Relational (Paired) T-Test vs Independent T-Test:
#Both tests confirm FGM and FGA are significantly different.
#The paired t-test is more appropriate.
#The paired test shows stronger evidence due to accounting for season-by-season pairing.