In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Load data
df = pd.read_csv("3n.csv")
df = df.iloc[:,1:]
df.head()

Unnamed: 0,name,season,team,pos,g,gs,mp_per_g,fg_per_g,fga_per_g,fg_pct,...,recruiting_year,recruiting_rank,draft_year,draft_team,draft_round,draft_pick,draft_pick_overall,nba_debut,salary_org,salary_mod
0,A.J. Hammons,2016-17,Dallas Mavericks,C,22,0,7.4,0.8,1.9,0.405,...,2012.0,75.0,2016.0,Dallas Mavericks,2.0,16.0,46.0,28/10/2016,"$650,000",650000
1,A.J. Price,2009-10,Indiana Pacers,PG,56,2,15.4,2.6,6.3,0.41,...,2004.0,24.0,2009.0,Indiana Pacers,2.0,22.0,52.0,30/10/2009,"$457,588",457588
2,A.J. Price,2010-11,Indiana Pacers,PG,50,0,15.9,2.3,6.4,0.356,...,2004.0,24.0,2009.0,Indiana Pacers,2.0,22.0,52.0,30/10/2009,"$762,195",762195
3,A.J. Price,2011-12,Indiana Pacers,PG,44,1,12.9,1.3,4.0,0.339,...,2004.0,24.0,2009.0,Indiana Pacers,2.0,22.0,52.0,30/10/2009,"$854,389",854389
4,A.J. Price,2012-13,Washington Wizards,PG,57,22,22.4,2.8,7.2,0.39,...,2004.0,24.0,2009.0,Indiana Pacers,2.0,22.0,52.0,30/10/2009,"$885,120",885120


In [3]:
# 1

# Explain how you clean your dataset and transform your data variable 
# if any and provide a data dictionary for all your variables

# the dataset is cleaned and transformed, along with a data dictionary for the variables:

# 1. Handling Missing Values:
#   - Rows with missing values in the 'salary_org' column are removed 
# using `df = df.dropna(subset=['salary_org'])`.

# 2. Data Types:
#   - The 'dob' column is converted to datetime format 
# using `df['dob'] = pd.to_datetime(df['dob'], errors='coerce')` 
# to handle potential date format issues.
#   - Duplicates are removed with `df = df.drop_duplicates()`.

# 3. Data Transformation:
#   - Age is calculated by subtracting the birth year 
# from the current year: `current_year - df['dob'].dt.year`.
#   - The 'hand' column is categorized as a categorical variable 
# using `df['hand'] = df['hand'].astype('category')`.

# 4. Salary Transformation:
#   - Non-numeric characters (dollar sign and comma) 
# are removed from the 'salary_org' column 
# using `df['salary_org'] = df['salary_org'].replace('[\$,]', '', regex=True)`.
#   - Rows with empty 'salary_org' values are filtered out 
# with `df = df[df['salary_org'].str.isnumeric()]`.
#   - The 'salary_org' column is then converted to a float 
# using `df['salary_org'] = df['salary_org'].astype(float)`.

# 5. Derived Features:
#   - The 'age_vs2024' column is created by subtracting 
# the birth year from the year 2024: `2024 - df['dob'].dt.year`.

#6. Data Dictionary:

#   - name: Name of the NBA player.
#   - season: NBA season.
#   - team: NBA team.
#   - pos: Position played by the player.
#   - g: Games played.
#   - gs: Games started.
#   - ... (other columns)

#   - dob: Date of birth.
#   - age: Age of the player.
#   - hand: Dominant hand.
#  - height_cm: Height in centimeters.
#   - weight_kg: Weight in kilograms.
#   - pob_city: Place of birth (city).
#   - pob_statecountry: Place of birth (state/country).
#   - pob_country: Country of birth.
#   - ... (other columns)

#   - salary_org: Original salary.
#   - salary_mod: Modified salary.
#   - age_vsseason: Age vs. NBA season.
#   - age_vs2024: Age vs. the year 2024.
#   - ... (other columns)

# This cleaned and transformed dataset is saved as 'cleaned_3abc.csv'.

# below is the code


# Data Cleaning
# Handling Missing Values
df = df.dropna(subset=['salary_org'])  # Remove rows with missing values in 'salary_org'

# Data Types
df['dob'] = pd.to_datetime(df['dob'], errors='coerce')  # Convert 'dob' to datetime, handling errors

# Remove Duplicates
df = df.drop_duplicates()

# Data Transformation
# Age Calculation
current_year = pd.to_datetime('now').year
df['age'] = current_year - df['dob'].dt.year

# Categorization (Example: Dominant Hand)
df['hand'] = df['hand'].astype('category')

# Salary Transformation
# Remove non-numeric characters from 'salary_org'
df['salary_org'] = df['salary_org'].replace('[\$,]', '', regex=True)

# Filter out rows with empty 'salary_org'
df = df[df['salary_org'].str.isnumeric()]

# Convert 'salary_org' to float
df['salary_org'] = df['salary_org'].astype(float)

# Create Derived Features (Example: Age at 2024)
df['age_vs2024'] = 2024 - df['dob'].dt.year

# Display the cleaned and transformed DataFrame
print(df.head())

           name   season                team pos   g  gs  mp_per_g  fg_per_g  \
0  A.J. Hammons  2016-17    Dallas Mavericks   C  22   0       7.4       0.8   
1    A.J. Price  2009-10      Indiana Pacers  PG  56   2      15.4       2.6   
2    A.J. Price  2010-11      Indiana Pacers  PG  50   0      15.9       2.3   
3    A.J. Price  2011-12      Indiana Pacers  PG  44   1      12.9       1.3   
4    A.J. Price  2012-13  Washington Wizards  PG  57  22      22.4       2.8   

   fga_per_g  fg_pct  ...  recruiting_rank  draft_year        draft_team  \
0        1.9   0.405  ...             75.0      2016.0  Dallas Mavericks   
1        6.3   0.410  ...             24.0      2009.0    Indiana Pacers   
2        6.4   0.356  ...             24.0      2009.0    Indiana Pacers   
3        4.0   0.339  ...             24.0      2009.0    Indiana Pacers   
4        7.2   0.390  ...             24.0      2009.0    Indiana Pacers   

   draft_round  draft_pick  draft_pick_overall   nba_debut  sa

  df['dob'] = pd.to_datetime(df['dob'], errors='coerce')  # Convert 'dob' to datetime, handling errors


In [4]:
# 2. For the most current season in the dataset, 
# a. how many active players are there? 
# b. how many players in each position?
# c. what is the average age, weight, experience, salary in the season? 
# d. what is average career salary?



# a. Number of active players for the most current season
current_season = df['season'].max()
active_players_count = df[df['season'] == current_season]['name'].nunique()

# b. Number of players in each position for the most current season
players_in_each_position = df[df['season'] == current_season]['pos'].value_counts()

# c. Average age, weight, experience, and salary in the season
average_age = df[df['season'] == current_season]['age_vs2024'].mean()
average_weight = df[df['season'] == current_season]['weight_kg'].mean()
average_experience = df[df['season'] == current_season]['age_vsseason'].mean()
average_salary = df[df['season'] == current_season]['salary_mod'].mean()

# d. Average career salary
average_career_salary = df['salary_mod'].mean()

# Displaying the results in a nicer format
print(f"a. Number of active players for the most current season: {active_players_count}\n")

print("b. Number of players in each position for the most current season:")
print(players_in_each_position.to_string(), "\n")

print(f"c. Average Age: {average_age:.2f} years")
print(f"   Average Weight: {average_weight:.2f} kg")
print(f"   Average Experience: {average_experience:.2f} years")
print(f"   Average Salary in the season: ${average_salary:,.2f}\n")

print(f"d. Average Career Salary: ${average_career_salary:,.2f}")

a. Number of active players for the most current season: 343

b. Number of players in each position for the most current season:
pos
C     80
SG    73
PF    70
PG    64
SF    63 

c. Average Age: 29.40 years
   Average Weight: 98.31 kg
   Average Experience: 27.46 years
   Average Salary in the season: $10,935,968.30

d. Average Career Salary: $5,450,529.74


In [5]:
# 3. More descriptive statistics on salaries: 
# a. how many players were active in each season? What is the average salary by season? How about variance of salary be season?
# b. who are the top 10% best paid players in the most current season? Which teams did these players play for?
# c. who are the bottom 10% best paid players in the most current season? Which teams did these players play for?
# d. who are the middle 50% by salary? Which teams did they play for?
# e. over the career of each of the active players in the most current season, how much money was paid to by season?



# a. Statistics by season
season_statistics = df.groupby('season')['name'].nunique().to_frame('active_players_count')
season_statistics['average_salary'] = df.groupby('season')['salary_mod'].mean()
season_statistics['variance_salary'] = df.groupby('season')['salary_mod'].var()

# b. Top 10% best-paid players in the most current season
current_season = df['season'].max()
top_10_percent = df[df['season'] == current_season].nlargest(int(df.shape[0] * 0.1), 'salary_mod')[['name', 'team', 'salary_mod']]

# c. Bottom 10% best-paid players in the most current season
bottom_10_percent = df[df['season'] == current_season].nsmallest(int(df.shape[0] * 0.1), 'salary_mod')[['name', 'team', 'salary_mod']]

# d. Middle 50% by salary in the most current season
try:
    middle_50_percent = df[df['season'] == current_season]['salary_mod'].quantile([0.25, 0.75])
    middle_50_percent_df = df[(df['season'] == current_season) & (df['salary_mod'] >= middle_50_percent.iloc[0]) & (df['salary_mod'] <= middle_50_percent.iloc[1])][['name', 'team', 'salary_mod']]
except Exception as e:
    print(f"Error: {e}")
    middle_50_percent_df = pd.DataFrame(columns=['name', 'team', 'salary_mod'])

# e. Total money paid to each player by season over their career
career_salary_by_season = df.groupby(['name', 'season'])['salary_mod'].sum().reset_index()

# Displaying the results
print("a. Statistics by season:")
print(season_statistics.to_string(), "\n")

print("b. Top 10% best-paid players in the most current season:")
print(top_10_percent.to_string(index=False), "\n")

print("c. Bottom 10% best-paid players in the most current season:")
print(bottom_10_percent.to_string(index=False), "\n")

print("d. Middle 50% by salary in the most current season:")
print(middle_50_percent_df.to_string(index=False), "\n")

print("e. Total money paid to each player by season over their career:")
print(career_salary_by_season.to_string(index=False))

a. Statistics by season:
         active_players_count  average_salary  variance_salary
season                                                        
1992-93                     1    3.000000e+06              NaN
1993-94                     2    2.401000e+06     4.494002e+12
1994-95                     5    2.560560e+06     2.146726e+12
1995-96                    13    2.206538e+06     2.110504e+12
1996-97                    22    2.592386e+06     7.501090e+12
1997-98                    32    2.541162e+06     7.899044e+12
1998-99                    45    3.139045e+06     1.331746e+13
1999-00                    66    3.368317e+06     1.515710e+13
2000-01                    83    3.810262e+06     1.802614e+13
2001-02                   116    3.515013e+06     1.845315e+13
2002-03                   138    3.784332e+06     2.047345e+13
2003-04                   174    3.951711e+06     2.072417e+13
2004-05                   211    4.159007e+06     1.821156e+13
2005-06                   255 

In [6]:
# 4 Team-player statistics: 
# a. what is the average salary of each team by season?
# b. what is the average age of the players by season? Average and variance of experience by season of each team?
# c. provide the information in b. in a “cross-tabulation” format, i.e. teams are on rows and seasons are on columns, and statistics are cell values.


import pandas as pd

# Assuming experience is stored in the 'age' column
# If experience is in a different column, replace 'age' accordingly

# Convert season to datetime format
df['season'] = pd.to_datetime(df['season'], format='%Y-%y')

# a. Average salary of each team by season
average_salary_by_team_season = df.groupby(['team', 'season'])['salary_mod'].mean().reset_index()

# b. Average age, average experience, and variance of experience by season of each team
team_season_stats = df.groupby(['team', 'season']).agg(
    average_age=('age', 'mean'),
    average_experience=('age', 'mean'),  # Assuming experience is stored in the 'age' column
    variance_experience=('age', 'var')   # Assuming experience is stored in the 'age' column
).reset_index()

# c. Cross-tabulation format
team_season_stats_crosstab = pd.pivot_table(team_season_stats, values=['average_age', 'average_experience', 'variance_experience'],
                                            index='team', columns='season', aggfunc='mean')

# Displaying the results
print("a. Average salary of each team by season:")
print(average_salary_by_team_season.to_string(index=False), "\n")

print("b. Average age, average experience, and variance of experience by season of each team:")
print(team_season_stats.to_string(index=False), "\n")

print("c. Cross-tabulation format:")
print(team_season_stats_crosstab)

a. Average salary of each team by season:
                  team     season   salary_mod
         Atlanta Hawks 1999-01-01 1.000000e+06
         Atlanta Hawks 2000-01-01 1.468920e+06
         Atlanta Hawks 2001-01-01 1.389540e+06
         Atlanta Hawks 2002-01-01 5.285776e+06
         Atlanta Hawks 2003-01-01 4.083584e+06
         Atlanta Hawks 2004-01-01 5.102133e+06
         Atlanta Hawks 2005-01-01 2.321616e+06
         Atlanta Hawks 2006-01-01 4.503624e+06
         Atlanta Hawks 2007-01-01 3.583993e+06
         Atlanta Hawks 2008-01-01 4.815845e+06
         Atlanta Hawks 2009-01-01 4.746116e+06
         Atlanta Hawks 2010-01-01 4.705974e+06
         Atlanta Hawks 2011-01-01 5.104989e+06
         Atlanta Hawks 2012-01-01 4.604370e+06
         Atlanta Hawks 2013-01-01 4.031636e+06
         Atlanta Hawks 2014-01-01 3.942367e+06
         Atlanta Hawks 2015-01-01 3.820522e+06
         Atlanta Hawks 2016-01-01 4.758542e+06
         Atlanta Hawks 2017-01-01 5.244755e+06
         Atlanta H

In [7]:
# 5. What other data from the website can you use to explain salary? 
# Produce a table of summary statistics for key variables you shall use in 
# your analysis in the following Part C. 
# Summary statistics should include at least sample average, standard deviation, min/max.


# To explain salary in the context of basketball players, 
# several key variables can be considered. 
# These variables may include both player-related features and team-related features. 
# Here are some  variables that can be used to explain salary:

# 1. Player-related Features:
#   - Position (pos): The player's position can influence salary, 
# as different positions may have different salary expectations.
#   - Performance Metrics (e.g., points per game, assists per game, rebounds per game): 
# Player statistics can impact salary, and high-performing players 
# may receive higher salaries.
#   - Player Age (age_vs2024): Younger and more experienced players may command 
# different salary levels.
#   - Height and Weight (height_cm, weight_kg): Physical attributes of players 
# may contribute to salary variations.
#   - Draft Information (draft_year, draft_round, draft_pick): Players who were 
# drafted higher or in earlier rounds may have higher salary expectations.

# 2. Team-related Features:
#   - Team Performance Metrics (e.g., win-loss record, playoff appearances): 
# The success of the team may influence the salaries of its players.
#   - Market Size and Location (pob_city, pob_statecountry, pob_country): 
# Players in larger markets or desirable locations may receive higher salaries.
#   - Team Salary Cap Status: 
# The team's salary cap situation can impact individual player salaries.

# 3. Contract-related Features:
#   - Contract Length: 
# The length of a player's contract can influence their annual salary.
#   - Contract Type (e.g., rookie contract, veteran contract): 
# Different types of contracts may have different salary structures.

# 4. Career-related Features:
#   - Years of Experience (experience): 
# Player experience in the league may affect salary negotiations.
#   - NBA Debut Date (nba_debut): 
# The date a player debuted in the NBA can be relevant to their salary trajectory.

# 5. Market Trends:
#   - League-wide Salary Trends: 
# Understanding salary trends across the league can provide context 
# for individual player salaries.

# 6. External Factors:
#   - Economic Conditions: The overall economic conditions and 
# the NBA's financial health can impact player salaries.

# To provide a table of summary statistics for these key variables, 
# use the `describe()` function in pandas for numerical variables 
# and use value counts for categorical variables. 


summary_stats = df[['salary_mod', 'pos', 'pts_per_g', 'age_vs2024', 'height_cm', 'weight_kg', 'draft_pick']].describe(include='all').transpose()
print(summary_stats)

             count unique  top  freq            mean             std     min  \
salary_mod  8600.0    NaN  NaN   NaN  5450529.743256  6666907.529737  4608.0   
pos           8600      5   SG  1799             NaN             NaN     NaN   
pts_per_g   8600.0    NaN  NaN   NaN        9.251919        6.286986     0.0   
age_vs2024  8598.0    NaN  NaN   NaN       37.346941        6.803119    23.0   
height_cm   8598.0    NaN  NaN   NaN      200.316702        8.963022   165.0   
weight_kg   8598.0    NaN  NaN   NaN       99.808095        12.18777    61.0   
draft_pick  7212.0    NaN  NaN   NaN       12.950776        8.334312     1.0   

                  25%        50%        75%         max  
salary_mod  1213560.0  2796520.0  7142200.0  48070014.0  
pos               NaN        NaN        NaN         NaN  
pts_per_g         4.4        7.9       12.8        36.1  
age_vs2024       32.0       37.0       43.0        54.0  
height_cm       193.0      201.0      208.0       229.0  
weight_kg  