# NBA Points Predictor Machine Learning Model 
The Goal of this project is predict the point per game of nba players 

In [1]:
from IPython.display import display, Math, Latex
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime, date
import random
import os
import requests
from bs4 import BeautifulSoup

In [2]:
#read in the players data
players_df = pd.read_csv("players.csv")

In [3]:
#read in the nba teams record df 
teams_record_df = pd.read_csv('teams_record.csv')

## Data Cleaning 

Merge and clean the players and the teams data in order to make one clean dataframe

#### Players DF Data Cleaning 

In [4]:
#set it to see all the columns 
pd.set_option('display.max_columns', None)

In [5]:
#read the players_df
players_df.head()

Unnamed: 0.1,Unnamed: 0,Rk,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,3P,3PA,3P%,2P,2PA,2P%,eFG%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,Year
0,0,1,Alaa Abdelnaby,PF,22,POR,43,0,6.7,1.3,2.7,0.474,0.0,0.0,,1.3,2.7,0.474,0.474,0.6,1.0,0.568,0.6,1.4,2.1,0.3,0.1,0.3,0.5,0.9,3.1,1991
1,1,2,Mahmoud Abdul-Rauf,PG,21,DEN,67,19,22.5,6.2,15.1,0.413,0.4,1.5,0.24,5.9,13.6,0.432,0.425,1.3,1.5,0.857,0.5,1.3,1.8,3.1,0.8,0.1,1.6,2.2,14.1,1991
2,2,3,Mark Acres,C,28,ORL,68,0,19.3,1.6,3.1,0.509,0.0,0.0,0.333,1.6,3.1,0.512,0.512,1.0,1.5,0.653,2.1,3.2,5.3,0.4,0.4,0.4,0.6,3.2,4.2,1991
3,3,4,Michael Adams,PG,28,DEN,66,66,35.5,8.5,21.5,0.394,2.5,8.5,0.296,6.0,13.0,0.459,0.453,7.0,8.0,0.879,0.9,3.0,3.9,10.5,2.2,0.1,3.6,2.5,26.5,1991
4,4,5,Mark Aguirre,SF,31,DET,78,13,25.7,5.4,11.7,0.462,0.3,1.0,0.308,5.1,10.7,0.477,0.475,3.1,4.1,0.757,1.7,3.1,4.8,1.8,0.6,0.3,1.6,2.7,14.2,1991


In [6]:
#delete the unessary columns
del players_df['Unnamed: 0']
del players_df['Rk']

In [7]:
#remove rows where the player name is Player
players_df = players_df[players_df['Player'] != "Player"]

In [8]:
#remove the "*" from the Player columns
players_df['Player'] = players_df['Player'].str.rstrip('*')

In [9]:
#read the players_df
players_df.head()

Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,3P,3PA,3P%,2P,2PA,2P%,eFG%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,Year
0,Alaa Abdelnaby,PF,22,POR,43,0,6.7,1.3,2.7,0.474,0.0,0.0,,1.3,2.7,0.474,0.474,0.6,1.0,0.568,0.6,1.4,2.1,0.3,0.1,0.3,0.5,0.9,3.1,1991
1,Mahmoud Abdul-Rauf,PG,21,DEN,67,19,22.5,6.2,15.1,0.413,0.4,1.5,0.24,5.9,13.6,0.432,0.425,1.3,1.5,0.857,0.5,1.3,1.8,3.1,0.8,0.1,1.6,2.2,14.1,1991
2,Mark Acres,C,28,ORL,68,0,19.3,1.6,3.1,0.509,0.0,0.0,0.333,1.6,3.1,0.512,0.512,1.0,1.5,0.653,2.1,3.2,5.3,0.4,0.4,0.4,0.6,3.2,4.2,1991
3,Michael Adams,PG,28,DEN,66,66,35.5,8.5,21.5,0.394,2.5,8.5,0.296,6.0,13.0,0.459,0.453,7.0,8.0,0.879,0.9,3.0,3.9,10.5,2.2,0.1,3.6,2.5,26.5,1991
4,Mark Aguirre,SF,31,DET,78,13,25.7,5.4,11.7,0.462,0.3,1.0,0.308,5.1,10.7,0.477,0.475,3.1,4.1,0.757,1.7,3.1,4.8,1.8,0.6,0.3,1.6,2.7,14.2,1991


In [10]:
#cleans data for players who played on more than one team in a single season 
for name, group in players_df.groupby(['Player', 'Year']):
    if len(group) > 1:
        # player played for more than one team in this year
        tot_row = group[group['Tm'] == 'TOT']
        if tot_row.empty:
            # tot_row is empty, skip this group
            continue
        tot_index = tot_row.index[0]
        # only drop rows if they exist in the dataframe
        if tot_index in players_df.index:
            # check if indexes to drop are in the dataframe
            drop_indexes = group.index[group.index != tot_index]
            if all(index in players_df.index for index in drop_indexes):
                players_df.drop(drop_indexes, inplace=True)
        # change TOT to player's last team
        last_row = group.iloc[-1]
        players_df.loc[tot_index, 'Tm'] = last_row['Tm']

In [11]:
#read the players_df
players_df.head()

Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,3P,3PA,3P%,2P,2PA,2P%,eFG%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,Year
0,Alaa Abdelnaby,PF,22,POR,43,0,6.7,1.3,2.7,0.474,0.0,0.0,,1.3,2.7,0.474,0.474,0.6,1.0,0.568,0.6,1.4,2.1,0.3,0.1,0.3,0.5,0.9,3.1,1991
1,Mahmoud Abdul-Rauf,PG,21,DEN,67,19,22.5,6.2,15.1,0.413,0.4,1.5,0.24,5.9,13.6,0.432,0.425,1.3,1.5,0.857,0.5,1.3,1.8,3.1,0.8,0.1,1.6,2.2,14.1,1991
2,Mark Acres,C,28,ORL,68,0,19.3,1.6,3.1,0.509,0.0,0.0,0.333,1.6,3.1,0.512,0.512,1.0,1.5,0.653,2.1,3.2,5.3,0.4,0.4,0.4,0.6,3.2,4.2,1991
3,Michael Adams,PG,28,DEN,66,66,35.5,8.5,21.5,0.394,2.5,8.5,0.296,6.0,13.0,0.459,0.453,7.0,8.0,0.879,0.9,3.0,3.9,10.5,2.2,0.1,3.6,2.5,26.5,1991
4,Mark Aguirre,SF,31,DET,78,13,25.7,5.4,11.7,0.462,0.3,1.0,0.308,5.1,10.7,0.477,0.475,3.1,4.1,0.757,1.7,3.1,4.8,1.8,0.6,0.3,1.6,2.7,14.2,1991


In [12]:
#check null values 
for column in players_df.columns:
    
    print("Column:", column)
    
    # Print count of null values for the column
    print("Null values:", players_df[column].isnull().sum())
    
    print("\n" + "="*50 + "\n")

Column: Player
Null values: 0


Column: Pos
Null values: 0


Column: Age
Null values: 0


Column: Tm
Null values: 0


Column: G
Null values: 0


Column: GS
Null values: 0


Column: MP
Null values: 0


Column: FG
Null values: 0


Column: FGA
Null values: 0


Column: FG%
Null values: 61


Column: 3P
Null values: 0


Column: 3PA
Null values: 0


Column: 3P%
Null values: 2105


Column: 2P
Null values: 0


Column: 2PA
Null values: 0


Column: 2P%
Null values: 105


Column: eFG%
Null values: 61


Column: FT
Null values: 0


Column: FTA
Null values: 0


Column: FT%
Null values: 547


Column: ORB
Null values: 0


Column: DRB
Null values: 0


Column: TRB
Null values: 0


Column: AST
Null values: 0


Column: STL
Null values: 0


Column: BLK
Null values: 0


Column: TOV
Null values: 0


Column: PF
Null values: 0


Column: PTS
Null values: 0


Column: Year
Null values: 0




In [13]:
#fill the null vlaue with 0
players_df = players_df.fillna(0)

In [14]:
players_df.head()

Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,3P,3PA,3P%,2P,2PA,2P%,eFG%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,Year
0,Alaa Abdelnaby,PF,22,POR,43,0,6.7,1.3,2.7,0.474,0.0,0.0,0.0,1.3,2.7,0.474,0.474,0.6,1.0,0.568,0.6,1.4,2.1,0.3,0.1,0.3,0.5,0.9,3.1,1991
1,Mahmoud Abdul-Rauf,PG,21,DEN,67,19,22.5,6.2,15.1,0.413,0.4,1.5,0.24,5.9,13.6,0.432,0.425,1.3,1.5,0.857,0.5,1.3,1.8,3.1,0.8,0.1,1.6,2.2,14.1,1991
2,Mark Acres,C,28,ORL,68,0,19.3,1.6,3.1,0.509,0.0,0.0,0.333,1.6,3.1,0.512,0.512,1.0,1.5,0.653,2.1,3.2,5.3,0.4,0.4,0.4,0.6,3.2,4.2,1991
3,Michael Adams,PG,28,DEN,66,66,35.5,8.5,21.5,0.394,2.5,8.5,0.296,6.0,13.0,0.459,0.453,7.0,8.0,0.879,0.9,3.0,3.9,10.5,2.2,0.1,3.6,2.5,26.5,1991
4,Mark Aguirre,SF,31,DET,78,13,25.7,5.4,11.7,0.462,0.3,1.0,0.308,5.1,10.7,0.477,0.475,3.1,4.1,0.757,1.7,3.1,4.8,1.8,0.6,0.3,1.6,2.7,14.2,1991


#### Teams DF Data Cleaning

In [15]:
#delete the unessary columns
del teams_record_df['Unnamed: 0']

In [16]:
#read the teams_record_df
teams_record_df.head()

Unnamed: 0,W,L,W/L%,GB,PS/G,PA/G,SRS,Year,Team
0,56,26,0.683,—,111.5,105.7,5.22,1991,Boston Celtics*
1,44,38,0.537,12.0,105.4,105.6,-0.39,1991,Philadelphia 76ers*
2,39,43,0.476,17.0,103.1,103.3,-0.43,1991,New York Knicks*
3,30,52,0.366,26.0,101.4,106.4,-4.84,1991,Washington Bullets
4,26,56,0.317,30.0,102.9,107.5,-4.53,1991,New Jersey Nets


In [17]:
#remove the rows where the Team column contain the word Division
teams_record_df = teams_record_df[~teams_record_df['Team'].str.contains('Division')]

In [18]:
#remove the '*' from the Team column
teams_record_df['Team'] = teams_record_df['Team'].str.rstrip('*')

In [19]:
#replace the '-' with 0
teams_record_df["GB"] = teams_record_df["GB"].str.replace("—", "0")

In [20]:
#read the df
teams_record_df.head()

Unnamed: 0,W,L,W/L%,GB,PS/G,PA/G,SRS,Year,Team
0,56,26,0.683,0.0,111.5,105.7,5.22,1991,Boston Celtics
1,44,38,0.537,12.0,105.4,105.6,-0.39,1991,Philadelphia 76ers
2,39,43,0.476,17.0,103.1,103.3,-0.43,1991,New York Knicks
3,30,52,0.366,26.0,101.4,106.4,-4.84,1991,Washington Bullets
4,26,56,0.317,30.0,102.9,107.5,-4.53,1991,New Jersey Nets


In [21]:
#read the nba nicknames df 
nba_nicknames = pd.read_csv('nba_nicknames.txt')
nba_nicknames.head()

Unnamed: 0,Abbreviation,Name
0,ATL,Atlanta Hawks
1,BRK,Brooklyn Nets
2,BKN,Brooklyn Nets
3,BOS,Boston Celtics
4,CHA,Charlotte Bobcats


In [22]:
#creates abbreviation and name dictionary
abbreviation_dict = {row['Abbreviation']: row['Name'] for i, row in nba_nicknames.iterrows()}

In [23]:
#maps the team name to the abbreviations in the players df 
players_df["Team"] = players_df["Tm"].map(abbreviation_dict)
players_df = players_df.fillna(0)

In [24]:
#merge the cleaned players_df with the cleaned teams_record_df
cleaned_players_df = players_df.merge(teams_record_df,how="outer", on=["Team", "Year"])

#### Data Cleaning Combined DF

In [25]:
#read the df
cleaned_players_df.head()

Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,3P,3PA,3P%,2P,2PA,2P%,eFG%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,Year,Team,W,L,W/L%,GB,PS/G,PA/G,SRS
0,Alaa Abdelnaby,PF,22,POR,43,0,6.7,1.3,2.7,0.474,0.0,0.0,0.0,1.3,2.7,0.474,0.474,0.6,1.0,0.568,0.6,1.4,2.1,0.3,0.1,0.3,0.5,0.9,3.1,1991,Portland Trail Blazers,63,19,0.768,0,114.7,106.0,8.47
1,Danny Ainge,SG,31,POR,80,0,21.4,4.2,8.9,0.472,1.3,3.1,0.406,2.9,5.8,0.508,0.543,1.4,1.7,0.826,0.6,2.0,2.6,3.6,0.8,0.2,1.3,2.4,11.1,1991,Portland Trail Blazers,63,19,0.768,0,114.7,106.0,8.47
2,Mark Bryant,PF,25,POR,53,0,14.7,1.9,3.8,0.488,0.0,0.0,0.0,1.9,3.8,0.49,0.488,1.4,1.9,0.733,1.2,2.4,3.6,0.5,0.3,0.2,0.6,2.3,5.1,1991,Portland Trail Blazers,63,19,0.768,0,114.7,106.0,8.47
3,Wayne Cooper,C,34,POR,67,1,11.1,0.9,2.2,0.393,0.0,0.0,0.0,0.9,2.1,0.396,0.393,0.5,0.6,0.786,0.8,2.0,2.8,0.3,0.1,0.9,0.3,1.8,2.2,1991,Portland Trail Blazers,63,19,0.768,0,114.7,106.0,8.47
4,Walter Davis,SG,36,POR,71,14,20.9,5.7,12.1,0.468,0.2,0.5,0.306,5.5,11.6,0.475,0.474,1.5,1.6,0.915,1.0,1.5,2.5,1.8,1.1,0.0,1.2,2.1,13.0,1991,Portland Trail Blazers,63,19,0.768,0,114.7,106.0,8.47


In [26]:
#check for null values
for column in cleaned_players_df.columns:
    
    print("Column:", column)
    
    # Print count of null values for the column
    print("Null values:", cleaned_players_df[column].isnull().sum())
    
    print("\n" + "="*50 + "\n")


Column: Player
Null values: 0


Column: Pos
Null values: 0


Column: Age
Null values: 0


Column: Tm
Null values: 0


Column: G
Null values: 0


Column: GS
Null values: 0


Column: MP
Null values: 0


Column: FG
Null values: 0


Column: FGA
Null values: 0


Column: FG%
Null values: 0


Column: 3P
Null values: 0


Column: 3PA
Null values: 0


Column: 3P%
Null values: 0


Column: 2P
Null values: 0


Column: 2PA
Null values: 0


Column: 2P%
Null values: 0


Column: eFG%
Null values: 0


Column: FT
Null values: 0


Column: FTA
Null values: 0


Column: FT%
Null values: 0


Column: ORB
Null values: 0


Column: DRB
Null values: 0


Column: TRB
Null values: 0


Column: AST
Null values: 0


Column: STL
Null values: 0


Column: BLK
Null values: 0


Column: TOV
Null values: 0


Column: PF
Null values: 0


Column: PTS
Null values: 0


Column: Year
Null values: 0


Column: Team
Null values: 0


Column: W
Null values: 0


Column: L
Null values: 0


Column: W/L%
Null values: 0


Column: GB
Null value

In [27]:
#convert all dtypes to numeric values
cleaned_players_df = cleaned_players_df.apply(pd.to_numeric, errors="ignore")

In [28]:
#create a season column 

# Calculate the previous year
cleaned_players_df['Previous_Year'] = cleaned_players_df['Year'] - 1

# Create the modified "Year" column
cleaned_players_df['season'] = cleaned_players_df['Previous_Year'].astype(str) + '-' + cleaned_players_df['Year'].astype(str)

# Drop the columns that were used for calculations (optional)
cleaned_players_df.drop(['Previous_Year'], axis=1, inplace=True)

# Now the "Modified_Year" column represents the range of years 
cleaned_players_df.head()

Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,3P,3PA,3P%,2P,2PA,2P%,eFG%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,Year,Team,W,L,W/L%,GB,PS/G,PA/G,SRS,season
0,Alaa Abdelnaby,PF,22,POR,43,0,6.7,1.3,2.7,0.474,0.0,0.0,0.0,1.3,2.7,0.474,0.474,0.6,1.0,0.568,0.6,1.4,2.1,0.3,0.1,0.3,0.5,0.9,3.1,1991,Portland Trail Blazers,63,19,0.768,0.0,114.7,106.0,8.47,1990-1991
1,Danny Ainge,SG,31,POR,80,0,21.4,4.2,8.9,0.472,1.3,3.1,0.406,2.9,5.8,0.508,0.543,1.4,1.7,0.826,0.6,2.0,2.6,3.6,0.8,0.2,1.3,2.4,11.1,1991,Portland Trail Blazers,63,19,0.768,0.0,114.7,106.0,8.47,1990-1991
2,Mark Bryant,PF,25,POR,53,0,14.7,1.9,3.8,0.488,0.0,0.0,0.0,1.9,3.8,0.49,0.488,1.4,1.9,0.733,1.2,2.4,3.6,0.5,0.3,0.2,0.6,2.3,5.1,1991,Portland Trail Blazers,63,19,0.768,0.0,114.7,106.0,8.47,1990-1991
3,Wayne Cooper,C,34,POR,67,1,11.1,0.9,2.2,0.393,0.0,0.0,0.0,0.9,2.1,0.396,0.393,0.5,0.6,0.786,0.8,2.0,2.8,0.3,0.1,0.9,0.3,1.8,2.2,1991,Portland Trail Blazers,63,19,0.768,0.0,114.7,106.0,8.47,1990-1991
4,Walter Davis,SG,36,POR,71,14,20.9,5.7,12.1,0.468,0.2,0.5,0.306,5.5,11.6,0.475,0.474,1.5,1.6,0.915,1.0,1.5,2.5,1.8,1.1,0.0,1.2,2.1,13.0,1991,Portland Trail Blazers,63,19,0.768,0.0,114.7,106.0,8.47,1990-1991


In [29]:
#save it to a csv file
cleaned_players_df.to_csv("cleaned_players.csv")

In [30]:
cleaned_players_df.head()

Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,3P,3PA,3P%,2P,2PA,2P%,eFG%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,Year,Team,W,L,W/L%,GB,PS/G,PA/G,SRS,season
0,Alaa Abdelnaby,PF,22,POR,43,0,6.7,1.3,2.7,0.474,0.0,0.0,0.0,1.3,2.7,0.474,0.474,0.6,1.0,0.568,0.6,1.4,2.1,0.3,0.1,0.3,0.5,0.9,3.1,1991,Portland Trail Blazers,63,19,0.768,0.0,114.7,106.0,8.47,1990-1991
1,Danny Ainge,SG,31,POR,80,0,21.4,4.2,8.9,0.472,1.3,3.1,0.406,2.9,5.8,0.508,0.543,1.4,1.7,0.826,0.6,2.0,2.6,3.6,0.8,0.2,1.3,2.4,11.1,1991,Portland Trail Blazers,63,19,0.768,0.0,114.7,106.0,8.47,1990-1991
2,Mark Bryant,PF,25,POR,53,0,14.7,1.9,3.8,0.488,0.0,0.0,0.0,1.9,3.8,0.49,0.488,1.4,1.9,0.733,1.2,2.4,3.6,0.5,0.3,0.2,0.6,2.3,5.1,1991,Portland Trail Blazers,63,19,0.768,0.0,114.7,106.0,8.47,1990-1991
3,Wayne Cooper,C,34,POR,67,1,11.1,0.9,2.2,0.393,0.0,0.0,0.0,0.9,2.1,0.396,0.393,0.5,0.6,0.786,0.8,2.0,2.8,0.3,0.1,0.9,0.3,1.8,2.2,1991,Portland Trail Blazers,63,19,0.768,0.0,114.7,106.0,8.47,1990-1991
4,Walter Davis,SG,36,POR,71,14,20.9,5.7,12.1,0.468,0.2,0.5,0.306,5.5,11.6,0.475,0.474,1.5,1.6,0.915,1.0,1.5,2.5,1.8,1.1,0.0,1.2,2.1,13.0,1991,Portland Trail Blazers,63,19,0.768,0.0,114.7,106.0,8.47,1990-1991


In [31]:
#codes team with a number 
cleaned_players_df['Team Code'] = cleaned_players_df["Team"].astype('category').cat.codes

In [32]:
#codes player with a number 
cleaned_players_df['Player Code'] = cleaned_players_df["Player"].astype('category').cat.codes

In [33]:
#remove players that have only one season of data 
cleaned_players_df = cleaned_players_df.groupby('Player Code').filter(lambda x: x.shape[0]>1)

In [34]:
#read the df
cleaned_players_df.head()

Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,3P,3PA,3P%,2P,2PA,2P%,eFG%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,Year,Team,W,L,W/L%,GB,PS/G,PA/G,SRS,season,Team Code,Player Code
0,Alaa Abdelnaby,PF,22,POR,43,0,6.7,1.3,2.7,0.474,0.0,0.0,0.0,1.3,2.7,0.474,0.474,0.6,1.0,0.568,0.6,1.4,2.1,0.3,0.1,0.3,0.5,0.9,3.1,1991,Portland Trail Blazers,63,19,0.768,0.0,114.7,106.0,8.47,1990-1991,28,45
1,Danny Ainge,SG,31,POR,80,0,21.4,4.2,8.9,0.472,1.3,3.1,0.406,2.9,5.8,0.508,0.543,1.4,1.7,0.826,0.6,2.0,2.6,3.6,0.8,0.2,1.3,2.4,11.1,1991,Portland Trail Blazers,63,19,0.768,0.0,114.7,106.0,8.47,1990-1991,28,587
2,Mark Bryant,PF,25,POR,53,0,14.7,1.9,3.8,0.488,0.0,0.0,0.0,1.9,3.8,0.49,0.488,1.4,1.9,0.733,1.2,2.4,3.6,0.5,0.3,0.2,0.6,2.3,5.1,1991,Portland Trail Blazers,63,19,0.768,0.0,114.7,106.0,8.47,1990-1991,28,1857
3,Wayne Cooper,C,34,POR,67,1,11.1,0.9,2.2,0.393,0.0,0.0,0.0,0.9,2.1,0.396,0.393,0.5,0.6,0.786,0.8,2.0,2.8,0.3,0.1,0.9,0.3,1.8,2.2,1991,Portland Trail Blazers,63,19,0.768,0.0,114.7,106.0,8.47,1990-1991,28,2803
4,Walter Davis,SG,36,POR,71,14,20.9,5.7,12.1,0.468,0.2,0.5,0.306,5.5,11.6,0.475,0.474,1.5,1.6,0.915,1.0,1.5,2.5,1.8,1.1,0.0,1.2,2.1,13.0,1991,Portland Trail Blazers,63,19,0.768,0.0,114.7,106.0,8.47,1990-1991,28,2795


In [35]:
#n_season takes in a player and adds a columns of their next points  
def n_season(player):
    player = player.sort_values("Year")
    player["Next Pts"] = player["PTS"].shift(-1)
    return player
#applies n_season to teams_players_df
cleaned_players_df = cleaned_players_df.groupby("Player Code", group_keys = False).apply(n_season)
cleaned_players_df[['Player', 'Year', 'PTS', 'Next Pts']]

Unnamed: 0,Player,Year,PTS,Next Pts
0,Alaa Abdelnaby,1991,3.1,6.1
1,Danny Ainge,1991,11.1,9.7
2,Mark Bryant,1991,5.1,4.1
3,Wayne Cooper,1991,2.2,2.2
4,Walter Davis,1991,13.0,9.9
...,...,...,...,...
15241,Shake Milton,2023,8.4,
15242,Georges Niang,2023,8.2,
15243,Paul Reed,2023,4.2,
15244,Jaden Springer,2023,2.6,


In [36]:
#create a df containing data where the year is 2023
df_2023 = cleaned_players_df[cleaned_players_df['Year'] == 2023]
df_2023.head()

Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,3P,3PA,3P%,2P,2PA,2P%,eFG%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,Year,Team,W,L,W/L%,GB,PS/G,PA/G,SRS,season,Team Code,Player Code,Next Pts
14707,Precious Achiuwa,C,23,TOR,55,12,20.7,3.6,7.3,0.485,0.5,2.0,0.269,3.0,5.4,0.564,0.521,1.6,2.3,0.702,1.8,4.1,6.0,0.9,0.6,0.5,1.1,1.9,9.2,2023,Toronto Raptors,41,41,0.5,16.0,112.9,111.4,1.59,2022-2023,32,2188,
14708,OG Anunoby,SF,25,TOR,67,67,35.6,6.3,13.2,0.476,2.1,5.5,0.387,4.2,7.7,0.539,0.556,2.1,2.5,0.838,1.4,3.5,5.0,2.0,1.9,0.7,2.0,3.0,16.8,2023,Toronto Raptors,41,41,0.5,16.0,112.9,111.4,1.59,2022-2023,32,2098,
14709,Dalano Banton,PG,23,TOR,31,2,9.0,1.8,4.2,0.423,0.5,1.6,0.294,1.3,2.5,0.506,0.481,0.5,0.8,0.708,0.4,1.1,1.5,1.2,0.4,0.4,0.6,1.1,4.6,2023,Toronto Raptors,41,41,0.5,16.0,112.9,111.4,1.59,2022-2023,32,552,
14710,Scottie Barnes,SF,21,TOR,77,76,34.8,6.0,13.2,0.456,0.8,2.9,0.281,5.2,10.3,0.505,0.487,2.5,3.2,0.772,2.3,4.3,6.6,4.8,1.1,0.8,2.0,2.2,15.3,2023,Toronto Raptors,41,41,0.5,16.0,112.9,111.4,1.59,2022-2023,32,2416,
14711,Will Barton,SG,32,TOR,56,2,17.7,2.5,6.5,0.379,1.2,3.2,0.367,1.3,3.3,0.391,0.47,0.7,0.8,0.787,0.3,2.2,2.4,2.0,0.5,0.2,0.9,1.0,6.8,2023,Toronto Raptors,41,41,0.5,16.0,112.9,111.4,1.59,2022-2023,32,2815,


In [37]:
#copies dataframe with all years 
cleaned_players_df = cleaned_players_df.copy()

In [38]:
#drops na rows 
cleaned_players_df = cleaned_players_df.dropna()
cleaned_players_df.head()

Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,3P,3PA,3P%,2P,2PA,2P%,eFG%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,Year,Team,W,L,W/L%,GB,PS/G,PA/G,SRS,season,Team Code,Player Code,Next Pts
0,Alaa Abdelnaby,PF,22,POR,43,0,6.7,1.3,2.7,0.474,0.0,0.0,0.0,1.3,2.7,0.474,0.474,0.6,1.0,0.568,0.6,1.4,2.1,0.3,0.1,0.3,0.5,0.9,3.1,1991,Portland Trail Blazers,63,19,0.768,0.0,114.7,106.0,8.47,1990-1991,28,45,6.1
1,Danny Ainge,SG,31,POR,80,0,21.4,4.2,8.9,0.472,1.3,3.1,0.406,2.9,5.8,0.508,0.543,1.4,1.7,0.826,0.6,2.0,2.6,3.6,0.8,0.2,1.3,2.4,11.1,1991,Portland Trail Blazers,63,19,0.768,0.0,114.7,106.0,8.47,1990-1991,28,587,9.7
2,Mark Bryant,PF,25,POR,53,0,14.7,1.9,3.8,0.488,0.0,0.0,0.0,1.9,3.8,0.49,0.488,1.4,1.9,0.733,1.2,2.4,3.6,0.5,0.3,0.2,0.6,2.3,5.1,1991,Portland Trail Blazers,63,19,0.768,0.0,114.7,106.0,8.47,1990-1991,28,1857,4.1
3,Wayne Cooper,C,34,POR,67,1,11.1,0.9,2.2,0.393,0.0,0.0,0.0,0.9,2.1,0.396,0.393,0.5,0.6,0.786,0.8,2.0,2.8,0.3,0.1,0.9,0.3,1.8,2.2,1991,Portland Trail Blazers,63,19,0.768,0.0,114.7,106.0,8.47,1990-1991,28,2803,2.2
4,Walter Davis,SG,36,POR,71,14,20.9,5.7,12.1,0.468,0.2,0.5,0.306,5.5,11.6,0.475,0.474,1.5,1.6,0.915,1.0,1.5,2.5,1.8,1.1,0.0,1.2,2.1,13.0,1991,Portland Trail Blazers,63,19,0.768,0.0,114.7,106.0,8.47,1990-1991,28,2795,9.9


## Building the Machine Learning Model 

In [39]:
from sklearn.linear_model import Ridge
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.model_selection import TimeSeriesSplit, cross_val_score
from sklearn.preprocessing import MinMaxScaler

In [40]:
rr = Ridge(alpha=0.1)
split = TimeSeriesSplit(n_splits =5)
sfs = SequentialFeatureSelector(rr, n_features_to_select=27, direction="forward", cv=split, n_jobs=4)

In [41]:
#create a list of columns to remove
removed_cols = ["Next Pts", "Player", 'Tm', 'Team', 'Player Code', 'Team Code', 'Year', 'Pos', 'season']
#select all columns except for the removed ones 
selected_cols = cleaned_players_df.columns[~cleaned_players_df.columns.isin(removed_cols)]
selected_cols

Index(['Age', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%', '3P', '3PA', '3P%', '2P',
       '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB', 'DRB', 'TRB', 'AST',
       'STL', 'BLK', 'TOV', 'PF', 'PTS', 'W', 'L', 'W/L%', 'GB', 'PS/G',
       'PA/G', 'SRS'],
      dtype='object')

In [42]:
# Create a MinMaxScaler object
scaler = MinMaxScaler()
# Apply the MinMaxScaler to selected columns in the DataFrame and replace the original values with scaled values
cleaned_players_df.loc[:, selected_cols] = scaler.fit_transform(cleaned_players_df[selected_cols])
cleaned_players_df.head()

Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,3P,3PA,3P%,2P,2PA,2P%,eFG%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,Year,Team,W,L,W/L%,GB,PS/G,PA/G,SRS,season,Team Code,Player Code,Next Pts
0,Alaa Abdelnaby,PF,0.166667,POR,0.5,0.0,0.153318,0.102362,0.097122,0.474,0.0,0.0,0.0,0.107438,0.115385,0.474,0.316,0.058824,0.076336,0.568,0.088235,0.113821,0.112299,0.021127,0.033333,0.066667,0.087719,0.15,0.085873,1991,Portland Trail Blazers,0.848485,0.15873,0.844388,0.0,0.858639,0.476793,0.874245,1990-1991,28,45,6.1
1,Danny Ainge,SG,0.541667,POR,0.940476,0.0,0.489703,0.330709,0.320144,0.472,0.245283,0.234848,0.406,0.239669,0.247863,0.508,0.362,0.137255,0.129771,0.826,0.088235,0.162602,0.139037,0.253521,0.266667,0.044444,0.22807,0.4,0.307479,1991,Portland Trail Blazers,0.848485,0.15873,0.844388,0.0,0.858639,0.476793,0.874245,1990-1991,28,587,9.7
2,Mark Bryant,PF,0.291667,POR,0.619048,0.0,0.336384,0.149606,0.136691,0.488,0.0,0.0,0.0,0.157025,0.162393,0.49,0.325333,0.137255,0.145038,0.733,0.176471,0.195122,0.192513,0.035211,0.1,0.044444,0.105263,0.383333,0.141274,1991,Portland Trail Blazers,0.848485,0.15873,0.844388,0.0,0.858639,0.476793,0.874245,1990-1991,28,1857,4.1
3,Wayne Cooper,C,0.666667,POR,0.785714,0.012048,0.254005,0.070866,0.079137,0.393,0.0,0.0,0.0,0.07438,0.089744,0.396,0.262,0.04902,0.045802,0.786,0.117647,0.162602,0.149733,0.021127,0.033333,0.2,0.052632,0.3,0.060942,1991,Portland Trail Blazers,0.848485,0.15873,0.844388,0.0,0.858639,0.476793,0.874245,1990-1991,28,2803,2.2
4,Walter Davis,SG,0.75,POR,0.833333,0.168675,0.478261,0.448819,0.435252,0.468,0.037736,0.037879,0.306,0.454545,0.495726,0.475,0.316,0.147059,0.122137,0.915,0.147059,0.121951,0.13369,0.126761,0.366667,0.0,0.210526,0.35,0.360111,1991,Portland Trail Blazers,0.848485,0.15873,0.844388,0.0,0.858639,0.476793,0.874245,1990-1991,28,2795,9.9


In [43]:
#fit the model 
sfs.fit(cleaned_players_df[selected_cols],cleaned_players_df["Next Pts"])

SequentialFeatureSelector(cv=TimeSeriesSplit(gap=0, max_train_size=None, n_splits=5, test_size=None),
                          estimator=Ridge(alpha=0.1), n_features_to_select=27,
                          n_jobs=4)

In [44]:
#list of predictors 
predictors = list(selected_cols[sfs.get_support()])
predictors

['Age',
 'G',
 'MP',
 'FG',
 'FGA',
 'FG%',
 '3P',
 '3PA',
 '2P',
 '2P%',
 'eFG%',
 'FT',
 'FTA',
 'FT%',
 'ORB',
 'DRB',
 'TRB',
 'AST',
 'STL',
 'BLK',
 'TOV',
 'PF',
 'PTS',
 'L',
 'W/L%',
 'GB',
 'PS/G']

In [45]:
#create a backtest method
def backtest(data, model, predictors, start=5, step=1):
    # Prediction for a single year
    all_pred = []
    years = sorted(data["Year"].unique())
    # Starts from the fifth year and increases by one each year each time
    for i in range(start, len(years), step):
        current_year = years[i]
        # Trains all data up to the current year
        train = data[data["Year"] < current_year]
        # Test the data for that year
        test = data[data["Year"] == current_year]
        # Fits the model
        model.fit(train[predictors], train["Next Pts"])
        # Makes the prediction
        preds = model.predict(test[predictors])
        # Round predicted values to one decimal place
        preds = np.round(preds, 1)
        # Creates a DataFrame with columns actual and prediction
        preds = pd.Series(preds, index=test.index)
        combined = pd.concat([test["Next Pts"], preds], axis=1)
        combined.columns = ["actual", "prediction"]
        
        all_pred.append(combined)
    return pd.concat(all_pred, axis=0)


In [46]:
#calls backtest function 
predictions = backtest(cleaned_players_df, rr,predictors)
predictions

Unnamed: 0,actual,prediction
1973,13.7,17.3
1975,16.6,11.4
1976,21.9,9.8
1977,0.3,3.4
1979,6.2,4.7
...,...,...
14702,3.0,4.4
14703,12.6,11.2
14704,14.8,10.4
14705,9.1,10.0


In [47]:
from sklearn.metrics import mean_squared_error

#calculates the mean squared error
mean_squared_error(predictions["actual"], predictions["prediction"])

8.854450688395616

In [48]:
from sklearn.metrics import mean_absolute_error

# Calculate MAE
mae = mean_absolute_error(predictions["actual"], predictions["prediction"])
print("MAE:", mae)

MAE: 2.2892104523742627


In [49]:
#describe the next pts columns
cleaned_players_df["Next Pts"].describe()

count    12363.000000
mean         8.942166
std          6.141485
min          0.000000
25%          4.200000
50%          7.500000
75%         12.500000
max         36.100000
Name: Next Pts, dtype: float64

In [50]:
def player_hist(df):
    df = df.sort_values("Year")
    df["player_year"] = range(0, df.shape[0])
    
    # Find the player's peak
    peak_year = df['PTS'].idxmax()
    peak_value = df.loc[peak_year, 'PTS']
    
    # Calculate the correlation between player_year and Pts, using the peak as a reference
    correlations = df[["player_year", 'PTS']].expanding().corr().loc[(slice(None), 'player_year'), 'PTS'].values
    df["Pts corr"] = correlations
    df["Pts corr"].fillna(1, inplace=True)
    
    # Adjust the correlation value by how close it is to the player's peak and which side of the peak the player is on
    proximity_to_peak = 1 - abs(peak_year - df['player_year']) / df.shape[0]
    performance_relative_to_peak = np.sign(peak_value - df['PTS'])
    adjusted_corr = correlations * proximity_to_peak * performance_relative_to_peak
    df["Adj Pts corr"] = adjusted_corr
    
    df["Pts diff"] = df['PTS'] / df["PTS"].shift(1)
    df["Pts diff"].fillna(1, inplace=True)
    df["Pts diff"][df["Pts diff"] == np.inf] = 1
    
    return df

# Assuming 'teams_players_df' is your DataFrame containing player data
cleaned_players_df = cleaned_players_df.groupby('Player Code', group_keys=False).apply(player_hist)

In [51]:
#fill null values with 0
cleaned_players_df= cleaned_players_df.fillna(0)

In [52]:
#compares players points to the average points score in that year  
def group_avg(df):
    return df["PTS"] / df["PTS"].mean()
cleaned_players_df["pts_year"] = cleaned_players_df.groupby("Year",group_keys=False).apply(group_avg)

In [53]:
#read the cleaned_players_df
cleaned_players_df.head()

Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,3P,3PA,3P%,2P,2PA,2P%,eFG%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,Year,Team,W,L,W/L%,GB,PS/G,PA/G,SRS,season,Team Code,Player Code,Next Pts,player_year,Pts corr,Adj Pts corr,Pts diff,pts_year
0,Alaa Abdelnaby,PF,0.166667,POR,0.5,0.0,0.153318,0.102362,0.097122,0.474,0.0,0.0,0.0,0.107438,0.115385,0.474,0.316,0.058824,0.076336,0.568,0.088235,0.113821,0.112299,0.021127,0.033333,0.066667,0.087719,0.15,0.085873,1991,Portland Trail Blazers,0.848485,0.15873,0.844388,0.0,0.858639,0.476793,0.874245,1990-1991,28,45,6.1,0,1.0,0.0,1.0,0.315688
1,Danny Ainge,SG,0.541667,POR,0.940476,0.0,0.489703,0.330709,0.320144,0.472,0.245283,0.234848,0.406,0.239669,0.247863,0.508,0.362,0.137255,0.129771,0.826,0.088235,0.162602,0.139037,0.253521,0.266667,0.044444,0.22807,0.4,0.307479,1991,Portland Trail Blazers,0.848485,0.15873,0.844388,0.0,0.858639,0.476793,0.874245,1990-1991,28,587,9.7,0,1.0,0.0,1.0,1.130367
2,Mark Bryant,PF,0.291667,POR,0.619048,0.0,0.336384,0.149606,0.136691,0.488,0.0,0.0,0.0,0.157025,0.162393,0.49,0.325333,0.137255,0.145038,0.733,0.176471,0.195122,0.192513,0.035211,0.1,0.044444,0.105263,0.383333,0.141274,1991,Portland Trail Blazers,0.848485,0.15873,0.844388,0.0,0.858639,0.476793,0.874245,1990-1991,28,1857,4.1,0,1.0,0.0,1.0,0.519358
3,Wayne Cooper,C,0.666667,POR,0.785714,0.012048,0.254005,0.070866,0.079137,0.393,0.0,0.0,0.0,0.07438,0.089744,0.396,0.262,0.04902,0.045802,0.786,0.117647,0.162602,0.149733,0.021127,0.033333,0.2,0.052632,0.3,0.060942,1991,Portland Trail Blazers,0.848485,0.15873,0.844388,0.0,0.858639,0.476793,0.874245,1990-1991,28,2803,2.2,0,1.0,0.0,1.0,0.224037
4,Walter Davis,SG,0.75,POR,0.833333,0.168675,0.478261,0.448819,0.435252,0.468,0.037736,0.037879,0.306,0.454545,0.495726,0.475,0.316,0.147059,0.122137,0.915,0.147059,0.121951,0.13369,0.126761,0.366667,0.0,0.210526,0.35,0.360111,1991,Portland Trail Blazers,0.848485,0.15873,0.844388,0.0,0.858639,0.476793,0.874245,1990-1991,28,2795,9.9,0,1.0,0.0,1.0,1.323853


In [54]:
#Sort the dataframe by Player and Year
cleaned_players_df = cleaned_players_df.sort_values(by=['Player', 'Year'])
#checks if the player missed he previous year
cleaned_players_df['Missing_Prev_Year'] = cleaned_players_df.groupby('Player')['Year'].diff() > 1
#Convert boolean values to 1 and 0
cleaned_players_df['Missing_Prev_Year'] = cleaned_players_df['Missing_Prev_Year'].astype(int)

cleaned_players_df.head()

Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,3P,3PA,3P%,2P,2PA,2P%,eFG%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,Year,Team,W,L,W/L%,GB,PS/G,PA/G,SRS,season,Team Code,Player Code,Next Pts,player_year,Pts corr,Adj Pts corr,Pts diff,pts_year,Missing_Prev_Year
336,A.C. Green,PF,0.375,LAL,0.964286,0.253012,0.604119,0.244094,0.23741,0.476,0.018868,0.05303,0.2,0.247934,0.252137,0.507,0.324,0.264706,0.282443,0.738,0.367647,0.308943,0.336898,0.06338,0.233333,0.066667,0.210526,0.233333,0.252078,1991,Los Angeles Lakers,0.772727,0.238095,0.766582,0.089286,0.638743,0.341772,0.808535,1990-1991,14,0,13.6,0,1.0,0.0,1.0,0.926697,0
707,A.C. Green,PF,0.416667,LAL,0.964286,0.638554,0.810069,0.370079,0.352518,0.476,0.018868,0.05303,0.214,0.371901,0.388889,0.495,0.322,0.401961,0.427481,0.744,0.544118,0.455285,0.497326,0.098592,0.366667,0.088889,0.245614,0.283333,0.376731,1992,Los Angeles Lakers,0.545455,0.47619,0.533163,0.25,0.484293,0.381857,0.518505,1991-1992,14,0,12.8,1,1.0,-122.0,1.494505,1.399194,0
1077,A.C. Green,PF,0.458333,LAL,0.964286,0.662651,0.787185,0.362205,0.309353,0.537,0.037736,0.045455,0.348,0.363636,0.34188,0.55,0.365333,0.333333,0.351145,0.739,0.514706,0.422764,0.465241,0.098592,0.366667,0.111111,0.245614,0.3,0.354571,1993,Los Angeles Lakers,0.484848,0.539683,0.471939,0.410714,0.58377,0.466245,0.509063,1992-1993,14,0,14.7,2,0.77061,-93.937406,0.941176,1.310308,0
1231,A.C. Green,PF,0.5,PHO,0.964286,0.662651,0.789474,0.448819,0.406475,0.502,0.018868,0.030303,0.229,0.46281,0.465812,0.513,0.337333,0.313725,0.335878,0.735,0.5,0.471545,0.491979,0.119718,0.3,0.111111,0.210526,0.283333,0.407202,1994,Phoenix Suns,0.742424,0.269841,0.735969,0.125,0.688482,0.421941,0.731118,1993-1994,27,0,11.2,3,0.850631,-0.0,1.148437,1.615333,0
1633,A.C. Green,SF,0.541667,PHO,0.964286,0.626506,0.750572,0.299213,0.269784,0.504,0.09434,0.113636,0.339,0.272727,0.25641,0.547,0.359333,0.303922,0.320611,0.732,0.352941,0.471545,0.438503,0.105634,0.233333,0.088889,0.245614,0.3,0.310249,1995,Phoenix Suns,0.787879,0.222222,0.783163,0.0,0.751309,0.493671,0.700151,1994-1995,27,0,7.5,4,0.383014,-46.612751,0.761905,1.202233,0


In [55]:
# Sort the dataframe by Player and Year
cleaned_players_df = cleaned_players_df.sort_values(by=['Player', 'Year'])

# Calculate the total games played by each player over all years
total_games_played = cleaned_players_df.groupby('Player')['G'].sum().reset_index()
total_games_played.rename(columns={'G': 'Total_Games_Played'}, inplace=True)
cleaned_players_df = pd.merge(cleaned_players_df, total_games_played, on='Player', how='left')

#Calculate the average games played per year for each player
cleaned_players_df['Avg_Games_Per_Year'] = cleaned_players_df['Total_Games_Played'] / cleaned_players_df.groupby('Player')['Year'].transform('nunique')

#Calculate the change in games played compared to the previous year
cleaned_players_df['Change_Games_Played'] = cleaned_players_df.groupby('Player')['G'].diff()

cleaned_players_df.head()


Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,3P,3PA,3P%,2P,2PA,2P%,eFG%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,Year,Team,W,L,W/L%,GB,PS/G,PA/G,SRS,season,Team Code,Player Code,Next Pts,player_year,Pts corr,Adj Pts corr,Pts diff,pts_year,Missing_Prev_Year,Total_Games_Played,Avg_Games_Per_Year,Change_Games_Played
0,A.C. Green,PF,0.375,LAL,0.964286,0.253012,0.604119,0.244094,0.23741,0.476,0.018868,0.05303,0.2,0.247934,0.252137,0.507,0.324,0.264706,0.282443,0.738,0.367647,0.308943,0.336898,0.06338,0.233333,0.066667,0.210526,0.233333,0.252078,1991,Los Angeles Lakers,0.772727,0.238095,0.766582,0.089286,0.638743,0.341772,0.808535,1990-1991,14,0,13.6,0,1.0,0.0,1.0,0.926697,0,9.27381,0.927381,
1,A.C. Green,PF,0.416667,LAL,0.964286,0.638554,0.810069,0.370079,0.352518,0.476,0.018868,0.05303,0.214,0.371901,0.388889,0.495,0.322,0.401961,0.427481,0.744,0.544118,0.455285,0.497326,0.098592,0.366667,0.088889,0.245614,0.283333,0.376731,1992,Los Angeles Lakers,0.545455,0.47619,0.533163,0.25,0.484293,0.381857,0.518505,1991-1992,14,0,12.8,1,1.0,-122.0,1.494505,1.399194,0,9.27381,0.927381,0.0
2,A.C. Green,PF,0.458333,LAL,0.964286,0.662651,0.787185,0.362205,0.309353,0.537,0.037736,0.045455,0.348,0.363636,0.34188,0.55,0.365333,0.333333,0.351145,0.739,0.514706,0.422764,0.465241,0.098592,0.366667,0.111111,0.245614,0.3,0.354571,1993,Los Angeles Lakers,0.484848,0.539683,0.471939,0.410714,0.58377,0.466245,0.509063,1992-1993,14,0,14.7,2,0.77061,-93.937406,0.941176,1.310308,0,9.27381,0.927381,0.0
3,A.C. Green,PF,0.5,PHO,0.964286,0.662651,0.789474,0.448819,0.406475,0.502,0.018868,0.030303,0.229,0.46281,0.465812,0.513,0.337333,0.313725,0.335878,0.735,0.5,0.471545,0.491979,0.119718,0.3,0.111111,0.210526,0.283333,0.407202,1994,Phoenix Suns,0.742424,0.269841,0.735969,0.125,0.688482,0.421941,0.731118,1993-1994,27,0,11.2,3,0.850631,-0.0,1.148437,1.615333,0,9.27381,0.927381,0.0
4,A.C. Green,SF,0.541667,PHO,0.964286,0.626506,0.750572,0.299213,0.269784,0.504,0.09434,0.113636,0.339,0.272727,0.25641,0.547,0.359333,0.303922,0.320611,0.732,0.352941,0.471545,0.438503,0.105634,0.233333,0.088889,0.245614,0.3,0.310249,1995,Phoenix Suns,0.787879,0.222222,0.783163,0.0,0.751309,0.493671,0.700151,1994-1995,27,0,7.5,4,0.383014,-46.612751,0.761905,1.202233,0,9.27381,0.927381,0.0


In [56]:
#Sort the dataframe by Player and Year
cleaned_players_df = cleaned_players_df.sort_values(by=['Player', 'Year'])

#Calculate the change in games played compared to the previous year
cleaned_players_df['Change_Games_Played'] = cleaned_players_df.groupby('Player')['G'].diff()

#Define a threshold to identify a significant drop in games played
significant_drop_threshold = -0.5  

# Create a binary feature indicating significant drop in games played
cleaned_players_df['Significant_Drop_Games_Played'] = (cleaned_players_df['Change_Games_Played'] <= significant_drop_threshold).astype(int)

cleaned_players_df.head()


Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,3P,3PA,3P%,2P,2PA,2P%,eFG%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,Year,Team,W,L,W/L%,GB,PS/G,PA/G,SRS,season,Team Code,Player Code,Next Pts,player_year,Pts corr,Adj Pts corr,Pts diff,pts_year,Missing_Prev_Year,Total_Games_Played,Avg_Games_Per_Year,Change_Games_Played,Significant_Drop_Games_Played
0,A.C. Green,PF,0.375,LAL,0.964286,0.253012,0.604119,0.244094,0.23741,0.476,0.018868,0.05303,0.2,0.247934,0.252137,0.507,0.324,0.264706,0.282443,0.738,0.367647,0.308943,0.336898,0.06338,0.233333,0.066667,0.210526,0.233333,0.252078,1991,Los Angeles Lakers,0.772727,0.238095,0.766582,0.089286,0.638743,0.341772,0.808535,1990-1991,14,0,13.6,0,1.0,0.0,1.0,0.926697,0,9.27381,0.927381,,0
1,A.C. Green,PF,0.416667,LAL,0.964286,0.638554,0.810069,0.370079,0.352518,0.476,0.018868,0.05303,0.214,0.371901,0.388889,0.495,0.322,0.401961,0.427481,0.744,0.544118,0.455285,0.497326,0.098592,0.366667,0.088889,0.245614,0.283333,0.376731,1992,Los Angeles Lakers,0.545455,0.47619,0.533163,0.25,0.484293,0.381857,0.518505,1991-1992,14,0,12.8,1,1.0,-122.0,1.494505,1.399194,0,9.27381,0.927381,0.0,0
2,A.C. Green,PF,0.458333,LAL,0.964286,0.662651,0.787185,0.362205,0.309353,0.537,0.037736,0.045455,0.348,0.363636,0.34188,0.55,0.365333,0.333333,0.351145,0.739,0.514706,0.422764,0.465241,0.098592,0.366667,0.111111,0.245614,0.3,0.354571,1993,Los Angeles Lakers,0.484848,0.539683,0.471939,0.410714,0.58377,0.466245,0.509063,1992-1993,14,0,14.7,2,0.77061,-93.937406,0.941176,1.310308,0,9.27381,0.927381,0.0,0
3,A.C. Green,PF,0.5,PHO,0.964286,0.662651,0.789474,0.448819,0.406475,0.502,0.018868,0.030303,0.229,0.46281,0.465812,0.513,0.337333,0.313725,0.335878,0.735,0.5,0.471545,0.491979,0.119718,0.3,0.111111,0.210526,0.283333,0.407202,1994,Phoenix Suns,0.742424,0.269841,0.735969,0.125,0.688482,0.421941,0.731118,1993-1994,27,0,11.2,3,0.850631,-0.0,1.148437,1.615333,0,9.27381,0.927381,0.0,0
4,A.C. Green,SF,0.541667,PHO,0.964286,0.626506,0.750572,0.299213,0.269784,0.504,0.09434,0.113636,0.339,0.272727,0.25641,0.547,0.359333,0.303922,0.320611,0.732,0.352941,0.471545,0.438503,0.105634,0.233333,0.088889,0.245614,0.3,0.310249,1995,Phoenix Suns,0.787879,0.222222,0.783163,0.0,0.751309,0.493671,0.700151,1994-1995,27,0,7.5,4,0.383014,-46.612751,0.761905,1.202233,0,9.27381,0.927381,0.0,0


In [57]:
#Calculate the overall injury metric
cleaned_players_df['Overall_Injury_Metric'] = (
    cleaned_players_df['Missing_Prev_Year'] * 0.2 +
    cleaned_players_df['Total_Games_Played'] * 0.1 +
    cleaned_players_df['Avg_Games_Per_Year'] * 0.1 +
    cleaned_players_df['Change_Games_Played'] * 0.3 +
    cleaned_players_df['Significant_Drop_Games_Played'] * 0.3
)

# Normalize the overall injury metric to the range [0, 1]
cleaned_players_df['Normalized_Injury_Metric'] = (cleaned_players_df['Overall_Injury_Metric'] - cleaned_players_df['Overall_Injury_Metric'].min()) / (cleaned_players_df['Overall_Injury_Metric'].max() - cleaned_players_df['Overall_Injury_Metric'].min())

cleaned_players_df.head()


Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,3P,3PA,3P%,2P,2PA,2P%,eFG%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,Year,Team,W,L,W/L%,GB,PS/G,PA/G,SRS,season,Team Code,Player Code,Next Pts,player_year,Pts corr,Adj Pts corr,Pts diff,pts_year,Missing_Prev_Year,Total_Games_Played,Avg_Games_Per_Year,Change_Games_Played,Significant_Drop_Games_Played,Overall_Injury_Metric,Normalized_Injury_Metric
0,A.C. Green,PF,0.375,LAL,0.964286,0.253012,0.604119,0.244094,0.23741,0.476,0.018868,0.05303,0.2,0.247934,0.252137,0.507,0.324,0.264706,0.282443,0.738,0.367647,0.308943,0.336898,0.06338,0.233333,0.066667,0.210526,0.233333,0.252078,1991,Los Angeles Lakers,0.772727,0.238095,0.766582,0.089286,0.638743,0.341772,0.808535,1990-1991,14,0,13.6,0,1.0,0.0,1.0,0.926697,0,9.27381,0.927381,,0,,
1,A.C. Green,PF,0.416667,LAL,0.964286,0.638554,0.810069,0.370079,0.352518,0.476,0.018868,0.05303,0.214,0.371901,0.388889,0.495,0.322,0.401961,0.427481,0.744,0.544118,0.455285,0.497326,0.098592,0.366667,0.088889,0.245614,0.283333,0.376731,1992,Los Angeles Lakers,0.545455,0.47619,0.533163,0.25,0.484293,0.381857,0.518505,1991-1992,14,0,12.8,1,1.0,-122.0,1.494505,1.399194,0,9.27381,0.927381,0.0,0,1.020119,0.542604
2,A.C. Green,PF,0.458333,LAL,0.964286,0.662651,0.787185,0.362205,0.309353,0.537,0.037736,0.045455,0.348,0.363636,0.34188,0.55,0.365333,0.333333,0.351145,0.739,0.514706,0.422764,0.465241,0.098592,0.366667,0.111111,0.245614,0.3,0.354571,1993,Los Angeles Lakers,0.484848,0.539683,0.471939,0.410714,0.58377,0.466245,0.509063,1992-1993,14,0,14.7,2,0.77061,-93.937406,0.941176,1.310308,0,9.27381,0.927381,0.0,0,1.020119,0.542604
3,A.C. Green,PF,0.5,PHO,0.964286,0.662651,0.789474,0.448819,0.406475,0.502,0.018868,0.030303,0.229,0.46281,0.465812,0.513,0.337333,0.313725,0.335878,0.735,0.5,0.471545,0.491979,0.119718,0.3,0.111111,0.210526,0.283333,0.407202,1994,Phoenix Suns,0.742424,0.269841,0.735969,0.125,0.688482,0.421941,0.731118,1993-1994,27,0,11.2,3,0.850631,-0.0,1.148437,1.615333,0,9.27381,0.927381,0.0,0,1.020119,0.542604
4,A.C. Green,SF,0.541667,PHO,0.964286,0.626506,0.750572,0.299213,0.269784,0.504,0.09434,0.113636,0.339,0.272727,0.25641,0.547,0.359333,0.303922,0.320611,0.732,0.352941,0.471545,0.438503,0.105634,0.233333,0.088889,0.245614,0.3,0.310249,1995,Phoenix Suns,0.787879,0.222222,0.783163,0.0,0.751309,0.493671,0.700151,1994-1995,27,0,7.5,4,0.383014,-46.612751,0.761905,1.202233,0,9.27381,0.927381,0.0,0,1.020119,0.542604


In [58]:
#Remove rows where the name column contains "Khyri Thomas"
cleaned_players_df = cleaned_players_df[~cleaned_players_df['Player'].str.contains("Khyri Thomas")]

In [59]:
#fill null values with 0
cleaned_players_df = cleaned_players_df.fillna(0)

In [60]:
#get the new_predictors
new_predictors = predictors + ['player_year', 'Pts diff', 'pts_year', 'Pts corr', 'Missing_Prev_Year', 'Avg_Games_Per_Year', 'Change_Games_Played', 'Total_Games_Played', 'Significant_Drop_Games_Played', 'Normalized_Injury_Metric', 'Adj Pts corr']
new_predictors

['Age',
 'G',
 'MP',
 'FG',
 'FGA',
 'FG%',
 '3P',
 '3PA',
 '2P',
 '2P%',
 'eFG%',
 'FT',
 'FTA',
 'FT%',
 'ORB',
 'DRB',
 'TRB',
 'AST',
 'STL',
 'BLK',
 'TOV',
 'PF',
 'PTS',
 'L',
 'W/L%',
 'GB',
 'PS/G',
 'player_year',
 'Pts diff',
 'pts_year',
 'Pts corr',
 'Missing_Prev_Year',
 'Avg_Games_Per_Year',
 'Change_Games_Played',
 'Total_Games_Played',
 'Significant_Drop_Games_Played',
 'Normalized_Injury_Metric',
 'Adj Pts corr']

In [61]:
#makes the predictions 
predictions = backtest(cleaned_players_df, rr, new_predictors)
predictions

Unnamed: 0,actual,prediction
5,7.2,7.0
48,5.2,12.3
81,4.0,5.4
88,3.8,6.9
115,2.2,1.6
...,...,...
12265,5.6,5.3
12269,11.6,7.3
12277,24.8,22.1
12310,5.2,5.7


In [62]:
print(mean_squared_error(predictions["actual"], predictions["prediction"]))


7.842888056206089


In [63]:
mae = mean_absolute_error(predictions["actual"], predictions["prediction"])
print("MAE:", mae)

MAE: 2.1238688524590166


In [64]:
#merge the players df and the predictions fg 
merged = predictions.merge(cleaned_players_df, left_index=True, right_index=True)
merged["diff"] = (predictions["actual"] - predictions["prediction"]).abs()
#checks the difference between actual and predicted in descending order
merge = merged[['Player', 'prediction', 'actual', 'Next Pts', 'diff', "Team", 'Year']].sort_values("diff",ascending=False)

In [65]:
#create the "Year_Predicted" column
merge['Year_Predicted'] = merge['Year'] + 1
merge['Year_Predicted'] = merge['Year'].astype(str) + '-' + merge['Year_Predicted'].astype(str)


In [66]:
#take out outliers
merge = merge[merge['prediction'] >= 0]

In [67]:
merge

Unnamed: 0,Player,prediction,actual,Next Pts,diff,Team,Year,Year_Predicted
4498,Gordon Hayward,19.3,2.0,2.0,17.3,Utah Jazz,2017,2017-2018
7823,MarShon Brooks,3.0,20.1,20.1,17.1,Los Angeles Lakers,2014,2014-2015
5024,JaKarr Sampson,3.6,20.0,20.0,16.4,Sacramento Kings,2018,2018-2019
341,Amar'e Stoudemire,25.0,8.7,8.7,16.3,Phoenix Suns,2005,2005-2006
10721,Skylar Mays,0.3,15.3,15.3,15.0,Atlanta Hawks,2022,2022-2023
...,...,...,...,...,...,...,...,...
10656,Shawn Kemp,18.0,18.0,18.0,0.0,Seattle SuperSonics,1997,1997-1998
10420,Scottie Pippen,19.1,19.1,19.1,0.0,Chicago Bulls,1997,1997-1998
10380,Scott Burrell,5.2,5.2,5.2,0.0,Golden State Warriors,1997,1997-1998
10104,Rony Seikaly,13.3,13.3,13.3,0.0,Orlando Magic,1997,1997-1998


## Predict 2023-2024 Players Data

In [68]:
#read in the data for prediction
new_data_for_prediction = pd.read_csv("new_data_2023-2024.csv")

In [69]:
#delete the unessary columns
del new_data_for_prediction['Unnamed: 0']

In [70]:
#get the new_predcitors
new_predictors

['Age',
 'G',
 'MP',
 'FG',
 'FGA',
 'FG%',
 '3P',
 '3PA',
 '2P',
 '2P%',
 'eFG%',
 'FT',
 'FTA',
 'FT%',
 'ORB',
 'DRB',
 'TRB',
 'AST',
 'STL',
 'BLK',
 'TOV',
 'PF',
 'PTS',
 'L',
 'W/L%',
 'GB',
 'PS/G',
 'player_year',
 'Pts diff',
 'pts_year',
 'Pts corr',
 'Missing_Prev_Year',
 'Avg_Games_Per_Year',
 'Change_Games_Played',
 'Total_Games_Played',
 'Significant_Drop_Games_Played',
 'Normalized_Injury_Metric',
 'Adj Pts corr']

In [71]:
#predict 2023-2024 points 
predicted_next_pts = rr.predict(new_data_for_prediction[new_predictors])

In [72]:
#round the predicted points to 1 decimal place
predicted_next_pts = np.round(predicted_next_pts, 1)

In [73]:
#create a df
predictions_2023_2024 = pd.DataFrame({
    "actual": np.nan,  # Since the season hasn't happened yet
    "prediction": predicted_next_pts
})

In [74]:
predictions_2023_2024.head()

Unnamed: 0,actual,prediction
0,,14.5
1,,4.1
2,,8.1
3,,3.0
4,,3.6


In [75]:
#merge the players df and the predictions fg 
new_merged = predictions_2023_2024.merge(new_data_for_prediction, left_index=True, right_index=True)
new_merged["diff"] = (predictions_2023_2024["actual"] - predictions_2023_2024["prediction"]).abs()
#checks the difference between actual and predicted in descending order
new_merge = new_merged[['Player', 'prediction', 'actual', 'Next Pts', 'diff', "Team", 'Year']].sort_values("diff",ascending=False)

In [76]:
#remove all outliers
new_merge =  new_merge[new_merge['prediction'] >= 0]

In [77]:
#Create the Year_Predicted column
new_merge['Year_Predicted'] = new_merge['Year'] + 1
new_merge['Year_Predicted'] = new_merge['Year'].astype(str) + '-' + new_merge['Year_Predicted'].astype(str)

In [78]:
#clean the numbers
new_merge['prediction'] = new_merge['prediction'].replace(-0.0, 0.0)

In [79]:
#sort the df
new_merge.sort_values("prediction",ascending=False).reset_index()

Unnamed: 0,index,Player,prediction,actual,Next Pts,diff,Team,Year,Year_Predicted
0,286,Luka Dončić,30.1,,,,Dallas Mavericks,2023,2023-2024
1,211,Joel Embiid,29.6,,,,Philadelphia 76ers,2023,2023-2024
2,143,Giannis Antetokounmpo,28.6,,,,Milwaukee Bucks,2023,2023-2024
3,389,Shai Gilgeous-Alexander,28.2,,,,Oklahoma City Thunder,2023,2023-2024
4,74,Damian Lillard,28.1,,,,Portland Trail Blazers,2023,2023-2024
...,...,...,...,...,...,...,...,...,...
437,127,Facundo Campazzo,0.5,,,,Dallas Mavericks,2023,2023-2024
438,344,Olivier Sarr,0.4,,,,Oklahoma City Thunder,2023,2023-2024
439,106,Devon Dotson,0.3,,,,Washington Wizards,2023,2023-2024
440,408,Terry Taylor,0.1,,,,Chicago Bulls,2023,2023-2024


In [80]:
#merge the predictions 
full_merge =  pd.concat([merge, new_merge], ignore_index=True)

In [81]:
full_merge['diff'] = round(full_merge['diff'], 1)

In [85]:
full_merge = full_merge.sort_values("Year_Predicted",ascending=True)

In [86]:
#view the full merge
full_merge.to_csv("full_merge.csv")

In [87]:
full_merge

Unnamed: 0,Player,prediction,actual,Next Pts,diff,Team,Year,Year_Predicted
5822,Michael Finley,16.5,15.0,15.0,1.5,Phoenix Suns,1996,1996-1997
1264,Derek Strong,4.2,8.5,8.5,4.3,Los Angeles Lakers,1996,1996-1997
3004,Doug West,5.0,7.8,7.8,2.8,Minnesota Timberwolves,1996,1996-1997
1829,Voshon Lenard,8.7,12.3,12.3,3.6,Miami Heat,1996,1996-1997
9982,Sam Perkins,11.2,11.0,11.0,0.2,Seattle SuperSonics,1996,1996-1997
...,...,...,...,...,...,...,...,...
10760,Giannis Antetokounmpo,28.6,,,,Milwaukee Bucks,2023,2023-2024
10759,Georges Niang,7.8,,,,Philadelphia 76ers,2023,2023-2024
10758,George Hill,3.6,,,,Indiana Pacers,2023,2023-2024
10769,Harrison Barnes,13.5,,,,Sacramento Kings,2023,2023-2024


In [97]:
merge = merge.sort_values("Year_Predicted",ascending=True)

In [98]:
merge


Unnamed: 0,Player,prediction,actual,Next Pts,diff,Team,Year,Year_Predicted
4806,Howard Eisley,7.5,4.5,4.5,3.0,Utah Jazz,1996,1996-1997
11798,Tyrone Hill,8.3,12.9,12.9,4.6,Cleveland Cavaliers,1996,1996-1997
3047,Dell Curry,13.4,14.8,14.8,1.4,Charlotte Hornets,1996,1996-1997
11559,Tracy Murray,14.0,10.0,10.0,4.0,Toronto Raptors,1996,1996-1997
1598,Calbert Cheaney,14.6,10.6,10.6,4.0,Washington Bullets,1996,1996-1997
...,...,...,...,...,...,...,...,...
6007,Joe Wieskamp,3.2,1.0,1.0,2.2,San Antonio Spurs,2022,2022-2023
5590,Jay Huff,2.2,7.3,7.3,5.1,Los Angeles Lakers,2022,2022-2023
7196,Kyle Lowry,10.3,11.2,11.2,0.9,Miami Heat,2022,2022-2023
1938,Chris Boucher,8.6,9.4,9.4,0.8,Toronto Raptors,2022,2022-2023


In [99]:
cleaned_players_new = pd.read_csv("cleaned_players.csv")

In [102]:
merge.to_csv("prediction_2022.csv")

In [104]:
new_merge.to_csv("prediction_2023.csv")