# **Predicting NBA MVP**

**📍 Contributors**
<br>
- Evgeny Smirnov
- Matteo Torlone
- Steven Wilson

<br>
The objective of this project is to develop a predictive model that can accurately forecast the Most Valuable Player (MVP) of the NBA based on a combination of individual player performance metrics, team performance metrics, and other relevant factors.

### **Imports**

In [None]:
import pandas as pd
import itertools
import numpy as np

# Season to process, written as 'YYYY-YY'. transform_data() reads this value
# to keep only the MVP-table rows whose 'Season' matches it.
season = '2021-22'

### **Extract Data**

In [None]:

# Data hosted on GitHub.
# The season-specific file names are built from `season` so that changing the
# season in one place switches every table consistently (previously the URLs
# hard-coded "2021-22" while the MVP filter followed `season`).
# NOTE(review): assumes the repository holds files named
# "<season> Per Game.csv", "<season> Total.csv", etc. for the chosen season.
base_url = "https://raw.githubusercontent.com/steven-n-wilson/NBA-MVP/main/data"
url_per_game = f"{base_url}/{season}%20Per%20Game.csv"
url_total = f"{base_url}/{season}%20Total.csv"
url_advanced = f"{base_url}/{season}%20Advanced.csv"
url_standings = f"{base_url}/{season}%20Standings.csv"
url_mvps = f"{base_url}/MVPs.csv"  # one file covering all seasons

# Initializing dataframes
per_game = pd.read_csv(url_per_game)
total = pd.read_csv(url_total)
advanced = pd.read_csv(url_advanced)
standings = pd.read_csv(url_standings)
mvps = pd.read_csv(url_mvps)


### **Transform Data**

In [None]:
def _suffix_stat_columns(frame, suffix, shared_columns):
    """Return *frame* with *suffix* appended to each column not in *shared_columns*.

    One ``rename`` call with a mapping is used so the frame is copied once
    rather than once per renamed column.
    """
    renames = {col: col + suffix for col in frame.columns if col not in shared_columns}
    return frame.rename(columns=renames)


def transform_data(per_game, total, advanced, standings, mvps, target_season=None):
    """Merge the raw season tables into a single table with one row per player.

    Args:
        per_game: Per-game stats. Must contain 'Rk', 'Pos', 'Player' and 'Tm'.
        total: Season totals. Must contain 'Rk', 'Pos', 'Age', 'G', 'GS',
            'Player' and 'Tm'.
        advanced: Advanced stats. Must contain 'Rk', 'Pos', 'Age', 'G', 'MP',
            'Player' and 'Tm'; any 'Unnamed: N' columns are discarded.
        standings: Team standings. Must contain 'Team' (full team name) and
            'Record' (formatted 'wins-losses').
        mvps: MVP voting results. Must contain 'Player' and 'Season'.
        target_season: Value of mvps['Season'] to keep. Defaults to the
            module-level ``season`` setting when omitted.

    Returns:
        A new DataFrame (the inputs are not modified) in which:
        * stat columns carry a '_PERGAME', '_ADVANCED' or '_TOTAL' suffix,
        * 'PCT' holds wins / (wins + losses) of the player's team,
        * every missing value has been replaced by 0,
        * only the first row per player name is kept,
        * 'Tm', 'Team' and 'Record' are removed,
        * 'Surrogate_Key' (1..n) is the first column.
        The index is not reset after duplicate rows are dropped, so it may
        contain gaps.

    Raises:
        pandas.errors.MergeError: If a merge violates its ``validate`` rule
            (e.g. duplicated ('Player', 'Tm') pairs in a stat table).
    """
    if target_season is None:
        # Fall back to the notebook-level setting so existing calls keep working.
        target_season = season

    # Removing columns that would be duplicated across the stat tables
    per_game = per_game.drop(['Rk', 'Pos'], axis=1)
    total = total.drop(['Rk', 'Pos', 'Age', 'G', 'GS'], axis=1)
    advanced = advanced.drop(['Rk', 'Pos', 'Age', 'G', 'MP'], axis=1)
    # Also remove the blank columns ('Unnamed: N'), whatever position they are in
    blank_columns = [col for col in advanced.columns if str(col).startswith('Unnamed:')]
    advanced = advanced.drop(blank_columns, axis=1)

    # Columns shared between tables; everything else gets a table-specific suffix
    shared_columns = ('Player', 'Season', 'Pos', 'Age', 'Tm', 'G', 'GS')

    # Identifying and renaming table-specific columns
    per_game = _suffix_stat_columns(per_game, "_PERGAME", shared_columns)
    total = _suffix_stat_columns(total, "_TOTAL", shared_columns)
    advanced = _suffix_stat_columns(advanced, "_ADVANCED", shared_columns)

    # Merging dataframes (per_game, advanced, total)
    data = per_game.merge(advanced, on=['Player', 'Tm'], how='left', validate='1:1')
    data = data.merge(total, on=['Player', 'Tm'], how='left', validate='1:1')

    # Dictionary of NBA teams with their corresponding abbreviations
    dict_teams = {
        'Utah Jazz': 'UTA', 'Phoenix Suns': 'PHO', 'Philadelphia 76ers': 'PHI', 'Brooklyn Nets': 'BRK',
        'Denver Nuggets': 'DEN', 'Los Angeles Clippers': 'LAC', 'Milwaukee Bucks': 'MIL', 'Dallas Mavericks': 'DAL',
        'Los Angeles Lakers': 'LAL', 'Portland Trail Blazers': 'POR', 'Atlanta Hawks': 'ATL', 'New York Knicks': 'NYK',
        'Miami Heat': 'MIA', 'Golden State Warriors': 'GSW', 'Memphis Grizzlies': 'MEM', 'Boston Celtics': 'BOS',
        'Washington Wizards': 'WAS', 'Indiana Pacers': 'IND', 'Charlotte Hornets': 'CHO', 'Charlotte Bobcats': 'CHA',
        'San Antonio Spurs': 'SAS', 'Chicago Bulls': 'CHI', 'New Orleans Pelicans': 'NOP', 'Sacramento Kings': 'SAC',
        'Toronto Raptors': 'TOR', 'Minnesota Timberwolves': 'MIN', 'Cleveland Cavaliers': 'CLE',
        'Oklahoma City Thunder': 'OKC', 'Orlando Magic': 'ORL', 'Detroit Pistons': 'DET', 'Houston Rockets': 'HOU',
        'New Jersey Nets': 'NJN', 'New Orleans Hornets': 'NOH', 'Seattle SuperSonics': 'SEA',
    }

    # Creates a dataframe of team names ('Team') and abbreviations ('Tm')
    teams = pd.DataFrame(list(dict_teams.items()), columns=['Team', 'Tm'])

    # Merge dataframes (standings, teams); the result is a new frame, so the
    # caller's standings table is left untouched when 'PCT' is added below
    standings = standings.merge(teams, on='Team', how='left', validate='m:1')

    # 'Record' is 'wins-losses'; split it once and reuse both halves
    record = standings['Record'].str.split('-', expand=True)
    wins = record[0].astype(int)
    losses = record[1].astype(int)
    standings['PCT'] = wins / (wins + losses)  # share of games won during the season

    # Merge dataframes (data, standings)
    data = data.merge(standings, on=['Tm'], how='left', validate='m:1')

    # From the mvps dataframe, keep only the rows of the desired season
    mvps = mvps[mvps['Season'] == target_season]

    # NOTE(review): this merge runs before the player names are cleaned below,
    # so it only matches if mvps['Player'] uses the same raw format - confirm.
    # fillna(0) replaces every missing value in the whole frame with 0, i.e. not
    # only the MVP columns of players without votes but also 'Seed'/'PCT' of rows
    # whose 'Tm' has no standings entry and the 'Season' of unmatched players.
    data = data.merge(mvps, on=['Player'], how='left', validate='m:1').fillna(0)

    # Cleans the name formatting: Nikola Jokić\jokicni01 -> Nikola Jokić
    data['Player'] = data['Player'].str.split('\\', expand=True)[0]

    # Removes duplicate lines from injured players or transfers
    data = data.drop_duplicates(subset='Player', keep='first')

    # Filter to identify players who meet or exceed certain statistical thresholds that are
    # commonly associated with MVP-caliber performances

    # # ** TIGHT FILTER **
    #   data = data[
    #       (
    #           # Player must have played a significant portion of the season
    #           (data['G'] > 48) &
    #           # MVP candidates often lead their teams in scoring
    #           (data['PTS_PERGAME'] > 20) &
    #           # MVP candidates typically play significant minutes
    #           (data['MP_PERGAME'] > 30) &
    #           # MVP candidates often come from top-seeded teams
    #           (data['Seed'] <= 4) &
    #           # MVP candidates are often strong playmakers
    #           (data['AST_PERGAME'] > 7) &
    #           # MVP candidates are usually strong rebounders
    #           (data['TRB_PERGAME'] > 7) &
    #           # Players must have played for a specific team (not traded mid-season)
    #           (data['Tm'] != 'TOT') &
    #           # MVP candidates typically have a high field goal percentage
    #           (data['FG%_PERGAME'] > 0.37) &
    #           # MVP candidates often attempt a significant number of field goals
    #           (data['FGA_PERGAME'] > 10) &
    #           # MVP candidates typically have high Player Efficiency Ratings
    #           (data['PER_ADVANCED'] > 20)
    #       ) |
    #       # Players who received MVP votes share above 5% are considered strong candidates
    #       (data['MVP Votes Share'] > 0.05)
    #   ].reset_index(drop=True)

    # # ** LOOSE FILTER **
    #   data = data[
    #     (
    #         (data['G'] > 48) &
    #         (data['PTS_PERGAME'] > 13.5) &
    #         (data['MP_PERGAME'] > 30) &
    #         (data['Seed'] <= 16) &
    #         (data['AST_PERGAME'] > 1) &
    #         (data['TRB_PERGAME'] > 3) &
    #         (data['Tm'] != 'TOT') &
    #         (data['FG%_PERGAME'] > 0.37) &
    #         (data['FGA_PERGAME'] > 10) &
    #         (data['PER_ADVANCED'] > 18)
    #     ) | (data['MVP Votes Share'] > 0)
    # ].reset_index(drop=True)

    # Removes unnecessary team information
    data = data.drop(['Tm', 'Team', 'Record'], axis=1)

    # Generate sequential surrogate keys starting from 1 (assigned by position,
    # independent of the index) and move the column to the front
    data['Surrogate_Key'] = range(1, len(data) + 1)
    data.insert(0, 'Surrogate_Key', data.pop('Surrogate_Key'))

    return data

In [None]:
# Build the merged per-player table from the five raw dataframes
data = transform_data(
    per_game=per_game,
    total=total,
    advanced=advanced,
    standings=standings,
    mvps=mvps,
)

# Optional CSV export (index=False keeps the dataframe index out of the file)
# data.to_csv('data.csv', index=False)

# Preview the first ten rows
data.head(10)

Unnamed: 0,Surrogate_Key,Player,Age,G,GS,MP_PERGAME,FG_PERGAME,FGA_PERGAME,FG%_PERGAME,3P_PERGAME,...,STL_TOTAL,BLK_TOTAL,TOV_TOTAL,PF_TOTAL,PTS_TOTAL,Seed,PCT,MVP Rank,MVP Votes Share,Season
0,1,Precious Achiuwa,22,73,28,23.6,3.6,8.3,0.439,0.8,...,37,41,84,151,664,11.0,0.585366,0,0.0,0
1,2,Steven Adams,28,76,75,26.3,2.8,5.1,0.547,0.0,...,65,60,115,153,528,2.0,0.682927,0,0.0,0
2,3,Bam Adebayo,24,56,56,32.6,7.3,13.0,0.557,0.0,...,80,44,148,171,1068,4.0,0.646341,0,0.0,0
3,4,Santi Aldama,21,32,0,11.3,1.7,4.1,0.402,0.2,...,6,10,16,36,132,2.0,0.682927,0,0.0,0
4,5,LaMarcus Aldridge,36,47,12,22.3,5.4,9.7,0.55,0.3,...,14,47,44,78,607,14.0,0.536585,0,0.0,0
5,6,Nickeil Alexander-Walker,23,65,21,22.6,3.9,10.5,0.372,1.6,...,46,23,93,103,692,0.0,0.0,0,0.0,0
8,7,Grayson Allen,26,66,61,27.3,3.9,8.6,0.448,2.4,...,46,18,43,96,733,7.0,0.621951,0,0.0,0
9,8,Jarrett Allen,23,56,56,32.3,6.6,9.7,0.677,0.0,...,44,75,94,97,904,15.0,0.536585,0,0.0,0
10,9,Jose Alvarado,23,54,1,15.4,2.4,5.4,0.446,0.6,...,71,7,40,73,330,20.0,0.439024,0,0.0,0
11,10,Justin Anderson,28,16,6,19.8,2.3,5.9,0.379,0.9,...,8,6,8,22,102,0.0,0.0,0,0.0,0
