In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and echo the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for path in map(lambda name: os.path.join(folder, name), names):
        print(path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Import and look through the data

Import the data and see all the stats that are available.

In [None]:
# Load the MVP voting history and list the columns it provides.
mvp_csv_path='/kaggle/input/nba-mvp-votings-through-history/mvp_votings.csv'
nba=pd.read_csv(mvp_csv_path)
nba.columns

In [None]:
#lift the column cap so wide frames render every stat, then show the table
pd.options.display.max_columns=None
nba

In one big dataframe, all players receiving mvp votes are included by year.

# Annotate the data

Can use the .idxmax() method to get the index for the max value in a column. We will use this to create a yes/no column indicating who won the mvp.

In [None]:
#check methodology of getting index of player

#data for 2017-18 season
ex=nba[nba['season'].isin(['2017-18'])]

#get index of mvp winner: the row with the most mvp voting points
#(this check previously took the max of 'win_pct', which is not the column used
#to decide the winner when the 'Mvp?' column is built)
index=[ex['points_won'].idxmax()]

#now see who the player is by looking that index label up in the 'player' column
#.loc selects by label in one step instead of chaining ex['player'][index]
ex.loc[index,'player']

In [None]:
nba[nba['season'].isin(['2015-16']) & nba['player'].isin(['LeBron James'])].index[0]

In [None]:
#set whole column to 'No', then just change to 'Yes' for mvp winners
nba['Mvp?']='No'

#index label of the row with the most mvp points in every season
winner_index=nba.groupby('season')['points_won'].idxmax()

#.loc writes into nba itself; nba['Mvp?'][index]='Yes' is chained assignment, which
#raises SettingWithCopyWarning and leaves nba unchanged once Copy-on-Write is enabled
nba.loc[winner_index.tolist(),'Mvp?']='Yes'

In [None]:
nba

In [None]:
#relocate the 'Mvp?' flag so it sits beside the mvp voting columns

#pop() hands back the column and removes it from the frame; insert() puts it back at position 10
mvp_flags=nba.pop('Mvp?')
nba.insert(10, 'Mvp?', mvp_flags)
nba

Don't have to do this or understand all those steps, I just wanted to make it easier to see who won the mvp each season.

In [None]:
nba['Mvp?'].value_counts()

In [None]:
len(nba['season'].value_counts())

Verify that we have an mvp for each season.

# Trying out the first model

This first model will model each season separately, hence the for loop going through each individual season. I selected what I initially thought would be the most relevant criteria for determining who wins the mvp.

If the machine learning code is confusing, then I suggest you go to the building a machine learning model notebook and/or the interpreting a machine learning model notebook.

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor

#stats fed to the model; identical for every season, so defined once outside the loop
features=['per', 'ts_pct', 'usg_pct', 'g', 'mp_per_g', 'pts_per_g', 'trb_per_g',
       'ast_per_g', 'stl_per_g', 'blk_per_g', 'fg_pct', 'fg3_pct', 'ft_pct',
       'ws', 'ws_per_48','win_pct']

#one frame of validation rows + predictions per season, combined once at the end
season_frames=[]

#create model for each season
for season in nba['season'].value_counts().index:

    #isolate season data
    season_df=nba[nba['season'].isin([season])]
    X=season_df[features]
    y=season_df['award_share']
    train_X, val_X, train_y, val_y = train_test_split(X, y,random_state=1, test_size=0.4)
    basic_model = DecisionTreeRegressor(random_state=1)
    basic_model.fit(train_X, train_y)
    predictions=basic_model.predict(val_X)

    #work on a copy so the extra columns are not written into val_X
    #(plain val_Xdf=val_X is only a second name for the same frame)
    val_Xdf=val_X.copy()

    #predictions, the correct values and the season label
    val_Xdf['Prediction']=predictions
    val_Xdf['award_share']=val_y
    val_Xdf['season']=season

    #val_Xdf keeps the row labels of 'nba', so .loc pulls each player's name and
    #mvp flag by label without looping over the index
    val_Xdf['player']=season_df.loc[val_Xdf.index,'player']
    val_Xdf['Mvp?']=season_df.loc[val_Xdf.index,'Mvp?']

    season_frames.append(val_Xdf)

#DataFrame.append was removed in pandas 2.0; concatenate all seasons in one call instead
predicted_df=pd.concat(season_frames) if season_frames else pd.DataFrame()

In [None]:
predicted_df

Here you can see all of the factors influencing the model, the actual award share, the predicted award share, and the column indicating whether or not the player won the mvp for that year. 

In [None]:
predicted_df[predicted_df['season'].isin(['2017-18'])]

When I isolate one of the years, I notice that I only see 6 players from that year. This is problematic because we should not test the model on a subset of mvp candidates. This is due to how the data was split into training and testing data. Scikit-learn will randomly select testing data without considering what year each testing point is from. We need to manually split the data to ensure that each year is kept intact.

In [None]:
features=['per', 'ts_pct', 'usg_pct', 'bpm', 'g', 'mp_per_g', 'pts_per_g', 'trb_per_g',
       'ast_per_g', 'stl_per_g', 'blk_per_g', 'fg_pct', 'fg3_pct', 'ft_pct',
       'ws', 'ws_per_48','win_pct']

#split by whole seasons so every mvp race stays together:
#the 30 seasons listed first are for training, the 8 listed after are held out for testing
training_seasons=['1980-81', '1981-82', '1984-85', '1982-83', '1998-99', '1996-97',
       '1990-91', '1997-98', '1988-89', '2001-02', '1985-86', '2000-01',
       '2007-08', '1991-92', '1993-94', '2006-07', '1986-87', '1995-96',
       '1987-88', '2013-14', '1999-00', '2012-13', '2004-05', '2003-04',
       '1994-95', '2011-12', '2009-10', '1983-84', '1989-90', '1992-93']
testing_seasons=['2017-18', '2010-11', '2002-03', '2014-15', '2008-09', '2005-06',
       '2016-17', '2015-16']

#training data
training_data=nba.loc[nba['season'].isin(training_seasons)]
train_X, train_y = training_data[features], training_data['award_share']

#testing data
testing_data=nba.loc[nba['season'].isin(testing_seasons)]
val_X, val_y = testing_data[features], testing_data['award_share']

#fit() returns the fitted estimator, so construction and fitting chain into one statement
basic_model = DecisionTreeRegressor(random_state=1).fit(train_X, train_y)
predictions=basic_model.predict(val_X)

Manually splitting the data allows for full seasons to be kept intact.

In [None]:
#put testing data and predictions into new dataframe
#copy val_X first: pd.DataFrame(val_X) shares its data, so new columns would land in val_X too
val_Xdf=val_X.copy()
val_Xdf['Prediction']=predictions
val_Xdf['award_share']=val_y

#val_X keeps the row labels of 'nba', so .loc lines these columns up by label
val_Xdf['season']=nba.loc[val_Xdf.index,'season']
val_Xdf['player']=nba.loc[val_Xdf.index,'player']
val_Xdf['Mvp?']=nba.loc[val_Xdf.index,'Mvp?']

#DataFrame.append was removed in pandas 2.0; an independent copy gives the same result
predicted_df=val_Xdf.copy()

In [None]:
predicted_df

Add who we think would've won the mvp based on our predicted numbers. This is the same methodology as adding the 'Mvp?' column earlier in the notebook.

In [None]:
#create column indicating which player the model predicts as mvp
predicted_df['Mvp prediction']='No'

#index label of the row with the highest predicted award share in every season
predicted_winner_index=predicted_df.groupby('season')['Prediction'].idxmax()

#only the predicted winner changes; everyone else stays 'No'
#.loc writes into predicted_df itself, whereas predicted_df['Mvp prediction'][index]='Yes'
#is chained assignment and is not guaranteed to update the frame
predicted_df.loc[predicted_winner_index.tolist(),'Mvp prediction']='Yes'

In [None]:
predicted_df

# How to evaluate the model

Let's find a season where we incorrectly predicted the mvp. Just by seeing the preview of the dataframe and knowing that Tim Duncan won the 2002-03 mvp, let's see what went wrong in that year's prediction.

In [None]:
predicted_df[predicted_df['season'].isin(['2002-03'])]

Compare Dirk Nowitzki to the true mvp, Tim Duncan and see if the model's prediction is realistic (Dirk actually finished 7th in mvp voting...)

In [None]:
predicted_df[predicted_df['season'].isin(['2002-03']) & predicted_df['player'].isin(['Tim Duncan','Dirk Nowitzki'])]

This is interesting because Dirk actually has some impressive stats when you look at the data used for the model.

In [None]:
#one sub-dataframe per test season, in value_counts() order
season_order=predicted_df['season'].value_counts().index
predicted_list=[predicted_df.loc[predicted_df['season'].isin([season])] for season in season_order]

In [None]:
#a season counts as correct when its 'Mvp?' column equals its 'Mvp prediction' column row for row
matches=[df['Mvp?'].equals(df['Mvp prediction']) for df in predicted_list]
total=sum(matches)
total/len(predicted_list)

Only got half of the predictions right- let's look at the ones we got wrong and the ones we got right.

In [None]:
#sort each test season into right/wrong depending on whether the predicted flags match the real ones
wrong_seasons=[]
right_seasons=[]
for df in predicted_list:
    #every row of df belongs to the same season, so the first row names it
    season_label=df['season'].iloc[0]
    bucket=right_seasons if df['Mvp?'].equals(df['Mvp prediction']) else wrong_seasons
    bucket.append(season_label)

In [None]:
wrong_seasons

In [None]:
predicted_df[predicted_df['season'].isin(['2017-18'])]

This one was way off because James Harden was almost the unanimous winner, but was predicted to have just a 7% award share.

In [None]:
predicted_df[predicted_df['season'].isin(['2010-11'])]

In [None]:
predicted_df[predicted_df['season'].isin(['2005-06'])]

The actual mvp winners are getting very small award share predictions. Even if an algorithm doesn't think they'll win mvp, they shouldn't finish 8th or barely get an award share.

In [None]:
right_seasons

In [None]:
predicted_df[predicted_df['season'].isin(['2015-16'])]

Let's try some new models. The criteria I used for that above model included all of the stats that I deemed relevant or most important. These are stats I commonly see cited when experts debate the mvp race. I'm going to create a new model that is all about team success- how your stats correlate with winning.

In [None]:
nba.columns

In [None]:
features=['bpm', 'g', 'mp_per_g','ws', 'ws_per_48','win_pct']
#split by whole seasons so every mvp race stays together:
#the 30 seasons listed first are for training, the 8 listed after are held out for testing
training_seasons=['1980-81', '1981-82', '1984-85', '1982-83', '1998-99', '1996-97',
       '1990-91', '1997-98', '1988-89', '2001-02', '1985-86', '2000-01',
       '2007-08', '1991-92', '1993-94', '2006-07', '1986-87', '1995-96',
       '1987-88', '2013-14', '1999-00', '2012-13', '2004-05', '2003-04',
       '1994-95', '2011-12', '2009-10', '1983-84', '1989-90', '1992-93']
testing_seasons=['2017-18', '2010-11', '2002-03', '2014-15', '2008-09', '2005-06',
       '2016-17', '2015-16']

#boolean row masks, computed once and reused for both the features and the target
in_training=nba['season'].isin(training_seasons)
in_testing=nba['season'].isin(testing_seasons)
train_X, train_y = nba.loc[in_training, features], nba.loc[in_training, 'award_share']
val_X, val_y = nba.loc[in_testing, features], nba.loc[in_testing, 'award_share']

#fit() returns the fitted estimator, so construction and fitting chain into one statement
basic_model = DecisionTreeRegressor(random_state=1).fit(train_X, train_y)
predictions=basic_model.predict(val_X)

In [None]:
#put testing data and predictions into new dataframe
#copy val_X first: pd.DataFrame(val_X) shares its data, so new columns would land in val_X too
val_Xdf=val_X.copy()
val_Xdf['Prediction']=predictions
val_Xdf['award_share']=val_y

#val_X keeps the row labels of 'nba', so .loc lines these columns up by label
val_Xdf['season']=nba.loc[val_Xdf.index,'season']
val_Xdf['player']=nba.loc[val_Xdf.index,'player']
val_Xdf['Mvp?']=nba.loc[val_Xdf.index,'Mvp?']

#DataFrame.append was removed in pandas 2.0; an independent copy gives the same result
predicted_df=val_Xdf.copy()

In [None]:
#create column indicating which player the model predicts as mvp
#everyone starts at 'No'; only each season's top prediction is switched to 'Yes'
predicted_df['Mvp prediction']='No'

#index label of the row with the highest predicted award share in every season
predicted_winner_index=predicted_df.groupby('season')['Prediction'].idxmax()

#one label-based write replaces the nested player loop and its chained assignments,
#which are not guaranteed to update predicted_df
predicted_df.loc[predicted_winner_index.tolist(),'Mvp prediction']='Yes'

In [None]:
#one sub-dataframe per test season, in value_counts() order
season_order=predicted_df['season'].value_counts().index
predicted_list=[predicted_df.loc[predicted_df['season'].isin([season])] for season in season_order]

In [None]:
#sort each test season into right/wrong depending on whether the predicted flags match the real ones
wrong_seasons=[]
right_seasons=[]
for df in predicted_list:
    #every row of df belongs to the same season, so the first row names it
    season_label=df['season'].iloc[0]
    bucket=right_seasons if df['Mvp?'].equals(df['Mvp prediction']) else wrong_seasons
    bucket.append(season_label)

In [None]:
wrong_seasons

Only one of these was also predicted wrong in the last model. 5/8 is still better than last time though.

In [None]:
predicted_df[predicted_df['season'].isin(['2005-06'])]

New model that is purely based on player output- team success not taken into account.

In [None]:
nba.columns

In [None]:
features=['fga', 'fg3a', 'fta', 'per', 'ts_pct', 'usg_pct',
       'g', 'mp_per_g', 'pts_per_g', 'trb_per_g',
       'ast_per_g', 'stl_per_g', 'blk_per_g', 'fg_pct', 'fg3_pct', 'ft_pct']
#split by whole seasons so every mvp race stays together:
#the 30 seasons listed first are for training, the 8 listed after are held out for testing
training_seasons=['1980-81', '1981-82', '1984-85', '1982-83', '1998-99', '1996-97',
       '1990-91', '1997-98', '1988-89', '2001-02', '1985-86', '2000-01',
       '2007-08', '1991-92', '1993-94', '2006-07', '1986-87', '1995-96',
       '1987-88', '2013-14', '1999-00', '2012-13', '2004-05', '2003-04',
       '1994-95', '2011-12', '2009-10', '1983-84', '1989-90', '1992-93']
testing_seasons=['2017-18', '2010-11', '2002-03', '2014-15', '2008-09', '2005-06',
       '2016-17', '2015-16']

#boolean row masks, computed once and reused for both the features and the target
in_training=nba['season'].isin(training_seasons)
in_testing=nba['season'].isin(testing_seasons)
train_X, train_y = nba.loc[in_training, features], nba.loc[in_training, 'award_share']
val_X, val_y = nba.loc[in_testing, features], nba.loc[in_testing, 'award_share']

#fit() returns the fitted estimator, so construction and fitting chain into one statement
basic_model = DecisionTreeRegressor(random_state=1).fit(train_X, train_y)
predictions=basic_model.predict(val_X)

In [None]:
#put testing data and predictions into new dataframe
#copy val_X first: pd.DataFrame(val_X) shares its data, so new columns would land in val_X too
val_Xdf=val_X.copy()
val_Xdf['Prediction']=predictions
val_Xdf['award_share']=val_y

#val_X keeps the row labels of 'nba', so .loc lines these columns up by label
val_Xdf['season']=nba.loc[val_Xdf.index,'season']
val_Xdf['player']=nba.loc[val_Xdf.index,'player']
val_Xdf['Mvp?']=nba.loc[val_Xdf.index,'Mvp?']

#DataFrame.append was removed in pandas 2.0; an independent copy gives the same result
predicted_df=val_Xdf.copy()

In [None]:
#create column indicating which player the model predicts as mvp
#everyone starts at 'No'; only each season's top prediction is switched to 'Yes'
predicted_df['Mvp prediction']='No'

#index label of the row with the highest predicted award share in every season
predicted_winner_index=predicted_df.groupby('season')['Prediction'].idxmax()

#one label-based write replaces the nested player loop and its chained assignments,
#which are not guaranteed to update predicted_df
predicted_df.loc[predicted_winner_index.tolist(),'Mvp prediction']='Yes'

In [None]:
#one sub-dataframe per test season, in value_counts() order
season_order=predicted_df['season'].value_counts().index
predicted_list=[predicted_df.loc[predicted_df['season'].isin([season])] for season in season_order]

In [None]:
#sort each test season into right/wrong depending on whether the predicted flags match the real ones
wrong_seasons=[]
right_seasons=[]
for df in predicted_list:
    #every row of df belongs to the same season, so the first row names it
    season_label=df['season'].iloc[0]
    bucket=right_seasons if df['Mvp?'].equals(df['Mvp prediction']) else wrong_seasons
    bucket.append(season_label)

In [None]:
wrong_seasons

In [None]:
predicted_df[predicted_df['season'].isin(['2005-06'])]

What if instead of determining award share as a number, we just use the metric of whether or not they won the mvp? The 'Mvp?' column is currently filled with yes/no strings, but I could replace it with true/false booleans that will allow for a model to be created (can't use strings in this ML model).

In [None]:
#turn the 'Yes'/'No' strings into booleans in one vectorised step
#(the row loop used chained assignment, which is not guaranteed to update nba)
#True is accepted alongside 'Yes' so re-running this cell keeps the winners instead of
#turning every row False
nba['Mvp?']=nba['Mvp?'].isin(['Yes',True])
nba['Mvp?']

In [None]:
nba['Mvp?'].value_counts()

Let's try this out on our most successful model, which was based mainly on team success.

In [None]:
nba[nba['season'].isin(['1980-81'])]

In [None]:
features=['bpm', 'g', 'mp_per_g','ws', 'ws_per_48','win_pct']
#split by whole seasons so every mvp race stays together:
#the 30 seasons listed first are for training, the 8 listed after are held out for testing
training_seasons=['1980-81', '1981-82', '1984-85', '1982-83', '1998-99', '1996-97',
       '1990-91', '1997-98', '1988-89', '2001-02', '1985-86', '2000-01',
       '2007-08', '1991-92', '1993-94', '2006-07', '1986-87', '1995-96',
       '1987-88', '2013-14', '1999-00', '2012-13', '2004-05', '2003-04',
       '1994-95', '2011-12', '2009-10', '1983-84', '1989-90', '1992-93']
testing_seasons=['2017-18', '2010-11', '2002-03', '2014-15', '2008-09', '2005-06',
       '2016-17', '2015-16']

#boolean row masks, computed once and reused for both the features and the target
in_training=nba['season'].isin(training_seasons)
in_testing=nba['season'].isin(testing_seasons)
#the target is now the 'Mvp?' flag instead of the award share
train_X, train_y = nba.loc[in_training, features], nba.loc[in_training, 'Mvp?']
val_X, val_y = nba.loc[in_testing, features], nba.loc[in_testing, 'Mvp?']

#fit() returns the fitted estimator, so construction and fitting chain into one statement
basic_model = DecisionTreeRegressor(random_state=1).fit(train_X, train_y)
predictions=basic_model.predict(val_X)

In [None]:
#put testing data and predictions into new dataframe
#copy val_X first: pd.DataFrame(val_X) shares its data, so new columns would land in val_X too
val_Xdf=val_X.copy()
val_Xdf['Prediction']=predictions

#the true 'Mvp?' flag is the target itself, so it comes straight from val_y
val_Xdf['Mvp?']=val_y

#val_X keeps the row labels of 'nba', so .loc lines these columns up by label
val_Xdf['season']=nba.loc[val_Xdf.index,'season']
val_Xdf['player']=nba.loc[val_Xdf.index,'player']

#DataFrame.append was removed in pandas 2.0; an independent copy gives the same result
predicted_df=val_Xdf.copy()

In [None]:
predicted_df

In [None]:
#create column indicating whether the model predicted the player to win the mvp
#'Prediction' holds the model's numeric output; a row is a predicted winner only when
#it compares equal to True, i.e. is exactly 1
#one vectorised write replaces the row loop and its chained assignments, which are not
#guaranteed to update predicted_df
predicted_df['Mvp prediction']=(predicted_df['Prediction']==True).map({True:'Yes',False:'No'})

In [None]:
predicted_df

For this model, any given year is not constrained to one mvp winner.

In [None]:
predicted_df['Mvp prediction'].value_counts()

For 8 years, there are 9 predicted winners. This is not far off. Let's see how many years I got correct.

In [None]:
#one sub-dataframe per test season, in value_counts() order
season_order=predicted_df['season'].value_counts().index
predicted_list=[predicted_df.loc[predicted_df['season'].isin([season])] for season in season_order]

In [None]:
#'Mvp?' holds booleans and 'Prediction' holds numbers, so the columns are compared
#value by value rather than with .equals()
#a season is wrong as soon as any of its rows disagree; each season is recorded once
#(appending per mismatching row listed the same season several times and never
#filled right_seasons)
wrong_seasons=[]
right_seasons=[]
for df in predicted_list:
    #every row of df belongs to the same season, so the first row names it
    season_label=df['season'].iloc[0]
    if (df['Mvp?']!=df['Prediction']).any():
        wrong_seasons.append(season_label)
    else:
        right_seasons.append(season_label)

In [None]:
wrong_seasons

5/8 were incorrect

In [None]:
predicted_df[predicted_df['season'].isin(['2015-16'])]

Let's try it with the player output method.

In [None]:
features=['fga', 'fg3a', 'fta', 'per', 'ts_pct', 'usg_pct',
       'g', 'mp_per_g', 'pts_per_g', 'trb_per_g',
       'ast_per_g', 'stl_per_g', 'blk_per_g', 'fg_pct', 'fg3_pct', 'ft_pct']
#same whole-season split as before: 30 training seasons, 8 held-out testing seasons
training_seasons=['1980-81', '1981-82', '1984-85', '1982-83', '1998-99', '1996-97',
       '1990-91', '1997-98', '1988-89', '2001-02', '1985-86', '2000-01',
       '2007-08', '1991-92', '1993-94', '2006-07', '1986-87', '1995-96',
       '1987-88', '2013-14', '1999-00', '2012-13', '2004-05', '2003-04',
       '1994-95', '2011-12', '2009-10', '1983-84', '1989-90', '1992-93']
testing_seasons=['2017-18', '2010-11', '2002-03', '2014-15', '2008-09', '2005-06',
       '2016-17', '2015-16']

#boolean row masks, computed once and reused for both the features and the target
in_training=nba['season'].isin(training_seasons)
in_testing=nba['season'].isin(testing_seasons)
#the target is the 'Mvp?' flag
train_X, train_y = nba.loc[in_training, features], nba.loc[in_training, 'Mvp?']
val_X, val_y = nba.loc[in_testing, features], nba.loc[in_testing, 'Mvp?']

#fit() returns the fitted estimator, so construction and fitting chain into one statement
basic_model = DecisionTreeRegressor(random_state=1).fit(train_X, train_y)
predictions=basic_model.predict(val_X)

In [None]:
#put testing data and predictions into new dataframe
#copy val_X first: pd.DataFrame(val_X) shares its data, so new columns would land in val_X too
val_Xdf=val_X.copy()
val_Xdf['Prediction']=predictions

#the true 'Mvp?' flag is the target itself, so it comes straight from val_y
val_Xdf['Mvp?']=val_y

#val_X keeps the row labels of 'nba', so .loc lines these columns up by label
val_Xdf['season']=nba.loc[val_Xdf.index,'season']
val_Xdf['player']=nba.loc[val_Xdf.index,'player']

#DataFrame.append was removed in pandas 2.0; an independent copy gives the same result
predicted_df=val_Xdf.copy()

In [None]:
#create column indicating whether the model predicted the player to win the mvp
#'Prediction' holds the model's numeric output; a row is a predicted winner only when
#it compares equal to True, i.e. is exactly 1
#one vectorised write replaces the row loop and its chained assignments, which are not
#guaranteed to update predicted_df
predicted_df['Mvp prediction']=(predicted_df['Prediction']==True).map({True:'Yes',False:'No'})

In [None]:
#one sub-dataframe per test season, in value_counts() order
season_order=predicted_df['season'].value_counts().index
predicted_list=[predicted_df.loc[predicted_df['season'].isin([season])] for season in season_order]

In [None]:
#'Mvp?' holds booleans and 'Prediction' holds numbers, so the columns are compared
#value by value rather than with .equals()
#a season is wrong as soon as any of its rows disagree; each season is recorded once
#right_seasons is rebuilt here too so it no longer carries results from an earlier model
wrong_seasons=[]
right_seasons=[]
for df in predicted_list:
    #every row of df belongs to the same season, so the first row names it
    season_label=df['season'].iloc[0]
    if (df['Mvp?']!=df['Prediction']).any():
        wrong_seasons.append(season_label)
    else:
        right_seasons.append(season_label)

In [None]:
wrong_seasons

Only 1/8 predicted correctly.

In [None]:
predicted_df[predicted_df['season'].isin(['2002-03'])]

Let's try one more metric- first place votes. This should weed out the lower tier players from having much influence on the algorithm. Usually only three or four players will get any first place votes in a given year.

Create some methods so that it's easier to mimic these models.

In [None]:
def predict_model(features,metric,training_seasons=None,testing_seasons=None):
    """Fit a decision tree on the training seasons and predict `metric` for the testing seasons.

    Reads the notebook-level `nba` dataframe.

    Parameters
    ----------
    features : list of str
        Columns of `nba` used as model inputs.
    metric : str
        Column of `nba` the model is trained to predict.
    training_seasons, testing_seasons : list of str, optional
        Season labels for each side of the split. When omitted, the same
        30 training / 8 testing seasons used throughout the notebook apply.

    Returns
    -------
    numpy.ndarray
        One prediction per testing-season row, in the row order of `nba`.
    """
    if training_seasons is None:
        training_seasons=['1980-81', '1981-82', '1984-85', '1982-83', '1998-99', '1996-97',
               '1990-91', '1997-98', '1988-89', '2001-02', '1985-86', '2000-01',
               '2007-08', '1991-92', '1993-94', '2006-07', '1986-87', '1995-96',
               '1987-88', '2013-14', '1999-00', '2012-13', '2004-05', '2003-04',
               '1994-95', '2011-12', '2009-10', '1983-84', '1989-90', '1992-93']
    if testing_seasons is None:
        testing_seasons=['2017-18', '2010-11', '2002-03', '2014-15', '2008-09', '2005-06',
               '2016-17', '2015-16']

    #row masks computed once; the held-out target is not needed to make predictions
    in_training=nba['season'].isin(training_seasons)
    in_testing=nba['season'].isin(testing_seasons)
    train_X=nba.loc[in_training,features]
    train_y=nba.loc[in_training,metric]
    val_X=nba.loc[in_testing,features]

    basic_model = DecisionTreeRegressor(random_state=1)
    basic_model.fit(train_X, train_y)
    return basic_model.predict(val_X)

In [None]:
def get_val_X(features,seasons=None):
    """Return the `features` columns of `nba` for the held-out seasons.

    Parameters
    ----------
    features : list of str
        Columns of the notebook-level `nba` dataframe to return.
    seasons : list of str, optional
        Season labels to keep. Defaults to the notebook-level `testing_seasons`
        list, which must therefore be defined by an earlier cell and should match
        the seasons used to produce the predictions.
    """
    if seasons is None:
        seasons=testing_seasons
    return nba.loc[nba['season'].isin(seasons),features]

In [None]:
def get_val_y(metric,seasons=None):
    """Return the `metric` column of `nba` for the held-out seasons.

    Parameters
    ----------
    metric : str
        Column of the notebook-level `nba` dataframe to return.
    seasons : list of str, optional
        Season labels to keep. Defaults to the notebook-level `testing_seasons`
        list, which must therefore be defined by an earlier cell.
    """
    if seasons is None:
        seasons=testing_seasons
    return nba.loc[nba['season'].isin(seasons),metric]

In [None]:
def get_df(predictions,val_X,val_y):
    """Combine held-out features with predictions and identifying columns.

    Parameters
    ----------
    predictions : array-like
        One predicted value per row of `val_X`.
    val_X : DataFrame
        Held-out feature rows; its index labels must be row labels of the
        notebook-level `nba` dataframe.
    val_y : any
        Not used; kept so existing calls keep working.

    Returns
    -------
    DataFrame
        A new frame with the columns of `val_X` followed by 'Prediction',
        'Mvp?', 'season' and 'player'. `val_X` itself is left unchanged.
    """
    #copy so the extra columns are not written into the caller's frame
    result=pd.DataFrame(val_X).copy()
    result['Prediction']=predictions

    #label-based lookup lines each column up with the matching row of nba
    for column in ('Mvp?','season','player'):
        result[column]=nba.loc[result.index,column]

    #DataFrame.append was removed in pandas 2.0, so the frame is returned directly
    return result

In [None]:
def create_list(df):
    """Split df into one sub-dataframe per season, ordered as df['season'].value_counts() orders them."""
    season_order=df['season'].value_counts().index
    return [df[df['season'].isin([season])] for season in season_order]

In [None]:
#team-success stats again, this time predicting first place votes
features=['bpm', 'g', 'mp_per_g','ws', 'ws_per_48','win_pct']
target='votes_first'
predictions=predict_model(features,target)
p=get_df(predictions,get_val_X(features),get_val_y(target))
p

In [None]:
#start with every player at 'No', then flag one predicted winner per season
p['Mvp prediction']='No'

#index label of the player with the most predicted first place votes in every season
winner_index=p.groupby('season')['Prediction'].idxmax()

#.loc writes into p itself, whereas p['Mvp prediction'][winner]='Yes' is chained
#assignment and is not guaranteed to update the frame
p.loc[winner_index.tolist(),'Mvp prediction']='Yes'

In [None]:
p

In [None]:
#per-season frames get their own name; calling the variable 'list' would hide the
#built-in list type for the rest of the notebook
season_frames=create_list(p)
wrong_seasons=[]
right_seasons=[]
for df in season_frames:
    #the real winner is the row flagged True in 'Mvp?', the predicted one is flagged 'Yes'
    actual_winner=df.loc[df['Mvp?'].isin([True]),'player'].iloc[0]
    predicted_winner=df.loc[df['Mvp prediction'].isin(['Yes']),'player'].iloc[0]

    #every row of df belongs to the same season, so the first row names it
    season_label=df['season'].iloc[0]
    if actual_winner==predicted_winner:
        right_seasons.append(season_label)
    else:
        wrong_seasons.append(season_label)

In [None]:
wrong_seasons

4/8 correct. let's try this with the player output strategy.

In [None]:
def get_season(df,season):
    """Return the rows of df whose 'season' value matches the given season label."""
    in_season=df['season'].isin([season])
    return df[in_season]

In [None]:
#player-output stats, predicting first place votes
features=['fga', 'fg3a', 'fta', 'per', 'ts_pct', 'usg_pct',
       'g', 'mp_per_g', 'pts_per_g', 'trb_per_g',
       'ast_per_g', 'stl_per_g', 'blk_per_g', 'fg_pct', 'fg3_pct', 'ft_pct']
predictions=predict_model(features,'votes_first')
p=get_df(predictions,get_val_X(features),get_val_y('votes_first'))

#flag the player with the most predicted first place votes in every season
#.loc writes into p itself; the chained p['Mvp prediction'][winner]='Yes' is not
#guaranteed to update the frame
p['Mvp prediction']='No'
winner_index=p.groupby('season')['Prediction'].idxmax()
p.loc[winner_index.tolist(),'Mvp prediction']='Yes'

#per-season frames get their own name so the built-in list type is not hidden
season_frames=create_list(p)
wrong_seasons=[]
right_seasons=[]
for df in season_frames:
    #the real winner is the row flagged True in 'Mvp?', the predicted one is flagged 'Yes'
    actual_winner=df.loc[df['Mvp?'].isin([True]),'player'].iloc[0]
    predicted_winner=df.loc[df['Mvp prediction'].isin(['Yes']),'player'].iloc[0]

    #every row of df belongs to the same season, so the first row names it
    season_label=df['season'].iloc[0]
    if actual_winner==predicted_winner:
        right_seasons.append(season_label)
    else:
        wrong_seasons.append(season_label)

In [None]:
wrong_seasons

5/8 correct.