Exploration of How Social Media Can Predict Winning Metrics Better Than Salary

In [None]:
import pandas as pd
import statsmodels.api as sm
import numpy as np
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
color = sns.color_palette()
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))
%matplotlib inline

In [None]:
attendance_valuation_elo_df = pd.read_csv("../input/nba_2017_att_val_elo.csv");attendance_valuation_elo_df.head()

In [None]:
salary_df = pd.read_csv("../input/nba_2017_salary.csv");salary_df.head()


In [None]:
pie_df = pd.read_csv("../input/nba_2017_pie.csv");pie_df.head()

In [None]:
plus_minus_df = pd.read_csv("../input/nba_2017_real_plus_minus.csv");plus_minus_df.head()

In [None]:
br_stats_df = pd.read_csv("../input/nba_2017_br.csv");br_stats_df.head()

In [None]:

# Align the real-plus-minus table with the other sources: NAME -> PLAYER, WINS -> WINS_RPM.
plus_minus_df.rename(columns={"NAME":"PLAYER", "WINS": "WINS_RPM"}, inplace=True)
# Keep only the text before the first comma of each PLAYER value.
# The vectorised split tolerates values with no comma or with several commas, where
# tuple-unpacking the split result would raise ValueError; this also makes the cell
# safe to re-run after the names have already been cleaned.
# pop() followed by re-assignment leaves PLAYER as the last column.
players = plus_minus_df.pop("PLAYER").str.split(",").str[0]
plus_minus_df["PLAYER"] = players
plus_minus_df.head()

In [None]:

# Build the player table from br_stats_df without touching the original frame:
# standardise column names, drop G / GS / TEAM, then keep only players that also
# appear in plus_minus_df (inner join on PLAYER).
nba_players_df = (
    br_stats_df
    .rename(columns={'Player': 'PLAYER','Pos':'POSITION', 'Tm': "TEAM", 'Age': 'AGE', "PS/G": "POINTS"})
    .drop(["G", "GS", "TEAM"], axis=1)
    .merge(plus_minus_df, how="inner", on="PLAYER")
)
nba_players_df.head()

In [None]:

# Take an independent copy of the four PIE columns needed downstream.
pie_df_subset = pie_df.loc[:, ["PLAYER", "PIE", "PACE", "W"]].copy()
# Inner join: only players present in both tables survive.
nba_players_df = pd.merge(nba_players_df, pie_df_subset, how="inner", on="PLAYER")
nba_players_df.head()

In [None]:
# Use the same player key as the other tables.
salary_df.rename(columns={'NAME': 'PLAYER'}, inplace=True)
# Express salary in millions, rounded to two decimals.
salary_df["SALARY_MILLIONS"] = (salary_df["SALARY"] / 1000000).round(2)
# The raw salary and the position/team columns are not carried forward.
salary_df.drop(labels=["POSITION","TEAM", "SALARY"], axis=1, inplace=True)
salary_df.head()

In [None]:
diff = list(set(nba_players_df["PLAYER"].values.tolist()) - set(salary_df["PLAYER"].values.tolist()))

In [None]:
len(diff)


In [None]:
# Default merge: inner join on every column name the two frames share.
nba_players_with_salary_df = pd.merge(nba_players_df, salary_df)

In [None]:
# Correlation heatmap of the numeric player statistics plus salary.
# Use the axes returned by plt.subplots() directly rather than calling plt.axes(),
# which can add a second axes on top of the one just created.
fig, ax = plt.subplots(figsize=(20,15))
ax.set_title("NBA Player Correlation Heatmap:  2016-2017 Season (STATS & SALARY)")
# Restrict to numeric columns explicitly: the frame also holds text columns such as
# PLAYER, and DataFrame.corr() on pandas >= 2.0 raises on those instead of skipping them.
corr = nba_players_with_salary_df.select_dtypes(include=[np.number]).corr()
sns.heatmap(corr,
            xticklabels=corr.columns.values,
            yticklabels=corr.columns.values,
            ax=ax)

In [None]:
sns.lmplot(x="SALARY_MILLIONS", y="WINS_RPM", data=nba_players_with_salary_df)


In [None]:
results = smf.ols('W ~POINTS', data=nba_players_with_salary_df).fit()


In [None]:
print(results.summary())


In [None]:
results = smf.ols('W ~WINS_RPM', data=nba_players_with_salary_df).fit()


In [None]:
print(results.summary())


In [None]:
results = smf.ols('SALARY_MILLIONS ~POINTS', data=nba_players_with_salary_df).fit()


In [None]:
print(results.summary())


In [None]:
results = smf.ols('SALARY_MILLIONS ~WINS_RPM', data=nba_players_with_salary_df).fit()


In [None]:
print(results.summary())


In [None]:
from ggplot import *


In [None]:

# Points per game against RPM wins, coloured by salary.
p = (
    ggplot(nba_players_with_salary_df, aes(x="POINTS", y="WINS_RPM", color="SALARY_MILLIONS"))
    + geom_point(size=200)
)
# The labelled plot is the cell's last expression, so it is what gets rendered.
(
    p
    + xlab("POINTS/GAME")
    + ylab("WINS/RPM")
    + ggtitle("NBA Players 2016-2017:  POINTS/GAME, WINS REAL PLUS MINUS and SALARY")
)

In [None]:
# Read nba_2017_player_wikipedia.csv and preview its first rows.
wiki_df = pd.read_csv("../input/nba_2017_player_wikipedia.csv")
wiki_df.head()


In [None]:
wiki_df.rename(columns={'names': 'PLAYER', "pageviews": "PAGEVIEWS"}, inplace=True)


In [None]:
median_wiki_df = wiki_df.groupby("PLAYER").median()


In [None]:

median_wiki_df_small = median_wiki_df[["PAGEVIEWS"]]

In [None]:
median_wiki_df_small = median_wiki_df_small.reset_index()


In [None]:
nba_players_with_salary_wiki_df = nba_players_with_salary_df.merge(median_wiki_df_small)


In [None]:
twitter_df = pd.read_csv("../input/nba_2017_twitter_players.csv");twitter_df.head()


In [None]:
nba_players_with_salary_wiki_twitter_df = nba_players_with_salary_wiki_df.merge(twitter_df)


In [None]:
nba_players_with_salary_wiki_twitter_df.head()

In [None]:

# Correlation heatmap of the numeric columns after adding the Wikipedia and Twitter data.
# Use the axes returned by plt.subplots() directly rather than calling plt.axes(),
# which can add a second axes on top of the one just created.
fig, ax = plt.subplots(figsize=(20,15))
ax.set_title("NBA Player Correlation Heatmap:  2016-2017 Season (STATS & SALARY & TWITTER & WIKIPEDIA)")
# Restrict to numeric columns explicitly: the frame also holds text columns such as
# PLAYER, and DataFrame.corr() on pandas >= 2.0 raises on those instead of skipping them.
corr = nba_players_with_salary_wiki_twitter_df.select_dtypes(include=[np.number]).corr()
sns.heatmap(corr,
            xticklabels=corr.columns.values,
            yticklabels=corr.columns.values,
            ax=ax)

In this kernel, I want to find which factors most influence RPM (Real Plus-Minus). I will first show why RPM is an important stat to measure, then do some EDA to explore the factors that most influence RPM and, thus, winning ballgames.

In [None]:
#next, we want to see RPM in a scatterplot against some of the other "objective" positive outcomes
#in the dataset, such as PIE, wins, and salary.

import matplotlib.mlab as mlab
import matplotlib.pyplot as plt
import pylab

# The bare import also binds the top-level name `matplotlib`, so that
# `matplotlib.pyplot...` style calls resolve.
import matplotlib.pyplot

# RPM against PIE, with labelled axes so the figure can be read on its own.
plt.scatter(nba_players_with_salary_df['RPM'],nba_players_with_salary_df['PIE'])
plt.xlabel("RPM")
plt.ylabel("PIE")
plt.show()

In [None]:
matplotlib.pyplot.scatter(nba_players_with_salary_df['RPM'],nba_players_with_salary_df['SALARY_MILLIONS'])
matplotlib.pyplot.show()

In [None]:
matplotlib.pyplot.scatter(nba_players_with_salary_df['WINS_RPM'],nba_players_with_salary_df['W'])
matplotlib.pyplot.show()

In [None]:
matplotlib.pyplot.scatter(nba_players_with_salary_df['W'],nba_players_with_salary_df['SALARY_MILLIONS'])
matplotlib.pyplot.show()

In [None]:
matplotlib.pyplot.scatter(nba_players_with_salary_df['RPM'],nba_players_with_salary_df['W'])
matplotlib.pyplot.show()

In [None]:
sns.lmplot(x="RPM", y="W", data=nba_players_with_salary_wiki_twitter_df)


In [None]:
sns.lmplot(x="RPM", y="W", data=nba_players_with_salary_wiki_twitter_df)

Interestingly, here we see that no player with an RPM above 2.5 has won fewer than about 30 games. But there are many players who have won fewer than 30 games, and all of them have an RPM under 2.5. So let's take a closer look at the players with an RPM above 2.5 to see if we can find anything interesting about them.

In [None]:
# Fraction of players whose RPM is strictly above 2.5. The mean of the boolean mask is
# count / total, computed in one vectorised pass instead of a Python-level loop.
(nba_players_with_salary_df['RPM'] > 2.5).mean()

In [None]:
# Restrict the RPM-vs-W regression plot to players with RPM of at least 2.5.
high_rpm_players = nba_players_with_salary_wiki_twitter_df.loc[
    nba_players_with_salary_wiki_twitter_df["RPM"] >= 2.5
]
sns.lmplot(data=high_rpm_players, x="RPM", y="W")

In [None]:
#first we want to see the distribution of RPM in the dataset
#we see that very few players have an RPM over 4

import matplotlib.mlab as mlab
import matplotlib.pyplot as plt
import pylab

plt.hist(nba_players_with_salary_df['RPM'])
plt.ylabel("Distribution")
plt.xlabel("RPM")
plt.show()

In [None]:
#first we want to see the distribution of RPM in the dataset
#we see that very few players have an RPM over 4

import matplotlib.mlab as mlab
import matplotlib.pyplot as plt
import pylab

plt.hist(nba_players_with_salary_df['W'])
plt.ylabel("Distribution")
plt.xlabel("W")
plt.show()

From the above, we can see that RPM is highly correlated with wins. This is a significant result because it shows that good players are on good teams. We can determine causality, because there are only 30 teams and the best young players are drafted to the worst teams. So if a player is actually good, i.e., the player has a high RPM, they can elevate their team's number of wins. Let's take a deeper look to find out what factors most affect RPM, so teams can look for key traits in players that will ultimately help elevate their team's level of play.

In [None]:
nba_players_with_salary_wiki_twitter_df.dtypes

In [None]:
results = smf.ols('WINS_RPM ~RPM + W', data=nba_players_with_salary_df).fit()
print(results.summary())

In [None]:
results = smf.ols('W~ RPM', data=nba_players_with_salary_wiki_twitter_df).fit()
print(results.summary())

Now that we've established the correlation between winning and RPM, let's find out which player stats most affect RPM.

In [None]:
x = nba_players_with_salary_df.drop(['RPM','PLAYER','POSITION','TEAM','WINS_RPM', 'ORPM', 'DRPM', 'W'], axis = 1)
y = nba_players_with_salary_df['RPM']

results = smf.ols('y ~ x', data = nba_players_with_salary_df).fit()
print(results.summary())

In [None]:
list(x)

In [None]:
#the above result is very interesting because it says that assists, steals, blocks, and turnovers have the
#largest effect on RPM, but points and shooting don't really matter
#Let's run that regression

results = smf.ols('RPM ~ AST + STL + BLK + TOV', 
                  data=nba_players_with_salary_wiki_twitter_df).fit()
print(results.summary())

In [None]:
rpm_factors = nba_players_with_salary_wiki_twitter_df[['PLAYER','AST','TOV','STL','BLK', 'RPM', 'W']].copy()
rpm_factors.head()

In [None]:
# Correlation heatmap of the candidate RPM factors.
# Use the axes returned by plt.subplots() directly rather than calling plt.axes(),
# which can add a second axes on top of the one just created.
fig, ax = plt.subplots(figsize=(20,15))
# Title without the stray closing parenthesis.
ax.set_title("NBA Player Correlation Heatmap:  Major Factors Affecting RPM")
# rpm_factors includes the text column PLAYER; keep only numeric columns so that
# DataFrame.corr() does not raise on pandas >= 2.0.
corr = rpm_factors.select_dtypes(include=[np.number]).corr()
sns.heatmap(corr,
            xticklabels=corr.columns.values,
            yticklabels=corr.columns.values,
            ax=ax)

In [None]:
#TOV is no longer significant, so refit without it

results = smf.ols(
    formula='RPM ~ AST + STL + BLK',
    data=nba_players_with_salary_wiki_twitter_df,
).fit()
print(results.summary())

Since our R^2 barely changes when we take out TOV, we can now say we've narrowed down RPM to just 3 main factors: Assists, Steals, and Blocks! Let us look at each of these variables against RPM, with wins as color.

In [None]:
p = ggplot(nba_players_with_salary_df,aes(x="AST", y="RPM", color="W")) + geom_point(size=200)
p + xlab("AST") + ylab("RPM") + ggtitle("NBA Players 2016-2017:  Assists, Real Plus Minus, and Wins")

#even more linear than I expected!

In [None]:
p = ggplot(nba_players_with_salary_df,aes(x="STL", y="RPM", color="W")) + geom_point(size=200)
p + xlab("STL") + ylab("RPM") + ggtitle("NBA Players 2016-2017:  Steals, Real Plus Minus, and Wins")

In [None]:
p = ggplot(nba_players_with_salary_df,aes(x="BLK", y="RPM", color="W")) + geom_point(size=200)
p + xlab("BLK") + ylab("RPM") + ggtitle("NBA Players 2016-2017:  Blocks, Real Plus Minus, and Wins")

Let's create a new variable adding assists, steals, and blocks, and see how that correlates with RPM and Wins.

In [None]:
# Combined playmaking/defence column: assists + steals + blocks.
nba_players_with_salary_df['ASTSTLBLK'] = nba_players_with_salary_df['AST'] + nba_players_with_salary_df['STL'] + nba_players_with_salary_df['BLK']

# Combined column against RPM, coloured by W.
p = ggplot(nba_players_with_salary_df,aes(x="ASTSTLBLK", y="RPM", color="W")) + geom_point(size=200)
# The title names the combined variable that is actually plotted on the x axis,
# not "Steals".
p + xlab("ASTSTLBLK") + ylab("RPM") + ggtitle("NBA Players 2016-2017:  Assists + Steals + Blocks, Real Plus Minus, and Wins")

In [None]:
results = smf.ols('RPM ~ASTSTLBLK', 
                  data=nba_players_with_salary_df).fit()
print(results.summary())

In [None]:
#Now let's look at aststlblk vs wins

results = smf.ols('W ~ASTSTLBLK', 
                  data=nba_players_with_salary_df).fit()
print(results.summary())

In [None]:
# Age against the combined ASTSTLBLK column, coloured by RPM.
p = ggplot(nba_players_with_salary_df,aes(x="AGE", y="ASTSTLBLK", color="RPM")) + geom_point(size=200)
# The title names the combined variable that is actually plotted on the y axis,
# not "Steals".
p + xlab("AGE") + ylab("ASTSTLBLK") + ggtitle("NBA Players 2016-2017:  Age, Assists + Steals + Blocks, and Real Plus Minus")

In [None]:
# Number of clusters. The fixed random_state makes the cluster assignment, and hence
# the plot colours, reproducible across kernel restarts.
k_means = KMeans(n_clusters=3, random_state=0)

# Choose the columns that the clusters will be based upon
cluster_source = nba_players_with_salary_df.loc[:,["RPM", "W", "ASTSTLBLK"]]

# Create the clusters
kmeans = k_means.fit(cluster_source)

# Create a column, 'cluster,' denoting the cluster classification of each row
nba_players_with_salary_df['cluster'] = kmeans.labels_

# Create a scatter plot with colors based on the cluster.
# sns.lmplot returns a seaborn grid object, not a matplotlib Axes, despite the name `ax`.
# NOTE(review): newer seaborn releases call the `size` argument `height` -- confirm
# against the installed version before renaming.
ax = sns.lmplot(x="ASTSTLBLK", y="RPM", data=nba_players_with_salary_df,hue="cluster", size=12, fit_reg=False)
# The title no longer mentions Wikipedia: no Wikipedia column is plotted or clustered on.
ax.set(xlabel='ASTSTLBLK', ylabel='RPM', title="NBA player ASTSTLBLK vs RPM clustered on ASTSTLBLK, W, RPM:  2016-2017 Season")

The cluster map above doesn't give us any meaningful results using the aststlblk variable, so let's go back to our central question of how to determine players with RPM > 2.5 with another method: Random Forest Classifier.

In [None]:
# Two classes split at RPM = 2.5: 'Low' is RPM <= 2.5, 'High' is RPM > 2.5.
# The lower edge is -inf rather than a finite guess, so no RPM value can fall outside
# the bins and be labelled NaN.
bins = [-np.inf, 2.5, np.inf]
labels = ['Low', 'High']
nba_players_with_salary_df['High_RPM'] = pd.cut(nba_players_with_salary_df['RPM'],bins,labels=labels)
# Class balance of the new label.
nba_players_with_salary_df['High_RPM'].value_counts()

In [None]:
nba_players_with_salary_df["3P%"] = np.where(nba_players_with_salary_df["3P%"].isnull(), 0, nba_players_with_salary_df["3P%"])
nba_players_with_salary_df["FT%"] = np.where(nba_players_with_salary_df["FT%"].isnull(), 0, nba_players_with_salary_df["FT%"])

In [None]:
print(pd.isnull(nba_players_with_salary_df).sum())

In [None]:
#tutorial reference: https://www.datacamp.com/community/tutorials/exploratory-data-analysis-python

from sklearn.ensemble import RandomForestClassifier

# Predictors are selected by position (columns 6 through 24).
# NOTE(review): this depends on the column order produced by the merges above --
# confirm the slice still covers the intended stat columns if any merge changes.
X = nba_players_with_salary_df.iloc[:,6:25]
# Select the target by name rather than as "the last column", so that adding another
# column to the frame cannot silently change what is being predicted.
Y = nba_players_with_salary_df['High_RPM']

# Feature names, in the same order as the importances returned by the model.
names = X.columns.values

# Build the model; the fixed random_state makes the importances reproducible.
rfc = RandomForestClassifier(random_state=0)

# Fit the model
rfc.fit(X, Y)

# Print the results, highest importance first.
print("Features sorted by their score:")
importances = [round(score, 4) for score in rfc.feature_importances_]
print(sorted(zip(importances, names), reverse=True))

The above feature importance shows that FTA, FGA, DRB, FT, and eFG% are also important variables in addition to assists, steals, and blocks.