Exploration of How Social Media Can Predict Winning Metrics Better Than Salary

In [1]:
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
# Seaborn's current colour palette, stored in `color`.
color = sns.color_palette()
# `display` and `HTML` come from the public `IPython.display` module; importing
# them from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
# Inject CSS so the notebook's `.container` element uses the full page width.
display(HTML("<style>.container { width:100% !important; }</style>"))
%matplotlib inline

In [2]:
# Read nba_2017_att_val_elo.csv and preview the first rows.
attendance_valuation_elo_df = pd.read_csv("../input/nba_2017_att_val_elo.csv")
attendance_valuation_elo_df.head()

In [3]:
# Read nba_2017_salary.csv and preview the first rows.
salary_df = pd.read_csv("../input/nba_2017_salary.csv")
salary_df.head()


In [4]:
# Read nba_2017_pie.csv and preview the first rows.
pie_df = pd.read_csv("../input/nba_2017_pie.csv")
pie_df.head()

In [5]:
# Read nba_2017_real_plus_minus.csv and preview the first rows.
plus_minus_df = pd.read_csv("../input/nba_2017_real_plus_minus.csv")
plus_minus_df.head()

In [6]:
# Read nba_2017_br.csv and preview the first rows.
br_stats_df = pd.read_csv("../input/nba_2017_br.csv")
br_stats_df.head()

In [7]:
# Normalise the real-plus-minus table:
#   * rename NAME -> PLAYER and WINS -> WINS_RPM;
#   * keep only the text before the first comma of each PLAYER value.
# The split is vectorised and tolerant of names with no comma or with several
# commas, so re-running this cell on already-cleaned names is harmless (the
# tuple-unpacking loop it replaces raised ValueError in those cases).
plus_minus_df.rename(columns={"NAME": "PLAYER", "WINS": "WINS_RPM"}, inplace=True)
# pop() removes the column; assigning it back appends PLAYER as the last
# column, the same layout the original drop-then-add produced.
players = plus_minus_df.pop("PLAYER").str.split(",", n=1).str[0].tolist()
plus_minus_df["PLAYER"] = players
plus_minus_df.head()

In [8]:
# Build the player table from br_stats_df: rename columns to the upper-case
# convention, drop G / GS / TEAM, then inner-join with plus_minus_df on PLAYER
# (only players present in both tables are kept).
br_column_names = {
    'Player': 'PLAYER',
    'Pos': 'POSITION',
    'Tm': "TEAM",
    'Age': 'AGE',
    "PS/G": "POINTS",
}
nba_players_df = (
    br_stats_df
    .rename(columns=br_column_names)
    .drop(["G", "GS", "TEAM"], axis=1)
    .merge(plus_minus_df, how="inner", on="PLAYER")
)
nba_players_df.head()

In [9]:
# Inner-join the PLAYER / PIE / PACE / W columns of pie_df onto the player table.
pie_columns = ["PLAYER", "PIE", "PACE", "W"]
pie_df_subset = pie_df.loc[:, pie_columns].copy()
nba_players_df = pd.merge(nba_players_df, pie_df_subset, how="inner", on="PLAYER")
nba_players_df.head()

In [10]:
# Prepare the salary table for joining:
#   * NAME becomes PLAYER (the join key used by the other tables);
#   * SALARY is expressed in millions, rounded to two decimals;
#   * POSITION, TEAM and the raw SALARY column are removed.
salary_df.rename(columns={'NAME': 'PLAYER'}, inplace=True)
salary_df["SALARY_MILLIONS"] = salary_df["SALARY"].div(1000000).round(2)
salary_df.drop(["POSITION", "TEAM", "SALARY"], axis=1, inplace=True)
salary_df.head()

In [11]:
# PLAYER names that appear in nba_players_df but not in salary_df, as a list.
stats_players = set(nba_players_df["PLAYER"].tolist())
salary_players = set(salary_df["PLAYER"].tolist())
diff = list(stats_players.difference(salary_players))

In [12]:
# Number of entries in `diff` (PLAYER names in nba_players_df that are absent
# from salary_df when the notebook is run in order).
len(diff)


In [13]:

# Attach salaries to the player table. The join key and join type are spelled
# out (instead of relying on merge() inferring every shared column) so this
# merge is consistent with the earlier explicit inner joins on PLAYER.
nba_players_with_salary_df = nba_players_df.merge(salary_df, how="inner", on="PLAYER")

In [38]:
# My EDA Process
# Start with descriptive statistics of the merged stats + salary table.
nba_players_with_salary_df.describe()

In [39]:
# Count the missing values in each column.
nba_players_with_salary_df.isnull().sum()

In [43]:
# Correlation between 3P% and salary (in millions).
# Author's note: the correlation is low, so missing 3P% values are later
# filled with the column average.
three_point_pct = nba_players_with_salary_df['3P%']
salary_millions = nba_players_with_salary_df['SALARY_MILLIONS']
three_point_pct.corr(salary_millions)

In [54]:
# Replace missing 3P% values with the column mean, then confirm that no
# nulls remain (the final expression should evaluate to False).
three_point_col = nba_players_with_salary_df["3P%"]
avg_3p = three_point_col.mean()
nba_players_with_salary_df['3P%'] = three_point_col.fillna(avg_3p)
nba_players_with_salary_df['3P%'].isnull().any()

In [55]:
# Check uniqueness: number of distinct values in each column.
nba_players_with_salary_df.nunique()

In [58]:

# Correlation heatmap of the stats + salary table.
# One figure/axes pair is created and passed to seaborn explicitly, rather
# than calling plt.subplots() and then plt.axes(), which can add a second
# axes to the same figure.
fig, ax = plt.subplots(figsize=(20, 15))
ax.set_title("NBA Player Correlation Heatmap:  2016-2017 Season (STATS & SALARY)")
# Restrict to numeric/bool columns: recent pandas versions no longer drop
# string columns (e.g. PLAYER) automatically in corr() and raise instead.
corr = nba_players_with_salary_df.select_dtypes(include=["number", "bool"]).corr()
sns.heatmap(corr,
            xticklabels=corr.columns.values,
            yticklabels=corr.columns.values,
            ax=ax)

In [103]:
# Covariance matrix of the numeric columns, summarised with describe();
# the final expression shows the mean covariance per column.
# Non-numeric columns are excluded explicitly because recent pandas versions
# raise on them in cov() instead of silently dropping them.
cov = nba_players_with_salary_df.select_dtypes(include=["number", "bool"]).cov()
cov_des = cov.describe()
# Select the "mean" row by label rather than by position (iloc[1]).
cov_des.loc["mean"]

In [112]:
# Scatter plots with a fitted regression line, salary on the x-axis.
# Variables explored across this cell and the next: Rk, MP, FGA, POINTS, GP, MPG.
salary_on_x = dict(x="SALARY_MILLIONS", data=nba_players_with_salary_df)
sns.lmplot(y="Rk", **salary_on_x)
sns.lmplot(y="MP", **salary_on_x)
sns.lmplot(y="FGA", **salary_on_x)

In [116]:
# Regression scatter plots of POINTS, GP and MPG against salary (x-axis).
salary_on_x = dict(x="SALARY_MILLIONS", data=nba_players_with_salary_df)
sns.lmplot(y="POINTS", **salary_on_x)
sns.lmplot(y="GP", **salary_on_x)
sns.lmplot(y="MPG", **salary_on_x)

In [15]:
# Regression scatter plot of WINS_RPM against salary (x-axis).
sns.lmplot(
    data=nba_players_with_salary_df,
    x="SALARY_MILLIONS",
    y="WINS_RPM",
)


In [16]:
# Ordinary least squares: regress W on POINTS.
w_on_points = smf.ols(formula='W ~POINTS', data=nba_players_with_salary_df)
results = w_on_points.fit()


In [17]:
# Print the regression summary of the model currently held in `results`
# (the `W ~POINTS` fit when the notebook is run top to bottom).
print(results.summary())


In [18]:
# Ordinary least squares: regress W on WINS_RPM.
w_on_wins_rpm = smf.ols(formula='W ~WINS_RPM', data=nba_players_with_salary_df)
results = w_on_wins_rpm.fit()


In [19]:
# Print the regression summary of the model currently held in `results`
# (the `W ~WINS_RPM` fit when the notebook is run top to bottom).
print(results.summary())


In [20]:
# Ordinary least squares: regress SALARY_MILLIONS on POINTS.
salary_on_points = smf.ols(formula='SALARY_MILLIONS ~POINTS', data=nba_players_with_salary_df)
results = salary_on_points.fit()


In [21]:
# Print the regression summary of the model currently held in `results`
# (the `SALARY_MILLIONS ~POINTS` fit when the notebook is run top to bottom).
print(results.summary())


In [22]:
# Ordinary least squares: regress SALARY_MILLIONS on WINS_RPM.
salary_on_wins_rpm = smf.ols(formula='SALARY_MILLIONS ~WINS_RPM', data=nba_players_with_salary_df)
results = salary_on_wins_rpm.fit()


In [23]:
# Print the regression summary of the model currently held in `results`
# (the `SALARY_MILLIONS ~WINS_RPM` fit when the notebook is run top to bottom).
print(results.summary())


In [24]:
# NOTE(review): wildcard import placed mid-notebook. The next cell appears to
# rely on it for ggplot, aes, geom_point, xlab, ylab and ggtitle; consider
# importing those names explicitly in the top import cell.
from ggplot import *


In [25]:

# Scatter of POINTS against WINS_RPM, coloured by salary; the labelled plot is
# the cell's final expression so the notebook renders it.
point_aes = aes(x="POINTS", y="WINS_RPM", color="SALARY_MILLIONS")
p = ggplot(nba_players_with_salary_df, point_aes) + geom_point(size=200)
(
    p
    + xlab("POINTS/GAME")
    + ylab("WINS/RPM")
    + ggtitle("NBA Players 2016-2017:  POINTS/GAME, WINS REAL PLUS MINUS and SALARY")
)

In [26]:
# Read nba_2017_player_wikipedia.csv and preview the first rows.
wiki_df = pd.read_csv("../input/nba_2017_player_wikipedia.csv")
wiki_df.head()


In [27]:
# Align the Wikipedia table with the upper-case column convention;
# `names` becomes the PLAYER join key.
wiki_column_names = {'names': 'PLAYER', "pageviews": "PAGEVIEWS"}
wiki_df.rename(columns=wiki_column_names, inplace=True)


In [28]:
# Per-player median of the numeric Wikipedia columns (result indexed by PLAYER).
# numeric_only=True is explicit: recent pandas versions no longer skip
# non-numeric columns by default and raise on them instead.
median_wiki_df = wiki_df.groupby("PLAYER").median(numeric_only=True)


In [29]:

# Keep only the PAGEVIEWS column (as a one-column DataFrame).
median_wiki_df_small = median_wiki_df.loc[:, ["PAGEVIEWS"]]

In [30]:
# Turn the PLAYER index back into a regular column so it can serve as a merge
# key. Derived from median_wiki_df rather than from median_wiki_df_small
# itself, so re-running this cell does not add a spurious "index" column.
median_wiki_df_small = median_wiki_df[["PAGEVIEWS"]].reset_index()


In [31]:
# Add median Wikipedia page views per player. The join key and type are
# explicit, matching the earlier inner joins on PLAYER, instead of letting
# merge() infer the shared columns.
nba_players_with_salary_wiki_df = nba_players_with_salary_df.merge(
    median_wiki_df_small, how="inner", on="PLAYER"
)


In [32]:
# Read nba_2017_twitter_players.csv and preview the first rows.
twitter_df = pd.read_csv("../input/nba_2017_twitter_players.csv")
twitter_df.head()


In [33]:
# Add the Twitter columns per player with an explicit inner join on PLAYER,
# consistent with the other merges in this notebook.
# NOTE(review): assumes PLAYER is the only column twitter_df shares with the
# left table -- confirm against the CSV header.
nba_players_with_salary_wiki_twitter_df = nba_players_with_salary_wiki_df.merge(
    twitter_df, how="inner", on="PLAYER"
)


In [34]:
# Preview the first rows of the merged stats + salary + Wikipedia + Twitter table.
nba_players_with_salary_wiki_twitter_df.head()

In [35]:

# Correlation heatmap of the stats + salary + Wikipedia + Twitter table.
# One figure/axes pair is created and passed to seaborn explicitly, rather
# than calling plt.subplots() and then plt.axes(), which can add a second
# axes to the same figure.
fig, ax = plt.subplots(figsize=(20, 15))
ax.set_title("NBA Player Correlation Heatmap:  2016-2017 Season (STATS & SALARY & TWITTER & WIKIPEDIA)")
# Restrict to numeric/bool columns: recent pandas versions no longer drop
# string columns (e.g. PLAYER) automatically in corr() and raise instead.
corr = nba_players_with_salary_wiki_twitter_df.select_dtypes(include=["number", "bool"]).corr()
sns.heatmap(corr,
            xticklabels=corr.columns.values,
            yticklabels=corr.columns.values,
            ax=ax)