Exploration of How Social Media Can Predict Winning Metrics Better Than Salary

### *Table of Contents:*
### 1. Import Packages
### 2. Import & Aggregate Player Datasets
### 3. EDA (part I)
   #### 3.1 Correlation Heatmap
   #### 3.2 Salary vs. Wins_RPM
   #### 3.3 Scatter Plot with 3 dimensions
### 4. Import & Aggregate Social Media Datasets
### 5. EDA (part II)
   #### 5.1 Correlation Heatmap
   #### 5.2 More Analyses On Salary
   #### 5.3 Wiki Pageviews Analyses
   #### 5.4 Twitter Analyses

# 1. Import Packages 

In [None]:
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
color = sns.color_palette()
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))
%matplotlib inline

# 2. Import & Aggregate Player Datasets

In [None]:
# Read nba_2017_att_val_elo.csv and preview its first rows.
attendance_valuation_elo_df = pd.read_csv("../input/nba_2017_att_val_elo.csv")
attendance_valuation_elo_df.head()

In [None]:
# Read nba_2017_salary.csv and preview its first rows.
salary_df = pd.read_csv("../input/nba_2017_salary.csv")
salary_df.head()


In [None]:
# Read nba_2017_pie.csv and preview its first rows.
pie_df = pd.read_csv("../input/nba_2017_pie.csv")
pie_df.head()

In [None]:
# Read nba_2017_real_plus_minus.csv and preview its first rows.
plus_minus_df = pd.read_csv("../input/nba_2017_real_plus_minus.csv")
plus_minus_df.head()

In [None]:
# Read nba_2017_br.csv and preview its first rows.
br_stats_df = pd.read_csv("../input/nba_2017_br.csv")
br_stats_df.head()

In [None]:
# Remove Position Abbreviation from Name Field
# NAME becomes PLAYER and WINS becomes WINS_RPM so the columns line up with
# the names used in the later merges and plots.
plus_minus_df.rename(columns={"NAME": "PLAYER", "WINS": "WINS_RPM"}, inplace=True)

# Keep only the text before the first comma. The vectorised split tolerates
# names with no comma (or more than one), so the cell can be re-run safely;
# unpacking `player.split(",")` into two names raised ValueError in that case.
players = plus_minus_df["PLAYER"].str.split(",", n=1).str[0].tolist()

# Drop and re-add the column so PLAYER ends up as the last column, as before.
plus_minus_df.drop(["PLAYER"], inplace=True, axis=1)
plus_minus_df["PLAYER"] = players
plus_minus_df.head()

In [None]:

# Build the base player table from br_stats_df: normalise the column names,
# drop the G, GS and TEAM columns, then attach the plus/minus metrics by
# player name. br_stats_df itself is left unmodified.
nba_players_df = (
    br_stats_df
    .rename(
        columns={
            "Player": "PLAYER",
            "Pos": "POSITION",
            "Tm": "TEAM",
            "Age": "AGE",
            "PS/G": "POINTS",
        }
    )
    .drop(columns=["G", "GS", "TEAM"])
    .merge(plus_minus_df, how="inner", on="PLAYER")
)
nba_players_df.head()

In [None]:

# Take an independent copy of the four PIE columns needed downstream and
# inner-join them onto the player table by player name.
pie_df_subset = pie_df.loc[:, ["PLAYER", "PIE", "PACE", "W"]].copy()
nba_players_df = pd.merge(nba_players_df, pie_df_subset, how="inner", on="PLAYER")
nba_players_df.head()

In [None]:
# Reshape the salary table: NAME -> PLAYER, express SALARY in millions
# (rounded to two decimals), and discard the columns no longer needed.
salary_df = (
    salary_df
    .rename(columns={"NAME": "PLAYER"})
    .assign(SALARY_MILLIONS=lambda frame: (frame["SALARY"] / 1000000).round(2))
    .drop(columns=["POSITION", "TEAM", "SALARY"])
)
salary_df.head()

In [None]:
# Player names that appear in nba_players_df but not in salary_df.
diff = list(
    set(nba_players_df["PLAYER"].tolist())
    - set(salary_df["PLAYER"].tolist())
)

In [None]:
# Count of the names collected in `diff`.
len(diff)


In [None]:
# Merge the player stats with the salary table.
# The join key is spelled out (as in the earlier merges on PLAYER) rather than
# left for pandas to infer from whichever column names the two frames share.
# An inner join keeps only the players present in both frames.
# NOTE(review): assumes PLAYER is the only column the two frames have in
# common -- confirm against the salary CSV header.
nba_players_with_salary_df = nba_players_df.merge(salary_df, how="inner", on="PLAYER")
nba_players_with_salary_df.head()

# 3. EDA (I)

## 3.1 Correlation Heatmap

In [None]:

# Pairwise correlations of the numeric columns only. Selecting them explicitly
# keeps this working on pandas >= 2.0, where DataFrame.corr() raises on string
# columns such as PLAYER instead of silently skipping them.
corr = nba_players_with_salary_df.select_dtypes(include=["number", "bool"]).corr()

# Create the figure and its axes once and draw on that axes explicitly;
# calling plt.axes() after plt.subplots() adds a second axes to the figure.
fig, ax = plt.subplots(figsize=(20, 15))
ax.set_title("NBA Player Correlation Heatmap:  2016-2017 Season (STATS & SALARY)")
sns.heatmap(corr,
            xticklabels=corr.columns.values,
            yticklabels=corr.columns.values,
            cmap="Greens",
            ax=ax)

## 3.2  Salary & Wins RPM Correlation

In [None]:
# Scatter of SALARY_MILLIONS against WINS_RPM with a fitted regression line.
sns.lmplot(
    data=nba_players_with_salary_df,
    x="SALARY_MILLIONS",
    y="WINS_RPM",
)


In [None]:
# OLS regression of W on POINTS.
results = smf.ols(
    formula="W ~POINTS",
    data=nba_players_with_salary_df,
).fit()

In [None]:
# OLS Regression Output Summary
# Prints the text report for whichever fit is currently stored in `results`.
print(results.summary())


In [None]:
# OLS regression of W on WINS_RPM (rebinds `results`).
results = smf.ols(
    formula="W ~WINS_RPM",
    data=nba_players_with_salary_df,
).fit()


In [None]:
# Print the text report for whichever fit is currently stored in `results`.
print(results.summary())


In [None]:
# OLS regression of SALARY_MILLIONS on POINTS (rebinds `results`).
results = smf.ols(
    formula="SALARY_MILLIONS ~POINTS",
    data=nba_players_with_salary_df,
).fit()


In [None]:
# Print the text report for whichever fit is currently stored in `results`.
print(results.summary())


In [None]:
# OLS regression of SALARY_MILLIONS on WINS_RPM (rebinds `results`).
results = smf.ols(
    formula="SALARY_MILLIONS ~WINS_RPM",
    data=nba_players_with_salary_df,
).fit()


In [None]:
# Print the text report for whichever fit is currently stored in `results`.
print(results.summary())


## 3.3 Scatter Plot with 3 Dimensions (points/game, wins rpm, salary)

In [None]:
# NOTE(review): wildcard import placed mid-notebook rather than in the import
# cell. The names used in the next cell (ggplot, aes, geom_point, xlab, ylab,
# ggtitle) are presumably supplied by this package -- confirm, and consider
# importing them explicitly.
from ggplot import *


In [None]:

# Base layer: POINTS on x, WINS_RPM on y, colour mapped to SALARY_MILLIONS.
p = (
    ggplot(
        nba_players_with_salary_df,
        aes(x="POINTS", y="WINS_RPM", color="SALARY_MILLIONS"),
    )
    + geom_point(size=200)
)

# Axis labels and title are added on top of `p`; the combined plot is the
# cell's displayed value.
(
    p
    + xlab("POINTS/GAME")
    + ylab("WINS/RPM")
    + ggtitle("NBA Players 2016-2017:  POINTS/GAME, WINS REAL PLUS MINUS and SALARY")
)

In [None]:
# Read nba_2017_player_wikipedia.csv and preview its first rows.
wiki_df = pd.read_csv("../input/nba_2017_player_wikipedia.csv")
wiki_df.head()


# 4. Import and Aggregate Social Media Datasets

In [None]:
# Align the Wikipedia column names with the upper-case names used elsewhere.
wiki_df = wiki_df.rename(
    columns={
        "names": "PLAYER",
        "pageviews": "PAGEVIEWS",
    }
)


In [None]:
# Median of every numeric column per player. numeric_only=True is stated
# explicitly because pandas >= 2.0 no longer skips non-numeric columns by
# default and raises a TypeError on them instead.
median_wiki_df = wiki_df.groupby("PLAYER").median(numeric_only=True)


In [None]:

# Keep only the PAGEVIEWS column (still a DataFrame, indexed by PLAYER).
median_wiki_df_small = median_wiki_df.loc[:, ["PAGEVIEWS"]]

In [None]:
# Move the index back into a regular column so it can be used as a merge key.
median_wiki_df_small = median_wiki_df_small.reset_index(drop=False)


In [None]:
# Attach each player's median PAGEVIEWS. The join key is stated explicitly
# (consistent with the earlier merges on PLAYER) instead of being inferred
# from shared column names; the inner join keeps only players found in both
# frames.
nba_players_with_salary_wiki_df = nba_players_with_salary_df.merge(
    median_wiki_df_small, how="inner", on="PLAYER"
)


In [None]:
# Read nba_2017_twitter_players.csv and preview its first rows.
twitter_df = pd.read_csv("../input/nba_2017_twitter_players.csv")
twitter_df.head()


In [None]:
# Join the Twitter table onto the stats/salary/Wikipedia frame. No key is
# given, so pandas joins on the column names the two frames share.
nba_players_with_salary_wiki_twitter_df = pd.merge(
    nba_players_with_salary_wiki_df,
    twitter_df,
)


In [None]:
# Preview the first five rows of the fully merged frame.
nba_players_with_salary_wiki_twitter_df.head(n=5)

# 5. EDA (II)

## 5.1 Correlation Heatmap

In [None]:

# Pairwise correlations of the numeric columns only. Selecting them explicitly
# keeps this working on pandas >= 2.0, where DataFrame.corr() raises on string
# columns instead of silently skipping them.
corr = nba_players_with_salary_wiki_twitter_df.select_dtypes(include=["number", "bool"]).corr()

# Create the figure and its axes once and draw on that axes explicitly;
# calling plt.axes() after plt.subplots() adds a second axes to the figure.
fig, ax = plt.subplots(figsize=(20, 15))
ax.set_title("NBA Player Correlation Heatmap:  2016-2017 Season (STATS & SALARY & TWITTER & WIKIPEDIA)")
sns.heatmap(corr,
            xticklabels=corr.columns.values,
            yticklabels=corr.columns.values,
            ax=ax)

## 5.2 More Analyses on Salary  

In [None]:
# Shorter alias for the fully merged frame (same object, not a copy).
nba_df = nba_players_with_salary_wiki_twitter_df
nba_df.head()

### 5.2.1 Salary vs. Age

In [None]:
# Scatter of AGE against SALARY_MILLIONS with a fitted regression line.
sns.lmplot(
    data=nba_df,
    x="AGE",
    y="SALARY_MILLIONS",
)
plt.show()

### 5.2.2 Salary vs. Team

In [None]:
# Box plot of SALARY_MILLIONS, one box per TEAM value on the y axis.
plt.figure(figsize=(10, 15))
sns.boxplot(
    data=nba_df,
    x="SALARY_MILLIONS",
    y="TEAM",
)
plt.show()

### 5.2.3 Salary vs. Points 

In [None]:
# Scatter of POINTS against SALARY_MILLIONS with a fitted regression line.
sns.lmplot(
    data=nba_df,
    x="POINTS",
    y="SALARY_MILLIONS",
)
plt.show()

### 5.2.4 Salary vs. Games Played

In [None]:
# Scatter of GP against SALARY_MILLIONS with a fitted regression line.
sns.lmplot(
    data=nba_df,
    x="GP",
    y="SALARY_MILLIONS",
)
plt.show()

### 5.2.5 Salary vs. Endorsement

In [None]:
def _currency_to_float(amounts):
    """Convert a Series of currency strings (e.g. "$1,234") to floats.

    Commas and dollar signs are stripped as literal characters
    (``regex=False``), so "$" can never be interpreted as the regex
    end-of-string anchor regardless of the pandas default for ``regex``.

    Parameters
    ----------
    amounts : pandas.Series of str
        Values containing optional "," and "$" characters.

    Returns
    -------
    pandas.Series
        The same values cast to float.
    """
    cleaned = amounts.str.replace(",", "", regex=False)
    cleaned = cleaned.str.replace("$", "", regex=False)
    return cleaned.astype(float)


endorsement_df = pd.read_csv("../input/nba_2017_endorsements.csv")

# replace string into float
for money_column in ("SALARY", "ENDORSEMENT"):
    endorsement_df[money_column] = _currency_to_float(endorsement_df[money_column])

sns.lmplot(x="SALARY", y="ENDORSEMENT", data=endorsement_df)
plt.show()

## 5.3 Wiki Pageviews Analyses

### 5.3.1 Pageviews vs. Team

In [None]:
# Box plot of PAGEVIEWS, one box per TEAM value on the y axis.
plt.figure(figsize=(10, 15))
sns.boxplot(
    data=nba_df,
    x="PAGEVIEWS",
    y="TEAM",
)
plt.show()

### 5.3.2 Pageviews vs. Wins RPM

In [None]:
# Scatter of WINS_RPM against PAGEVIEWS with a fitted regression line.
sns.lmplot(
    data=nba_df,
    x="WINS_RPM",
    y="PAGEVIEWS",
)
plt.show()

### 5.3.3 Pageviews vs. Points

In [None]:
# Scatter of POINTS against PAGEVIEWS with a fitted regression line.
sns.lmplot(
    data=nba_df,
    x="POINTS",
    y="PAGEVIEWS",
)
plt.show()

### 5.3.4 Pageviews vs. Salary

In [None]:
# Scatter of SALARY_MILLIONS against PAGEVIEWS with a fitted regression line.
sns.lmplot(
    data=nba_df,
    x="SALARY_MILLIONS",
    y="PAGEVIEWS",
)
plt.show()

## 5.4 Twitter Analyses

### 5.4.1 Twitter Favorite Count (TFC) vs. Team

In [None]:
# Box plot of TWITTER_FAVORITE_COUNT, one box per TEAM value on the y axis.
plt.figure(figsize=(10, 15))
sns.boxplot(
    data=nba_df,
    x="TWITTER_FAVORITE_COUNT",
    y="TEAM",
)
plt.show()

### 5.4.2 TFC vs. Wins RPM

In [None]:
# Scatter of WINS_RPM against TWITTER_FAVORITE_COUNT with a fitted regression line.
sns.lmplot(
    data=nba_df,
    x="WINS_RPM",
    y="TWITTER_FAVORITE_COUNT",
)
plt.show()

### 5.4.3 TFC vs. Points

In [None]:
# Scatter of POINTS against TWITTER_FAVORITE_COUNT with a fitted regression line.
sns.lmplot(
    data=nba_df,
    x="POINTS",
    y="TWITTER_FAVORITE_COUNT",
)
plt.show()

### 5.4.4 TFC vs. Salary

In [None]:
# Scatter of SALARY_MILLIONS against TWITTER_FAVORITE_COUNT with a fitted regression line.
sns.lmplot(
    data=nba_df,
    x="SALARY_MILLIONS",
    y="TWITTER_FAVORITE_COUNT",
)
plt.show()

### 5.4.5 Twitter Favorite Count vs. Twitter Retweet Count

In [None]:
# Scatter of TWITTER_FAVORITE_COUNT against TWITTER_RETWEET_COUNT with a fitted regression line.
sns.lmplot(
    data=nba_df,
    x="TWITTER_FAVORITE_COUNT",
    y="TWITTER_RETWEET_COUNT",
)
plt.show()