In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

First, we need to import some datasets. I need to **Salaries** dataset for data about players and their salaries, the **CollegePlaying** dataset for data about what college a player went to, and then the **Schools** dataset for data about different colleges.

In [2]:
## get datasets
salaries = pd.read_csv('./files/Salaries.csv')
schools = pd.read_csv('./files/Schools.csv')
college = pd.read_csv('./files/CollegePlaying.csv')

Now it's time to inspect them.

In [3]:
schools.head()

Unnamed: 0,schoolID,name_full,city,state,country
0,abilchrist,Abilene Christian University,Abilene,TX,USA
1,adelphi,Adelphi University,Garden City,NY,USA
2,adrianmi,Adrian College,Adrian,MI,USA
3,akron,University of Akron,Akron,OH,USA
4,alabama,University of Alabama,Tuscaloosa,AL,USA


In [4]:
salaries.head()

Unnamed: 0,yearID,teamID,lgID,playerID,salary
0,1985,ATL,NL,barkele01,870000
1,1985,ATL,NL,bedrost01,550000
2,1985,ATL,NL,benedbr01,545000
3,1985,ATL,NL,campri01,633333
4,1985,ATL,NL,ceronri01,625000


In [5]:
college.head()

Unnamed: 0,playerID,schoolID,yearID
0,aardsda01,pennst,2001
1,aardsda01,rice,2002
2,aardsda01,rice,2003
3,abadan01,gamiddl,1992
4,abadan01,gamiddl,1993


<h1> The Overall Average and Median Career Salaries by Most Popular College and Most Popular State for College </h1>

It looks like in order to figure out which college a player went to and which state that college is in, I need to ***merge*** the Salaries dataset (**salaries**) with the CollegePlaying dataset (**college**) by the **playerID** field.

Then I'd need to merge *again* in order to combine this new dataset, **salaries_college** with the Schools dataset (**schools**) on the **schoolID** to bring in what state a college is in, and I'll call this final dataset **salaries_college_state**

In [6]:
## bring in college to player salary DataFrame
salaries_college = pd.merge(salaries,college.iloc[:,:2], on='playerID')

## bring in college state into new DataFrame
salaries_college_state = pd.merge(salaries_college,schools[['schoolID','state','name_full']],on='schoolID')

## inspect DataFrame
salaries_college_state.head()

Unnamed: 0,yearID,teamID,lgID,playerID,salary,schoolID,state,name_full
0,1985,ATL,NL,bedrost01,550000,newhaven,CT,University of New Haven
1,1986,PHI,NL,bedrost01,650000,newhaven,CT,University of New Haven
2,1987,PHI,NL,bedrost01,1050000,newhaven,CT,University of New Haven
3,1988,PHI,NL,bedrost01,925000,newhaven,CT,University of New Haven
4,1989,PHI,NL,bedrost01,1450000,newhaven,CT,University of New Haven


Next, to see the most popular colleges in this dataset, I need to group the records by schoolID and then count how many records appear in each group.

In [7]:
## get count of records grouped by schools and sort in descending order
popular_schools = salaries_college_state['schoolID'].groupby(salaries_college_state['schoolID']).value_counts().sort_values(ascending = False)

## view top 10 most popular schools
popular_schools.head(10)

schoolID   schoolID 
ucla       ucla         718
usc        usc          655
arizonast  arizonast    632
stanford   stanford     595
lsu        lsu          592
miamifl    miamifl      492
floridast  floridast    487
texas      texas        477
calstfull  calstfull    471
gatech     gatech       446
Name: schoolID, dtype: int64

But wait. Unfortunately, this is the total count of *ALL* schoolID's in the dataset, but it is *NOT* accounting for multiple records of the *same player*, and therefore I've got duplicate counts. In other words, I've got multiple records for the same playerID if they played for multiple years, and for each year they played, their alma mater counted *again* when it should only be counted *once*.

I need to group records by player and *then* group them by college and count *these* values to get the counts of *unique* players from each college. I'm also going to start using **name_full** instead of schoolID in order to make it more digestable for readers.

I can then do this same process but for the states in which colleges are located by grouping records by state after grouping by player.

Now, I need to get grab out the mean career salaries of each player and also include the college and college state of those players.

In [22]:
#grouped_salaries_college['playerID'].head()
grouped_salaries_college_df = pd.DataFrame({
    'playerID': grouped_salaries_college['playerID'].head(),
    'name_full': grouped_salaries_college['name_full'].head(),
    'salary': grouped_salaries_college['salary'].head()
})

grouped_salaries_state_df = pd.DataFrame({
    'playerID': grouped_salaries_state['playerID'].head(),
    'state': grouped_salaries_state['state'].head(),
    'salary': grouped_salaries_state['salary'].head()
})

print(grouped_salaries_college_df.head(),' ',len(grouped_salaries_college_df))
print(grouped_salaries_state_df.head(),' ',len(grouped_salaries_state_df))

                 name_full   playerID   salary
0  University of New Haven  bedrost01   550000
1  University of New Haven  bedrost01   650000
2  University of New Haven  bedrost01  1050000
3  University of New Haven  bedrost01   925000
4  University of New Haven  bedrost01  1450000   13369
    playerID   salary state
0  bedrost01   550000    CT
1  bedrost01   650000    CT
2  bedrost01  1050000    CT
3  bedrost01   925000    CT
4  bedrost01  1450000    CT   12676


bit more colleges than states?

In [24]:
## find most frequent colleges + states
top_colleges = grouped_salaries_college_df['playerID'].groupby(grouped_salaries_college_df['name_full']).count().sort_values(ascending = False)
print(top_colleges.head(10),'\n')

top_states = grouped_salaries_state_df['playerID'].groupby(grouped_salaries_state_df['state']).count().sort_values(ascending = False)
print(top_states.head(10))

name_full
Arizona State University                 226
Stanford University                      200
University of Southern California        189
University of California, Los Angeles    185
Louisiana State University               175
University of Texas at Austin            164
University of Miami                      158
University of Arizona                    149
University of Florida                    145
California State University Fullerton    144
Name: playerID, dtype: int64 

state
CA    2568
FL    1147
TX    1062
AZ     535
LA     446
NC     444
IL     419
OK     408
AL     356
TN     336
Name: playerID, dtype: int64


In [8]:
## create lists of top colleges and states
college_names = ['Arizona State University', 'University of Southern California', 'Stanford University', 
                 'University of California, Los Angeles', 'University of Texas at Austin', 'Louisiana State University',
                 'University of Miami', 'California State University Fullerton', 'Oklahoma State University', 
                 'University of Arizona']
state_names = ['CA','FL','TX','AZ','LA','OK','NC']

## get just those records from top schools and top states
top_colleges_avg_salary = mean_salary_player_college[mean_salary_player_college['name_full'].isin(college_names)][['name_full','salary']]
top_states_avg_salary = mean_salary_player_state[mean_salary_player_state['state'].isin(state_names)][['state','salary']]

top_colleges_med_salary = median_salary_player_college[median_salary_player_college['name_full'].isin(college_names)][['name_full','salary']]
top_states_med_salary = median_salary_player_state[median_salary_player_state['state'].isin(state_names)][['state','salary']]


print(top_colleges_avg_salary.head(10),'\n')
print(top_states_avg_salary.head(10))
print('\n\n',top_colleges_med_salary.head(10),'\n')
print(top_states_med_salary.head(10))

In [None]:
## get mean career salary of players by college and by state
mean_salary_player_college = salary_player_college.mean()
mean_salary_player_state = salary_player_state.mean()

print('Mean Salaries by College:','\n',mean_salary_player_college.head(),'\n')
print('Mean Salaries by State:','\n',mean_salary_player_state.head())

## get median career salary of players by college and by state
median_salary_player_college = salary_player_college.median()
median_salary_player_state = salary_player_state.median()

print('\n\nMedian Salaries by College:','\n',median_salary_player_college.head(),'\n')
print('Median Salaries by State:','\n',median_salary_player_state.head())

Notice that some players went to 2 colleges and therefore "played for" 2 states. I'll ignore that for this analysis.

Now I want to apply this data to just the most "popular" colleges and states, displayed below.

In [14]:
## find most frequent colleges + states
top_colleges = mean_salary_player_college['playerID'].groupby(mean_salary_player_college['name_full']).count().sort_values(ascending = False)
print(top_colleges.head(10),'\n')

top_states = mean_salary_player_state['playerID'].groupby(mean_salary_player_state['state']).count().sort_values(ascending = False)
print(top_states.head(10))

name_full
Arizona State University                 51
University of Southern California        44
Stanford University                      44
University of California, Los Angeles    41
University of Texas at Austin            41
Louisiana State University               40
University of Miami                      38
California State University Fullerton    34
Oklahoma State University                33
University of Arizona                    33
Name: playerID, dtype: int64 

state
CA    596
FL    275
TX    255
AZ    122
LA    103
OK    101
NC    101
IL     96
SC     81
AL     78
Name: playerID, dtype: int64


Now to double check we get the right counts by doing the same with the median data set.

In [15]:
top_colleges2 = median_salary_player_college['playerID'].groupby(median_salary_player_college['name_full']).count().sort_values(ascending = False)
print(top_colleges2.head(10),'\n')

top_states2 = median_salary_player_state['playerID'].groupby(median_salary_player_state['state']).count().sort_values(ascending = False)
print(top_states2.head(10))

name_full
Arizona State University                 51
University of Southern California        44
Stanford University                      44
University of California, Los Angeles    41
University of Texas at Austin            41
Louisiana State University               40
University of Miami                      38
California State University Fullerton    34
Oklahoma State University                33
University of Arizona                    33
Name: playerID, dtype: int64 

state
CA    596
FL    275
TX    255
AZ    122
LA    103
OK    101
NC    101
IL     96
SC     81
AL     78
Name: playerID, dtype: int64


Now, I want to place these colleges and states into a list and then filter out our DataFrames of the mean salaries to just contain these colleges and states.

Also, I am going to cut down the states to just those with over 100 records (players), just because.

In [16]:
## create lists of top colleges and states
college_names = ['Arizona State University', 'University of Southern California', 'Stanford University', 
                 'University of California, Los Angeles', 'University of Texas at Austin', 'Louisiana State University',
                 'University of Miami', 'California State University Fullerton', 'Oklahoma State University', 
                 'University of Arizona']
state_names = ['CA','FL','TX','AZ','LA','OK','NC']

## get just those records from top schools and top states
top_colleges_avg_salary = mean_salary_player_college[mean_salary_player_college['name_full'].isin(college_names)][['name_full','salary']]
top_states_avg_salary = mean_salary_player_state[mean_salary_player_state['state'].isin(state_names)][['state','salary']]

top_colleges_med_salary = median_salary_player_college[median_salary_player_college['name_full'].isin(college_names)][['name_full','salary']]
top_states_med_salary = median_salary_player_state[median_salary_player_state['state'].isin(state_names)][['state','salary']]


print(top_colleges_avg_salary.head(10),'\n')
print(top_states_avg_salary.head(10))
print('\n\n',top_colleges_med_salary.head(10),'\n')
print(top_states_med_salary.head(10))

                                name_full        salary
10          University of Texas at Austin  4.265000e+05
18                    Stanford University  1.478333e+05
20              Oklahoma State University  3.010000e+05
24             Louisiana State University  2.848750e+05
29                    Stanford University  2.615000e+05
39                    University of Miami  1.560000e+06
43      University of Southern California  1.700000e+05
49  University of California, Los Angeles  3.962500e+05
50                    Stanford University  1.780000e+05
51  University of California, Los Angeles  4.467000e+05 

   state        salary
1     TX  1.322821e+06
5     CA  1.295000e+05
6     FL  3.270000e+05
9     TX  4.265000e+05
10    NC  2.460000e+06
13    CA  1.805000e+05
15    TX  2.954400e+06
16    NC  3.295000e+05
17    CA  1.478333e+05
19    OK  3.010000e+05


                                 name_full     salary
10          University of Texas at Austin   367500.0
18                  

Next, I need to group these average and median career salaries by college and by state and then compute the mean and median in order to get the average and median career salary of all players that went to my selected colleges and went to college in my selected states.

In [17]:
# group the career salaries by college + state and inspect
print(np.array(top_colleges_avg_salary['salary'].groupby(top_colleges_avg_salary['name_full']).mean()),'\n')
print(np.array(top_states_avg_salary['salary'].groupby(top_states_avg_salary['state']).mean()),'\n')

print('\n\n',np.array(top_colleges_med_salary['salary'].groupby(top_colleges_med_salary['name_full']).mean()),'\n')
print(np.array(top_states_med_salary['salary'].groupby(top_states_med_salary['state']).mean()),'\n')

[ 1367384.31788712  1092950.99924765  1221145.03250888   860608.77326377
  1091013.31845238   853386.00467136  1283692.32301421  1658218.56270075
  1579411.07092175   868401.947516  ] 

[ 1155620.47824875  1110263.94430724  1035668.07309999  1087084.27247566
   837998.14845935   881364.40639949  1021279.06790348] 



 [ 1309657.23529412   729725.          1058594.1625       801268.15151515
   920200.73863636   743890.04545455  1207705.85365854  1569823.10526316
  1342062.78409091   768847.65853659] 

[ 1037618.94262295   966142.22567114   948290.70909091   873990.69417476
   678176.33168317   785640.64356436   884478.48431373] 



So I've got 10 mean and median career salaries by college and 7 mean and median career salaries by state. Now, I am going to sort these college names and state names alphabetically, grab their mean and median career salaries and place them into NumPy arrays.

Then I need to construct 3 DataFrames, wherein the 1st column will be the college or state name, respectively, and the next 2 columns would be the mean career salary and median career salary for that college or state, respectively.

I am going to change my index from the full college name to the typical abbreviations for these colleges that one would see for during a sports broadcast in order to make them easier to plot on an axis.

In [18]:
## sort indices and create arrays of mean and median career salaries
sorted_colleges = sorted(['ASU', 'USC', 'STN','UCLA', 'UT', 'LSU','MIA', 'CAL', 'OSU','ARI'])
med_salary_college = np.array(top_colleges_avg_salary['salary'].groupby(top_colleges_avg_salary['name_full']))#.median())
avg_salary_college = np.array(top_colleges_avg_salary['salary'].groupby(top_colleges_avg_salary['name_full']))#.mean())

ValueError: setting an array element with a sequence

In [None]:
## sort indices and create arrays of mean and median career salaries
sorted_colleges = sorted(['ASU', 'USC', 'STN','UCLA', 'UT', 'LSU','MIA', 'CAL', 'OSU','ARI'])
med_salary_college = np.array(top_colleges_avg_salary['salary'].groupby(top_colleges_avg_salary['name_full']))#.median())
avg_salary_college = np.array(top_colleges_avg_salary['salary'].groupby(top_colleges_avg_salary['name_full']))#.mean())

sorted_states = sorted(top_states_avg_salary['state'].unique())
med_salary_states = np.array(top_states_avg_salary['salary'].groupby(top_states_avg_salary['state']).median())
avg_salary_states = np.array(top_states_avg_salary['salary'].groupby(top_states_avg_salary['state']).mean())

## create DataFrames
career_salary_college = pd.DataFrame({
    'med_salary_college': med_salary_college,
    'avg_salary_college': avg_salary_college
}, index = sorted_colleges)

career_salary_state = pd.DataFrame({
    'med_salary_states': med_salary_states,
    'avg_salary_states': avg_salary_states
}, index = sorted_states)

# inspect DataFrames
print(career_salary_college.head(),'\n')
print(career_salary_state.head())

Now it's time to plot them

In [None]:
%matplotlib inline

fig, axes = plt.subplots(1, 2, figsize=(15,8)) 

fig.suptitle("College Data", fontsize = 20, y = 0.95)

career_salary_college['med_salary_college'].plot(kind = 'bar', ax = axes[0], color = 'g')
career_salary_college['avg_salary_college'].plot(kind = 'bar', ax = axes[1], color = 'g')

axes[0].set_title('Median Career Salary of The Most Popular Colleges')
axes[0].set_xlabel('College')
axes[0].set_ylabel('Salary ($)')
axes[0].set_xticklabels(career_salary_college.index, rotation=45)

axes[1].set_title('Average Career Salary of The Most Popular Colleges')
axes[1].set_xlabel('College')
axes[1].set_ylabel('Salary ($)')
axes[1].set_xticklabels(career_salary_college.index, rotation=45)

plt.show()

***Note the y-axis differences***. I will keep this view so as to show the differences between colleges more easy to read.

So we can see that players that went to University of Southern California had the highest median salary (~ 1.5x more than most other colleges), as well as the second highest average salary. This indicates that this could possible be a good choice of school for a player to earn a lot of money in the pros.

The average salaries were a lot more varied among colleges, with UCLA being to college with the highest overall salary and Oklahoma State University being the lowest, with the University of Miami and University of Texas - Austin not far above.

I'd assume these distributions would not be too normal, so the median would be the better bar plot to take note of.

In [None]:
grouped_salaries_college['salary'].head()

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(15,8)) 

fig.suptitle("State Data", fontsize = 20, y = 0.95)

career_salary_state['med_salary_states'].plot(kind = 'bar', ax = axes[0], color = 'g')
career_salary_state['avg_salary_states'].plot(kind = 'bar', ax = axes[1], color = 'g')

axes[0].set_title('Median Career Salary by The Most Popular States for College')
axes[0].set_xlabel('State for College')
axes[0].set_ylabel('Salary ($)')
axes[0].set_xticklabels(career_salary_state.index, rotation=45)
    
axes[1].set_title('Average Career Salary by Most Popular States for College')
axes[1].set_xlabel('State for College')
axes[1].set_ylabel('Salary ($)')
axes[1].set_xticklabels(career_salary_state.index, rotation=45)

plt.show()

It looks like this dataset doesn't give an indication of the best state to go to college in in order bring up one's chances to make a higher career salary.

North Carolina has the highest median career salary, followed by California and Arizona. But, Arizona has the highest average career salary, followed by California and Louisiana.

So, at least according to this, maybe Arizona and California are the states one would want to look at to play for in college in order to bolster their chances at making a greater salary over their career in the pros. From the college analysis earlier, it looks like USC could be the overall best best in this case.

<h1> Maps </h1>

Now let's look at the mean and median salary for *all* states.

First I'll check some code to see the mean salary by state.

In [None]:
grouped_salaries_college.groupby(grouped_salaries_college['state'])
#print(mean_salary_player_state['salary'].groupby(mean_salary_player_state['state']).mean().head())
#print(mean_salary_player_state['salary'].groupby(mean_salary_player_state['state']).mean().tail())

Now I want to sort by state name alphabetically and then group 

In [None]:
sorted_states = sorted(mean_salary_player_state['state'].unique())
med_salary_states = np.array(mean_salary_player_state['salary'].groupby(mean_salary_player_state['state']).median())
avg_salary_states = np.array(['salary'].groupby(mean_salary_player_state['state']).mean())

average_career_salary_state = pd.DataFrame({
    'med_salary_states': med_salary_states,
    'avg_salary_states': avg_salary_states
}, index = sorted_states)

In [None]:
average_career_salary_state

In [None]:
import folium

state_geo = 'C:/folium-master/examples/data/us-states.json' # from w/in folium package

# Setup a folium map at a high-level zoom @Alok (100,0) are NOT lat long coordinates
map = folium.Map(location = [39, -97], zoom_start = 3.5)

# use built-in method "choropleth" for choropleth maps that bind Pandas Data Frames and json geometries together.  
# This allows us to quickly visualize data combinations

plot_data = average_career_salary_state['med_salary_states']

map.choropleth(geo_path = state_geo, data = plot_data,
               columns = ['StateCode','Value'],
               key_on = 'feature.id',  # refers to label in the JSON object that has country code as the feature ID
               fill_color = 'YlGnBu', fill_opacity = 0.7,
               line_opacity = 0.2,
              # legend_name = legend_label
              )

# Create Folium plot
map.save('plot_data.html')

# Import the Folium interactive html file
from IPython.display import HTML
HTML('<iframe src=plot_data.html width=700 height=450></iframe>')

In [None]:
## group dataframe by team, check mean salary of each

grouped_salaries_team = salaries_college_state.groupby(['teamID'], as_index=False)
grouped_salaries_team['salary'].mean().head()

In [None]:
import folium # conda install -c ioos folium=0.2.1 

state_geo = 'C:/folium-master/examples/data/us-states.json' # from w/in folium package

# Setup a folium map at a high-level zoom @Alok (100,0) are NOT lat long coordinates
map = folium.Map(location = [39, -97], zoom_start = 3.5)

# use built-in method "choropleth" for choropleth maps that bind Pandas Data Frames and json geometries together.  
# This allows us to quickly visualize data combinations

plot_data = average_career_salary_state['avg_salary_states']

map.choropleth(geo_path = state_geo, data = plot_data,
               columns = ['StateCode','Value'],
               key_on = 'feature.id',  # refers to label in the JSON object that has country code as the feature ID
               fill_color = 'YlGnBu', fill_opacity = 0.7,
               line_opacity = 0.2,
              # legend_name = legend_label
              )

# Create Folium plot
map.save('plot_data.html')

# Import the Folium interactive html file
from IPython.display import HTML
HTML('<iframe src=plot_data.html width=700 height=450></iframe>')

In [None]:
team_avg_salary = grouped_salaries_team['salary'].mean()
team_med_salary = grouped_salaries_team['salary'].median()

print(team_avg_salary.head())
print(team_med_salary.head())

In [None]:
## create DataFrames of average and median salaries by team
team_avg_salary = grouped_salaries_team['salary'].mean()
team_med_salary = grouped_salaries_team['salary'].median()

print(team_avg_salary.head())
print(team_med_salary.head())

In [None]:
## put the DataFrame into list for the index and arrays for the data
sorted_teams = sorted(team_avg_salary['teamID'].unique())
avg_team_salary = np.array(team_avg_salary['salary'])
med_team_salary = np.array(team_med_salary['salary'])

print(sorted_teams[:5])
print(avg_team_salary[:5])
print(med_team_salary[:5])

In [None]:
## create new DataFrame with both arrays as columns
average_team_salary_df = pd.DataFrame({
    'med_salary': med_team_salary,
    'avg_salary': avg_team_salary
}, index = sorted_teams)

average_team_salary_df.head()

In [None]:
## Create 2 new DataFrames, sorted by respective columns
sorted_avg_salary = average_team_salary_df.sort_values(by='avg_salary', ascending=False)
sorted_med_salary = average_team_salary_df.sort_values(by='med_salary', ascending=False)

print(sorted_avg_salary.head(),'\n')
print(sorted_med_salary.head())

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(15,8)) 

fig.suptitle("Team Data", fontsize = 20, y = 0.95)

sorted_avg_salary['avg_salary'].head(10).plot(kind = 'bar', ax = axes[1])
sorted_med_salary['med_salary'].head(10).plot(kind = 'bar', ax = axes[0])

axes[0].set_xlabel('Median')
#axes[0].set_xticks()

axes[1].set_xlabel('Average')
#axes[1].set_xticks(rotation=45)

plt.show()