In [None]:
%pylab inline

import seaborn as sns
import matplotlib.pyplot as plt

import pandas as pd
import numpy as np

from __future__ import division

In [None]:
# Load the free-throw data set; the path is relative to the notebook's working directory.
df = pd.read_csv("../input/free_throws.csv")
# Preview the first two rows to check the columns.
df.head(2)

# Table of Contents

<ol>
<li>Overview Statistics</li>
<li>Top 10 Statistics</li>
<li>Other Topics</li>
</ol>

# 1. Overview Statistics

## 1.1 Number of Games per Season

In [None]:
# Reduce the data to one row per game, then count the games for every
# (season, playoffs) combination; unstack() turns the innermost grouping
# level (playoffs) into columns.
games = (
    df.drop_duplicates("game_id")
    .groupby(["season", "playoffs"])
    .size()
    .unstack()
)
games.head(3)

In [None]:
# Two panels side by side: regular season on the left, playoffs on the right.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(15, 5))
fig.suptitle("Number of Games per Season", y=1.03, fontsize=20)

games["regular"].plot(ax=ax[0], title="Regular Season", marker="o", rot=90, color="#41ae76")
games["playoffs"].plot(ax=ax[1], title="Playoffs", marker="o", rot=90)

In the regular season there are 1231 games played (30 teams playing 82 games each plus one All-Star game), except for the 2011-2012 season, which was shortened due to a lockout; this explains the big drop in the diagram. The number of games is also not exactly 1231 for all seasons because for some games there was simply no data available during the scraping process.

The playoffs are played in a best-of-seven format, which is why the number of games varies.

## 1.2 Average Number of Free Throws per Game by Season

In [None]:
# Number of rows (free throws) for every (season, playoffs) combination,
# with the playoffs level spread out into columns.
ft_total = (
    df.groupby(["season", "playoffs"])
    .size()
    .unstack()
)
ft_total.head(3)

In [None]:
# Element-wise division: pandas aligns both frames on their index and columns,
# giving free throws per game for every (season, playoffs) cell.
ft_per_game = ft_total / games
ft_per_game.head(2)

In [None]:
# One line per column of ft_per_game, plotted over its index.
ft_per_game.plot(marker="o", rot=90, figsize=(12,5))
plt.title("Average Number of Free Throws per Game", fontsize=20)

# Arrow and label use hard-coded data coordinates on the current axes.
# NOTE(review): the x values presumably refer to the positional tick of a season — re-check them if the data changes.
plt.arrow(5.3, 51, -0.5, -1.2, width=0.01, color="k", head_starts_at_zero=False)
plt.text(4.8, 51.2, "Change of Rules")

As expected, the number of free throws per game is higher for playoff games than for regular season games (although only slightly in the first and last season of this data set). Overall, one can see that there is a decline of free throws per game in the course of the seasons. 

There is an especially deep drop from season 2010-2011 to 2011-2012 and it moves almost in parallel for regular season and playoff games. So, there must have been some kind of change regarding the rules of what constitutes a foul. And sure enough, I found this article which confirmed my suspicion: http://www.espn.com/nba/story/_/id/7329584/nba-alters-emphasis-shooting-fouls-2011-12

## 1.3 Number of Free Throws per Period

In [None]:
# Count the free throws per (game, playoffs, period) and summarise them across games.
periods = (
    df.groupby(["game_id", "playoffs", "period"]).size()
    .unstack(["playoffs", "period"])  # rows: games, columns: (playoffs, period)
    .describe()[:2]                   # keep the first two summary rows ("count" and "mean")
    .stack().unstack(0)               # rows: period, columns: (playoffs, statistic)
    .swaplevel(0, 1, axis=1)          # columns: (statistic, playoffs)
    .sort_index(axis=1)               # DataFrame.sortlevel was removed from pandas; sort_index replaces it
)

periods

There were only 7 playoff games that went into the 6th period, so I am not going to include them (or higher periods) into the following graph.

In [None]:
# Plot only the first five rows of the "mean" block (one line per playoffs/regular column);
# the x axis is limited to 0.8-5.2 with ticks at 1..5.
periods["mean"][:5].plot(marker="o", xticks=(1,2,3,4,5), xlim=(0.8, 5.2), figsize=(8,5))
plt.title("Average Number of Free Throws", fontsize=20)

Here again, playoff games have a higher average than regular season games (across all periods). And as expected, the number of free throws increases as the game comes closer to the end, with the highest average being in the fourth quarter.

There is a huge drop in the fifth period because overtime periods are only 5 minutes long. In order to compare them with the first 4 periods (which are 12 minutes long), I am going to calculate the average number of free throws per minute per period.

In [None]:
# Length of every period in minutes: 12 for the four regular periods, 5 for each
# overtime period. Deriving it from the period index (instead of a hard-coded
# 8-element list) keeps the cell working for any number of overtime periods.
periods["minutes"] = [12 if period <= 4 else 5 for period in periods.index]

# Average number of free throws per minute, separately for playoffs and regular season.
periods["playoffs"] = periods["mean"].playoffs / periods.minutes
periods["regular"] = periods["mean"].regular / periods.minutes
periods

In [None]:
# Take the two per-minute columns and keep the first five rows.
per_minute = periods[["playoffs", "regular"]][:5]
# periods has two column levels; drop the second one so the columns are plain names.
per_minute.columns = per_minute.columns.droplevel(1)

per_minute.plot(marker="o", xticks=(1,2,3,4,5), xlim=(0.8, 5.2), figsize=(8,5))
plt.title("Average Number of Free Throws per Minute", fontsize=20)

Now, the pattern is more clear. The closer the game gets to the end, the higher the number of free throws. Let's see if that also applies to the actual playing time left.

## 1.4 Number of Free Throws: Seconds left

In [None]:
# excluding free throws that were made during overtime
# .copy() makes this an independent frame, so adding a column to it later does not
# trigger pandas' SettingWithCopyWarning or depend on view-vs-copy behaviour.
df_seconds_left = df[df.period <= 4].copy()

In [None]:
def determining_seconds_left(row):
    """Return the seconds left until the end of the fourth period.

    ``row.time`` is a "minutes:seconds" string giving the time left in the
    current period and ``row.period`` is the period number. Every period
    after the current one (up to the fourth) counts as 12 full minutes.
    """
    clock = row.time.split(":")
    seconds_on_clock = int(clock[0]) * 60 + int(clock[1])

    periods_still_to_play = 4 - row.period

    return seconds_on_clock + periods_still_to_play * 12 * 60

In [None]:
# Row-wise apply: store the remaining seconds for every free throw in a new column.
# NOTE(review): df_seconds_left is a boolean-filtered selection of df; unless it was created
# as an explicit copy, this assignment can raise pandas' SettingWithCopyWarning.
df_seconds_left["seconds_left"] = df_seconds_left.apply(determining_seconds_left, axis=1)

In [None]:
# 48 bins over the 2880 seconds shown on the x axis, i.e. one bin per minute.
df_seconds_left.seconds_left.hist(bins=48, figsize=(15,4))

plt.xlabel("Seconds left")
plt.title("Number of Free Throws", fontsize=20)
# ticks every 360 seconds (6 minutes)
plt.xticks(range(0, 3240, 360))
plt.xlim(0, 2880)

# Dashed separators at 12, 24 and 36 minutes left; the line height (40000) and the
# label positions below are hard-coded for this data set.
plt.vlines([12*60, 24*60, 36*60], 0, 40000, linestyle="--")
# Less time left means a later quarter, so the 4th quarter is labelled at the left.
plt.text(250, 37000, "4th Quarter")
plt.text(970, 37000, "3rd Quarter")
plt.text(1690, 37000, "2nd Quarter")
plt.text(2410, 37000, "1st Quarter")

There is the same pattern across all 4 periods. The closer the game is to the end of the quarter, the more free throws there are. This makes sense since once the number of team fouls equals 6, a foul automatically leads to free throws.

Furthermore, one can see the same patterns as in the diagrams in section 1.3, namely that the smallest amount of free throws occurs in the first quarter, that the second and third quarter have nearly the same amount of free throws and that most free throws are made in the fourth quarter (this is especially due to the last minute of that quarter).

# 2. Top 10 Statistics

## 2.1 Shooting Percentages

### Overview

In [None]:
# Per player: number of free throws ("ft_count") and share of made shots ("percentage").
shooting = (
    df.groupby(["player"])["shot_made"]
    .agg(["size", "mean"])
    .rename(columns={"size": "ft_count", "mean": "percentage"})
)

# Only players with at least 100 attempts are kept, so that the percentages are meaningful.
shooting = shooting[shooting.ft_count >= 100]

shooting.head(3)

In [None]:
shooting.percentage.hist(bins=50, figsize=(8,5))

plt.title("Distribution of Shooting Percentages", fontsize=20)
plt.xlabel("Shooting Percentage")

# Dashed red line at the median; its height (45) and the label position are hard-coded.
plt.vlines(x=shooting.percentage.median(), ymin=0, ymax=45, color="red", linestyle="--")
plt.text(x=0.72, y=-1.3, s="median", color="red")

The distribution of shooting percentages looks like a left-skewed normal distribution. The majority of players shoot between 70% and slightly above 80%. The best shoot around 90% and the worst shoot less than 50%. The median is at 75.7%.

### The best Shooters

In [None]:
# Ten highest shooting percentages.
shooting.sort_values(by="percentage", ascending=False).head(10)

### The worst Shooters

In [None]:
# Ten lowest shooting percentages.
shooting.sort_values(by="percentage").head(10)

## 2.2 Free Throws per Game

### Overview

In [None]:
# Average number of free throws per game for every player: count the attempts per
# (player, game) and average those counts per player. Only games in which the player
# took at least one free throw contribute. Averaging with a second groupby avoids
# building a wide games x players matrix with unstack() just to take column means.
ft_per_game = (
    df.groupby(["player", "game_id"]).size()
    .groupby(level="player").mean()
    .sort_values(ascending=False)
)

# adding shooting percentages from the shooting dataframe (aligned on the player index)
ft_per_game = pd.DataFrame({"ft_per_game": ft_per_game})
ft_per_game["percentage"] = shooting.percentage

# players that are missing from the shooting dataframe (fewer than 100 shots there)
# end up with NaN in "percentage" and are dropped
ft_per_game = ft_per_game.dropna()

In [None]:
ft_per_game.ft_per_game.hist(bins=50, figsize=(8,5))

plt.title("Distribution of Free Throws per Game per Player", fontsize=20)
plt.xlabel("Number of Free Throws")
# Dashed red line at the median; its height (70) and the label position are hard-coded.
plt.vlines(x=ft_per_game.ft_per_game.median(), ymin=0, ymax=70, color="red", linestyle="--")
plt.text(x=3.08, y=-2, s="median", color="red")

This is a right-skewed normal distribution and most of the players have between 2 and 4 free throws a game. A minority has between 5 and up to almost 10 free throws per game.

### Most Free Throws per Game by Player

In [None]:
# ft_per_game was built from a series sorted in descending order, so the first rows are the highest averages.
ft_per_game.head(10)

All players but the first two have a shooting percentage above the median of 75.7% (LeBron James is slightly below that). This makes sense since those are also the top players of their respective teams and hence they have the ball more often and get also fouled more often.

Dwight Howard, however, who has by far the most free throws per game, also has the worst shooting percentage. This seems strange at first but is simply due to the Hack-a-Shaq tactic applied by the opposing teams.

## 2.3 Consistency of Shooting across all Seasons

Let's see which players have the most stable shooting percentage across the seasons and which the most unstable.

### Most consistent Players

In [None]:
# Shooting percentage ("mean") and number of attempts ("size") per player and season.
shooting_per_season = df.groupby(["player", "season"])["shot_made"].agg(["mean", "size"])

# keep only (player, season) pairs with at least 100 attempts
shooting_per_season = shooting_per_season[shooting_per_season["size"] >= 100]

# pivot the percentages into a table with one row per season and one column per player
shooting_per_season = shooting_per_season["mean"].unstack("player")

# dropna(axis=1) removes every player with a missing value, i.e. only players that
# have a qualifying percentage in every season of the table remain
shooting_per_season = shooting_per_season.dropna(axis=1)

# standard deviation of the per-season percentages, with the overall
# shooting percentage added as reference
shooting_std = pd.DataFrame({"std": shooting_per_season.std()})
shooting_std["shooting_percentage"] = shooting.percentage

shooting_std.sort_values(by="std").head(10)

### Most inconsistent Players

In [None]:
# Ten players with the largest standard deviation across seasons.
shooting_std.sort_values(by="std", ascending=False).head(10)

### Visualization of Shooting Consistency

In [None]:
# One point per player: overall shooting percentage (x) against the standard deviation across seasons (y).
shooting_std.plot(kind="scatter", x="shooting_percentage", y="std", figsize=(8,5))
plt.title("Consistency of Shooting in relation to Shooting Percentage", fontsize=15)

As one would expect, there is a negative correlation between the shooting percentage of a player and the standard deviation of his shooting percentage across all seasons. The better the shooting percentage of a player is, the lower his standard deviation is, meaning that he shoots more consistently.

In [None]:
# Player names with the three largest / three smallest standard deviations.
most_inconsistent = shooting_std.sort_values(by="std", ascending=False).head(3).index
most_consistent = shooting_std.sort_values(by="std").head(3).index

# Two panels with a shared y axis so both groups are drawn on the same scale.
fig, ax = plt.subplots(1,2, figsize=(20,5), sharey=True)

ax1 = shooting_per_season[most_consistent].plot(marker="o", rot=90, ax=ax[0], title="Top 3: Most Consistent")
ax2 = shooting_per_season[most_inconsistent].plot(marker="o", rot=90, ax=ax[1], title="Top 3: Most Inconsistent")

# Explicitly makes the y tick labels of the right panel visible.
plt.setp(ax2.get_yticklabels(), visible=True)
# NOTE(review): the "10 Seasons" in the title is hard-coded, not derived from the data.
plt.suptitle("Shooting Percentages over all 10 Seasons", y=1.03, fontsize=20)

One can clearly see that the lines on the left graph move more smoothly whereas the lines on the right graph are more scattered. Furthermore, one also sees that the more consistent shooters tend to have a higher shooting percentage.

# 3. Other Topics

# 3.1 Performance under Pressure

Let's see how the players handle pressure, i.e. how their shooting percentage changes. I define a pressure situation as the last two minutes of a game and the score difference is within 5 points (5 points ahead or 5 points behind).

## Determining the seconds left of a quarter

In [None]:
def seconds_left(value):
    """Convert a "minutes:seconds" clock string into a number of seconds."""
    parts = value.split(":")
    minutes, secs = int(parts[0]), int(parts[1])
    return minutes * 60 + secs

In [None]:
# Adds a "seconds" column to df: the clock string of every row converted to seconds.
df["seconds"] = df.time.apply(seconds_left)

## Determining which team shot the free throw

### 1. step

If a player hits his second of two shots or his third of three shots, then the score column changes and I can use the score of the previous shot to check which number changed. That way I can determine to which team the respective free throw belongs.

In [None]:
# Score string of the preceding row; the first row gets NaN.
# NOTE(review): shift() works on row order only, so the previous row may belong to a different game — relies on the file's ordering.
df["previous_score"] = df.score.shift()

In [None]:
def who_shot(row):
    """Infer which team took a made final free throw ("2 of 2" or "3 of 3").

    The first number of ``row.score`` is compared with the first number of
    ``row.previous_score``. If it changed, the first team named in
    ``row.game`` is returned, otherwise the second one.

    Returns ``np.nan`` when the play is not a made "2 of 2" / "3 of 3" free
    throw, or when there is no previous score to compare against (the first
    row after ``shift()`` holds NaN) instead of raising on the missing value.
    """
    is_made_last_shot = ("makes free throw 2 of 2" in row.play) or ("makes free throw 3 of 3" in row.play)
    if not is_made_last_shot:
        return np.nan

    # a missing previous score is a float NaN, which has no .split()
    if not isinstance(row.previous_score, str):
        return np.nan

    teams = row.game.split(" - ")
    if row.score.split(" - ")[0] != row.previous_score.split(" - ")[0]:
        return teams[0]
    return teams[1]

In [None]:
# Row-wise apply: "team" holds whatever who_shot returns for the row (a team name or NaN).
df["team"] = df.apply(who_shot, axis=1)

### 2. step

Now, I have a unique player and game_id combination for every free throw for which I determined which team shot it. I can use that to determine the team for every other free throw that that specific player took during the same game.

In [None]:
# Show the first two rows again, now including the columns added above.
df.head(2)

For example, in the previous step I determined that Andrew Bynum belongs to the Los Angeles Lakers (in the above table, it is determined which team shot the second free throw but not which team shot the first). Then, it's obvious that all other free throws he took during that same game, also have to belong to LAL.

In [None]:
# Lookup table player -> {game_id -> team}, built from every free throw
# whose team is already known.
players = {}

known_team = df[df.team.notnull()]
# Iterating over the three columns directly avoids building a Series per row
# (as iterrows does); setdefault creates the per-player dict on first sight.
for player, game_id, team in zip(known_team.player, known_team.game_id, known_team.team):
    players.setdefault(player, {})[game_id] = team

In [None]:
def who_shot(row):
    """Look up the team of ``row.player`` in game ``row.game_id``.

    Reads the module-level ``players`` mapping (player -> {game_id -> team})
    and returns ``np.nan`` when the player or the game is not in it.
    Note: this definition reuses the name ``who_shot`` and therefore
    replaces any earlier function of the same name.
    """
    games_of_player = players.get(row.player)
    if games_of_player is None:
        return np.nan
    return games_of_player.get(row.game_id, np.nan)

In [None]:
# Overwrites the "team" column with the result of the who_shot function currently in scope.
df["team"] = df.apply(who_shot, axis=1)

## Determining the score difference

In [None]:
def find_score_difference(row):
    """Return the shooting team's lead (own score minus opponent score) before the shot.

    ``row.game`` and ``row.score`` are " - " separated strings whose positions
    correspond to each other. The score in the row already contains a made
    free throw, so one point is subtracted from the shooter's side when
    ``row.shot_made == 1``. Returns ``np.nan`` if ``row.team`` matches
    neither team of the game.
    """
    teams = row.game.split(" - ")
    scores = row.score.split(" - ")

    # position of the shooting team inside the "A - B" strings
    if row.team == teams[0]:
        own_pos, opponent_pos = 0, 1
    elif row.team == teams[1]:
        own_pos, opponent_pos = 1, 0
    else:
        return np.nan

    own_score = int(scores[own_pos])
    opponent_score = int(scores[opponent_pos])

    # undo the point of a made shot to get the score at the time of the attempt
    if row.shot_made == 1:
        own_score -= 1

    return own_score - opponent_score

In [None]:
# Row-wise apply: score difference from the shooting team's point of view (NaN where the team is unknown).
df["score_difference"] = df.apply(find_score_difference, axis=1)

In [None]:
# Histogram of the score differences as a sanity check of the wrangling steps above.
df.score_difference.hist(bins=50, figsize=(8,5))

plt.title("Distribution of Score Differences", fontsize=20)
plt.xlabel("Score Difference")

The distribution of the score differences of every free throw looks like a normal distribution which makes sense. So, I can be pretty sure that the data wrangling process which led to the score differences doesn't include major errors.

In [None]:
# Number of rows with a known team and their share of all rows.
n_determined = int(df.team.notnull().sum())

print("Number of free throws for which the team is determined: ", n_determined)
# multiply by 100 so that the printed value really is a percentage, matching the "%" sign
print("Percentage of free throws for which the team is determined: ", 100 * n_determined / len(df), "%")

## Visualization of Performance under Pressure

In [None]:
# Pressure situation: period 4 or later, at most 120 seconds on the clock
# and a score difference between -5 and 5 (inclusive).
# Per player: shooting percentage and number of attempts in those situations.
high_pressure = (
    df[
        (df.period >= 4)
        & (df.seconds <= 120)
        & (df.score_difference <= 5)
        & (df.score_difference >= -5)
    ]
    .groupby("player")["shot_made"]
    .agg(["mean", "size"])
    .rename(columns={"mean": "percentage_pressure", "size": "count_pressure"})
)

# only players with at least 100 free throws under pressure
high_pressure = high_pressure[high_pressure.count_pressure >= 100]

In [None]:
# Join the pressure figures with the overall figures on the player index
# (inner join: only players present in both frames remain).
pressure = high_pressure.merge(shooting, left_index=True, right_index=True)
pressure.head(3)

In [None]:
# Lowest overall percentage among the plotted players; used as the start of the diagonal.
worst_percentage = pressure.percentage.min()

pressure.plot(kind="scatter", x="percentage", y="percentage_pressure", figsize=(8,5))
# Diagonal y = x: points on it have the same percentage in both conditions.
plt.plot([worst_percentage, 1], [worst_percentage, 1], color="red")

plt.title("Shooting Performance under Pressure", fontsize=20)
plt.xlabel("Shooting Percentage: Career")
plt.ylabel("Shooting Percentage: Pressure")

The red line corresponds to the line where there is no difference between the career's shooting percentage of a player and his shooting percentage under pressure. So, every player below that line shoots worse under pressure and every player above it, shoots better under pressure.

There is no clear pattern in the diagram, for example that every player shoots worse under pressure or that only the top players shoot better in pressure situations. So, let's have a look at whose shooting percentage decreases the most and whose percentage improves the most (under the chosen specific pressure conditions).

In [None]:
# Positive values: the player shoots better under pressure than overall; negative: worse.
pressure["difference"] = pressure.percentage_pressure - pressure.percentage

### Most worsened under Pressure

In [None]:
# Ten players with the most negative difference.
pressure.sort_values(by="difference").head(10)

### Most improved under Pressure

In [None]:
# Ten players with the most positive difference.
pressure.sort_values(by="difference", ascending=False).head(10)

# 3.2 Winner Effect

Studies have shown that when we experience a victory or what we perceive to be a victory, no matter how small, the probability that we will win our next contest goes up significantly. So, let's check how the shooting percentages change if a player made his previous free throw.

In [None]:
# Result of the preceding row's shot (NaN for the first row).
df["shot_made_previous"] = df.shot_made.shift()

# Keep only follow-up attempts of a trip to the line ("2 of 2", "2 of 3", "3 of 3"),
# i.e. plays for which the preceding row is expected to be the same trip's previous shot.
winner_effect = df[
    (df.play.str.contains("2 of 2"))
    | (df.play.str.contains("2 of 3"))
    | (df.play.str.contains("3 of 3"))
]

## Shooting Percentage after Success

In [None]:
# Follow-up attempts whose preceding shot was made: percentage and count per player.
made_previous = (
    winner_effect[winner_effect.shot_made_previous == 1]
    .groupby("player")["shot_made"]
    .agg(["mean", "size"])
    .rename(columns={"mean": "percentage_success", "size": "count_success"})
)
# only players with at least 200 such attempts
made_previous = made_previous[made_previous.count_success >= 200]

# add the overall figures from the shooting dataframe (inner join on the player index)
made_previous = pd.merge(made_previous, shooting, left_index=True, right_index=True)
made_previous.head(3)

In [None]:
made_previous.plot(kind="scatter", x="percentage", y="percentage_success", figsize=(8,5))

# Diagonal y = x, starting at the lowest overall percentage among the plotted players:
# points on it have the same percentage in both conditions.
worst_percentage = made_previous.percentage.min()
plt.plot([worst_percentage, 1], [worst_percentage, 1], color="red")

plt.title("Shooting Performance after Success", fontsize=20)
plt.xlabel("Shooting Percentage: Career")
# label typo fixed ("Succes" -> "Success")
plt.ylabel("Shooting Percentage: after Success")

Again, the red line is the line where there is no difference between the two conditions. One can clearly see that the great majority is above the line which means their shooting percentage goes up if they made their previous free throw. Hence, the winner effect holds true for free throw shooting.

Let's see who has the biggest boost in performance:

In [None]:
# Positive values: the player shoots better after a made shot than overall.
made_previous["difference"] = made_previous.percentage_success - made_previous.percentage

In [None]:
# Ten players with the largest positive difference.
made_previous.sort_values(by="difference", ascending=False).head(10)

## Shooting Percentage after Failure

Let's check if there is also an opposite to the winner effect.

In [None]:
# Follow-up attempts whose preceding shot was missed: percentage and count per player.
missed_previous = (
    winner_effect[winner_effect.shot_made_previous == 0]
    .groupby("player")["shot_made"]
    .agg(["mean", "size"])
    .rename(columns={"mean": "percentage_failure", "size": "count_failure"})
)
# only players with at least 200 such attempts
missed_previous = missed_previous[missed_previous.count_failure >= 200]

# add the overall figures from the shooting dataframe (inner join on the player index)
missed_previous = pd.merge(missed_previous, shooting, left_index=True, right_index=True)
missed_previous.head(3)

In [None]:
missed_previous.plot(kind="scatter", x="percentage", y="percentage_failure", figsize=(8,5))

# Diagonal y = x, starting at the lowest overall percentage among the plotted players:
# points on it have the same percentage in both conditions.
worst_percentage = missed_previous.percentage.min()
plt.plot([worst_percentage, 1], [worst_percentage, 1], color="red")

plt.title("Shooting Performance after Failure", fontsize=20)
plt.xlabel("Shooting Percentage: Career")
plt.ylabel("Shooting Percentage: after Failure")

Here, there seems to be no clear pattern. So, the opposite effect of the winner effect doesn't hold true.