In [82]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from scipy import stats
from utility import pval_pearson_coef, print_column_names

In [85]:
# Load the plays, players and weekly (week1-week3) CSV files from datafiles/
plays_df = pd.read_csv("datafiles/plays.csv")
players_df = pd.read_csv("datafiles/players.csv")
week1_df, week2_df, week3_df = (
    pd.read_csv("datafiles/week1.csv"),
    pd.read_csv("datafiles/week2.csv"),
    pd.read_csv("datafiles/week3.csv"),
)

# Report (rows, columns) for every frame that was just loaded
loaded_frames = (week1_df, week2_df, week3_df, plays_df, players_df)
for frame in loaded_frames:
    print(frame.shape)

(986022, 19)
(1231793, 19)
(1168345, 19)
(19239, 27)
(1303, 7)


In [86]:
# Join play data with weekly data.
# NOTE(review): DataFrame.join without `on=` aligns rows by index label, so row i
# of a weekly frame is paired with row i of plays_df, and weekly rows whose index
# has no counterpart in plays_df get NaN in every play column. This was presumably
# meant to be a key-based merge - confirm the shared key columns and switch to
# pd.merge(..., on=[...]) if so.
week1plays_df = week1_df.join(plays_df, lsuffix="_left")
week2plays_df = week2_df.join(plays_df, lsuffix="_left")
week3plays_df = week3_df.join(plays_df, lsuffix="_left")

# Report (rows, columns) of each joined frame.
# (A stray `print("{}".f)` used to follow this print; it raised AttributeError
# because str has no attribute `f`, so it has been removed.)
for joined_frame in [week1plays_df, week2plays_df, week3plays_df]:
    print("{}".format(joined_frame.shape))

(986022, 46)
(1231793, 46)
(1168345, 46)


In [87]:
# Stack the three joined weekly frames on top of each other
weekly_frames = [week1plays_df, week2plays_df, week3plays_df]
df = pd.concat(weekly_frames)

# Display the combined frame, hiding columns that contain no data at all
# (df itself is left unchanged; only the displayed view is filtered)
columns_with_data = df.notna().any()
df.loc[:, columns_with_data]

Unnamed: 0,time,x,y,s,a,dis,o,dir,event,nflId,...,preSnapHomeScore,gameClock,absoluteYardlineNumber,penaltyCodes,penaltyJerseyNumbers,passResult,offensePlayResult,playResult,epa,isDefensivePI
0,2018-09-07T01:07:14.599Z,91.73,26.67,0.00,0.01,0.02,289.57,240.93,,310.0,...,0.0,15:00:00,90.0,,,C,10.0,10.0,0.261827,False
1,2018-09-07T01:07:14.599Z,88.89,36.47,0.01,0.01,0.01,105.63,66.66,,79848.0,...,0.0,13:10:00,49.0,,,I,0.0,0.0,-0.372360,False
2,2018-09-07T01:07:14.599Z,91.35,44.16,0.02,0.03,0.01,290.45,16.86,,2495454.0,...,0.0,13:05:00,49.0,,,I,0.0,0.0,-0.702779,False
3,2018-09-07T01:07:14.599Z,86.31,22.01,0.09,0.42,0.01,70.12,168.91,,2495613.0,...,0.0,13:01:00,49.0,,,C,33.0,33.0,3.047530,False
4,2018-09-07T01:07:14.599Z,90.78,36.15,0.00,0.00,0.00,257.61,193.97,,2533040.0,...,0.0,10:59:00,11.0,,,I,0.0,0.0,-0.842272,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1168340,2018-09-25T03:28:10.099Z,61.56,35.42,3.72,2.12,0.38,173.25,155.07,,2557978.0,...,,,,,,,,,,
1168341,2018-09-25T03:28:10.099Z,59.80,30.25,4.56,4.01,0.47,47.07,44.73,,2558149.0,...,,,,,,,,,,
1168342,2018-09-25T03:28:10.099Z,48.49,6.31,3.23,1.01,0.32,65.02,54.59,,2560930.0,...,,,,,,,,,,
1168343,2018-09-25T03:28:10.099Z,74.81,46.15,3.31,2.33,0.34,171.25,118.36,,2561039.0,...,,,,,,,,,,


In [None]:
# Pass the combined frame to the project helper imported from `utility`.
# NOTE(review): presumably prints the frame's column names; the exact output
# format is defined in the helper, not in this notebook.
print_column_names(df)

In [None]:
# Keep only the columns used by the analysis cells below
analysis_columns = [
    "x", "y", "s", "a", "dis", "nflId",
    "defendersInTheBox", "numberOfPassRushers",
    "passResult", "playResult", "epa",
]
cleaned_df = df[analysis_columns]

# Pairwise correlations of the numeric columns only: passResult holds strings
# (e.g. "C", "I"), and DataFrame.corr() raises ValueError on non-numeric
# columns in pandas >= 2.0 instead of silently skipping them.
cleaned_df.select_dtypes(include="number").corr()

In [None]:
# Correlation matrices grouped by the sign of playResult (in yards).
# Rows are filtered first, then restricted to numeric columns: cleaned_df also
# carries the string column passResult, which makes DataFrame.corr() raise
# ValueError in pandas >= 2.0. Each matrix gets its group name appended to the
# column labels so the printed tables can be told apart.

stats_positive_gain = (
    cleaned_df[cleaned_df["playResult"] > 0]
    .select_dtypes(include="number")
    .corr()
    .rename(columns=lambda name: name + " positive gain")
)

stats_negative_gain = (
    cleaned_df[cleaned_df["playResult"] < 0]
    .select_dtypes(include="number")
    .corr()
    .rename(columns=lambda name: name + " negative gain")
)

stats_zero_gain = (
    cleaned_df[cleaned_df["playResult"] == 0]
    .select_dtypes(include="number")
    .corr()
    .rename(columns=lambda name: name + " zero gain")
)

for corr_results in [stats_positive_gain, stats_negative_gain, stats_zero_gain]:
    print(corr_results)

In [None]:
# Correlation matrices grouped by passResult code ("C", "I", "S", "IN").
# Rows are filtered on the string column first, then restricted to numeric
# columns before corr(): DataFrame.corr() raises ValueError on non-numeric
# columns in pandas >= 2.0. Each matrix gets its group name appended to the
# column labels so the printed tables can be told apart.

pass_result = cleaned_df["passResult"]

stats_completion = (
    cleaned_df[pass_result == "C"]
    .select_dtypes(include="number")
    .corr()
    .rename(columns=lambda name: name + " Completion")
)

stats_incompletion = (
    cleaned_df[pass_result == "I"]
    .select_dtypes(include="number")
    .corr()
    .rename(columns=lambda name: name + " Incompletion")
)

stats_sack = (
    cleaned_df[pass_result == "S"]
    .select_dtypes(include="number")
    .corr()
    .rename(columns=lambda name: name + " Sack")
)

stats_interception = (
    cleaned_df[pass_result == "IN"]
    .select_dtypes(include="number")
    .corr()
    .rename(columns=lambda name: name + " Interception")
)

for corr_results in [stats_completion, stats_incompletion, stats_sack, stats_interception]:
    print(corr_results)

In [None]:
# Histograms of x, y and playResult, one figure per column.
# Series.plot() draws onto the current axes when a figure is already open, so
# calling it three times in one cell stacked all three histograms on the same
# axes and each title/label overwrote the previous one. Every histogram now
# gets its own explicit figure and axes.

plt.style.use("ggplot")

histogram_specs = [
    # (column, title, x-axis label)
    ("x", "Distribution of X", "X Coordinates"),
    ("y", "Distribution of Y", "Y Coordinate"),
    ("playResult", "Distribution of playResult", "playResult in Yards"),
]

for column, title, xlabel in histogram_specs:
    fig, ax = plt.subplots(figsize=(10, 7))
    cleaned_df[column].plot(kind="hist", color="blue", edgecolor="black", ax=ax)
    ax.set_title(title, size=24)
    ax.set_xlabel(xlabel, size=18)
    ax.set_ylabel("Frequency", size=18)

plt.show()

In [None]:
# Scatter plot of playResult against the x coordinate.
# The title previously read just "Relationship between"; it now names both
# plotted columns. Explicit axes keep the labels attached to this figure.

fig, ax = plt.subplots(figsize=(10, 7))
cleaned_df.plot(kind="scatter", x="x", y="playResult", color="blue", ax=ax)
ax.set_title("Relationship between x and playResult", size=24)
ax.set_xlabel("x", size=18)
ax.set_ylabel("playResult", size=18)
plt.show()

In [None]:
# Call the project helper imported from `utility` on the x and y columns.
# NOTE(review): the name suggests it reports a Pearson correlation coefficient
# and its p-value - confirm against the helper's definition.
pval_pearson_coef(cleaned_df.x, cleaned_df.y)

In [None]:
# Count the rows whose playResult is missing.
# pandas stores missing values in this column as NaN, and `NaN == None` is
# False, so an element-by-element `== None` test never matches anything.
# isna() detects both NaN and None, and is vectorised rather than a Python loop
# over every row.
cleaned_df["playResult"].isna().sum()

In [None]:
# Preview the first 50 playResult values
cleaned_df["playResult"].head(50)

In [None]:
# Number of rows for each distinct playResult value (missing values are excluded)
rows_by_play_result = cleaned_df.groupby("playResult")
rows_by_play_result["playResult"].count()