In [None]:
# Install fastai straight from GitHub, pinned to one exact commit so the
# `fastai.structured` import used below resolves against a known revision.
!pip install git+https://github.com/fastai/fastai@2e1ccb58121dc648751e2109fc0fbf6925aa8887

In [None]:
# Refresh the apt index and install the libsm6 / libxext6 system packages.
# NOTE(review): presumably native dependencies pulled in by the fastai imports — confirm they are still required.
!apt update && apt install -y libsm6 libxext6

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from fastai.imports import *
from fastai.structured import *

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory
from pandas_summary import DataFrameSummary
from sklearn.ensemble import RandomForestRegressor,RandomForestClassifier
from sklearn import metrics

In [None]:
# List the files shipped with the competition dataset.
!ls ../input/nfl-big-data-bowl-2020/

# Read csv file

In [None]:
# Competition data directory and the training CSV that lives inside it.
path = Path('../input/nfl-big-data-bowl-2020/')
path_train_csv = path.joinpath('train.csv')

Setting `low_memory=False` makes pandas process the whole file at once instead of in chunks; this uses more memory but avoids mixed-type inference within a column.

In [None]:
# First, plain load of the training CSV (no date parsing yet), then preview the top rows.
df = pd.read_csv(path_train_csv,low_memory=False)
df.head()

List all column names

In [None]:
# Column index of the freshly loaded frame.
df.columns

Be creative with time-series data: express each timestamp in several different forms,
such as:

1. Year?
2. Month?
3. Week?
4. Day?
5. Elapsed... etc.

Please see the section below.

In [None]:
# Peek at TimeSnap as loaded by the plain read_csv call (no parse_dates requested).
df['TimeSnap'].head()

In [None]:
# Reload the CSV, this time asking pandas to parse TimeSnap as dates,
# and look at the same column again for comparison.
df = pd.read_csv(path_train_csv,low_memory=False,parse_dates=['TimeSnap'])
df['TimeSnap'].head()

A handy function that shows up to 1000 rows / columns instead of pandas' truncated `...` output.

In [None]:
def display_all(df, max_rows=1000, max_cols=1000):
    """Display `df` with pandas' row/column truncation limits temporarily raised.

    Args:
        df: object to show; it is handed unchanged to `display`.
        max_rows: value used for pandas' `display.max_rows` while rendering
            (defaults to 1000, the limit this helper always used).
        max_cols: value used for pandas' `display.max_columns` while rendering
            (defaults to 1000).

    The options are set through `pd.option_context`, so they only apply
    inside this call and the previous settings are restored afterwards.
    """
    with pd.option_context('display.max_rows', max_rows,
                           'display.max_columns', max_cols):
        display(df)

In [None]:
# Show the last rows transposed (one line per column) so every field is visible,
# then the overall (rows, columns) shape.
display_all(df.tail().T)
print(df.shape)

In [None]:
# Final load used by the rest of the notebook: both timestamp columns are parsed as dates.
df = pd.read_csv(path_train_csv,low_memory=False,parse_dates=['TimeSnap','TimeHandoff'])

In [None]:
# Same transposed tail view and shape, now for the frame with both date columns parsed.
display_all(df.tail().T)
print(df.shape)

In [None]:
# Summary statistics for every column (numeric and non-numeric), one row per column.
display_all(df.describe(include='all').T)

In [None]:
# Apply fastai's add_datepart to each timestamp column.
# NOTE(review): presumably this modifies `df` in place (the return value is not used here),
# expanding the datetime into derived date features and removing the original column —
# confirm against fastai.structured at the pinned commit. If so, this cell is not re-runnable
# without reloading `df`.
add_datepart(df,'TimeSnap')
add_datepart(df,'TimeHandoff')

In [None]:
# Inspect the frame again after add_datepart to see which columns it now holds.
display_all(df.tail().T)
print(df.shape)

In [None]:
# Share of missing values per column, ordered by column name.
null_counts = df.isnull().sum().sort_index()
display_all(null_counts / len(df))

Take a look at our dependent variable (the target, `Yards`)

In [None]:
# Distribution summary of Yards, the column later passed to proc_df as the target.
df['Yards'].describe()

# Continuous / Categorical values

In [None]:
# NOTE(review): fastai's train_cats presumably converts string columns of `df` to pandas
# categoricals in place (its return value is not used here) — confirm against fastai.structured.
train_cats(df)

In [None]:
# Re-inspect the frame after train_cats.
display_all(df.tail().T)
print(df.shape)

In [None]:
from sklearn.model_selection import GroupShuffleSplit

# Hold out 20% of the data, grouping by GameId; only the first of the
# generated splits is consumed.
splitter = GroupShuffleSplit(test_size=.2, n_splits=2, random_state=42)
train_idxs, valid_idxs = next(splitter.split(df, groups=df['GameId']))
df_train, df_valid = df.iloc[train_idxs], df.iloc[valid_idxs]

In [None]:
# Training frame: proc_df is given 'Yards' as the target field and returns the processed
# features, the target `y`, and `nas`.
# NOTE(review): `nas` is presumably the per-column fill values for missing data — confirm.
df_train_final,y,nas = proc_df(df_train,'Yards')
# Validation frame: the training `nas` is reused. No target field is passed here, so
# 'Yards' is still a column of df_valid_final; a later cell splits it off.
df_valid_final,_,_ = proc_df(df_valid,na_dict=nas)
df_train_final.shape,y.shape,df_valid_final.shape

In [None]:
# Number of distinct games on each side of the split.
df_train['GameId'].nunique(dropna=False), df_valid['GameId'].nunique(dropna=False)

# Check that no game appears in both train and validation

In [None]:
# True when no GameId occurs in both splits.
# (Series.equals only tests whether the two Series are identical element by element,
# which says nothing about overlap, so a set disjointness test is used instead.)
set(df_train['GameId']).isdisjoint(df_valid['GameId'])

# Train with small samples

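What `set_rf_samples` does: every time the random forest draws a sample for a tree, it takes a fixed number of rows from `df_train` with replacement (this text uses 50,000 as the example; the cell below actually passes 80,000).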

Therefore each tree is trained on a different sample (the default number of tree estimators is 10; the model below uses 40), and a given row is picked with a probability of ??? I'm not a math person, but you can figure it out.

Guess: (50000 / 407484) ** 10???

In [None]:
# NOTE(review): per the notes above this sets how many rows each tree samples; the value
# here is 80000 while the text uses 50,000 as its example — confirm which is intended.
set_rf_samples(80000)

In [None]:
model_first = RandomForestRegressor(n_estimators=40, min_samples_leaf=3,n_jobs=-1)
%time model_first.fit(df_train_final,y)

In [None]:
# Shape of the processed validation frame before the target column is separated out.
df_valid_final.shape

In [None]:
# Separate the target from the validation features.
# The guard makes the cell safe to re-run: once 'Yards' has been removed, a second
# execution leaves y_valid / df_valid_final untouched instead of raising KeyError.
if 'Yards' in df_valid_final.columns:
    y_valid = df_valid_final['Yards']
    # Rebind to a new frame rather than dropping in place.
    df_valid_final = df_valid_final.drop(['Yards'], axis=1)
df_valid_final.shape,y_valid.shape

In [None]:
# Predictions of the fitted forest on the training features and on the validation features.
preds = model_first.predict(df_train_final)
preds_valid = model_first.predict(df_valid_final)

In [None]:
# One row per training target, one column per yardage value -99..99 (199 columns).
# Entry [i, j] is 1.0 when (j - 99) >= y[i], else 0.0 — i.e. a 0/1 step that switches
# on at the observed value. Built with a single broadcast comparison instead of a
# nested Python loop over every (row, column) cell.
yard_grid = np.arange(-99, 100)
y_ans = (yard_grid[None, :] >= np.asarray(y)[:, None]).astype(np.float64)

In [None]:
# Histogram of the training predictions over 199 equal-width bins spanning -99..100,
# normalised as a density, then accumulated into a running total.
train_density, train_bin_edges = np.histogram(preds, bins=199, range=(-99, 100), density=True)
train_cdf = np.cumsum(train_density)

In [None]:
# Mean squared difference between train_cdf and the 0/1 step rows of y_ans,
# averaged over all 199 columns and all training rows.
# NOTE(review): train_cdf is a single 199-long vector broadcast against every row
# of y_ans, so the same curve is scored for every row — confirm this is intended.
train_score = np.sum(np.power(train_cdf - y_ans, 2)) / (199 * len(df_train_final))
print("Train score:", train_score)

In [None]:
# Same construction as for training, applied to the validation predictions:
# 199-bin density histogram over -99..100, accumulated into a running total.
valid_density, valid_bin_edges = np.histogram(preds_valid, bins=199, range=(-99, 100), density=True)
valid_cdf = np.cumsum(valid_density)

In [None]:
# One row per validation target, one column per yardage value -99..99 (199 columns).
# Entry [i, j] is 1.0 when (j - 99) >= y_valid[i], else 0.0. Built with a single
# broadcast comparison instead of a nested Python loop over every (row, column) cell.
yard_grid_valid = np.arange(-99, 100)
y_ans_valid = (yard_grid_valid[None, :] >= np.asarray(y_valid)[:, None]).astype(np.float64)

In [None]:
# Mean squared difference between valid_cdf and the 0/1 step rows of y_ans_valid,
# averaged over all 199 columns and all validation rows.
# NOTE(review): valid_cdf is a single 199-long vector broadcast against every row
# of y_ans_valid, so the same curve is scored for every row — confirm this is intended.
valid_score = np.sum(np.power(valid_cdf - y_ans_valid, 2)) / (199 * len(df_valid_final))
print("Valid score:", valid_score)

In [None]:
# Feature-importance table for the fitted forest.
# NOTE(review): later cells read its 'cols' and 'imp' columns and take the first 15 rows
# as the top features, so it is presumably sorted by importance — confirm.
fi = rf_feat_importance(model_first,df_train_final)
fi

In [None]:
# Keep only the rows whose 'imp' value exceeds 0.01.
fi[fi['imp']>0.01]

In [None]:
# Horizontal bar chart of the first 15 rows of the importance table.
fi.head(15).plot(x='cols', y='imp', kind='barh', figsize=(12, 7))

In [None]:
# Summary statistics of GameClock as it appears in the processed training features.
df_train_final['GameClock'].describe()

In [None]:
# Processed GameClock feature against the target; axes are labelled so the
# figure can be read on its own.
plt.scatter(df_train_final['GameClock'],y)
plt.xlabel('GameClock')
plt.ylabel('Yards')

In [None]:
# Processed Distance feature against the target; axes are labelled so the
# figure can be read on its own.
plt.scatter(df_train_final['Distance'],y)
plt.xlabel('Distance')
plt.ylabel('Yards')

In [None]:
# Smallest and largest GameClock value in the full (pre-proc_df) frame.
df['GameClock'].min(),df['GameClock'].max()

In [None]:
# Distinct GameClock values in the full frame.
df['GameClock'].unique()