In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import matthews_corrcoef
from xgboost import XGBClassifier

In [2]:
pd.set_option("display.max_columns",100)
pd.set_option("display.max_rows",1000)
pd.options.display.max_seq_items = 2000

In [3]:
df_submission = pd.read_csv('sample_submission.csv')
data_track = pd.read_csv('test_player_tracking.csv')

In [4]:
data_track = data_track.drop(['game_key','play_id','datetime'],axis=1)

In [5]:
df_submission.head()

Unnamed: 0,contact_id,contact
0,58168_003392_0_38590_43854,0
1,58168_003392_0_38590_41257,0
2,58168_003392_0_38590_41944,0
3,58168_003392_0_38590_42386,0
4,58168_003392_0_38590_47944,0


In [6]:
# df_submission['game_key'] = df_submission.contact_id.str.split("_", expand=True)[0]

In [7]:
# df_submission['play_id'] = df_submission.contact_id.str.split("_", expand=True)[1]

In [6]:
df_submission['game_play'] = df_submission.contact_id.str.split("_", expand=True)[0] \
                             + '_' \
                             + df_submission.contact_id.str.split("_", expand=True)[1]

In [7]:
df_submission['step'] = df_submission.contact_id.str.split("_", expand=True)[2]

In [8]:
df_submission['nfl_player_id_1'] = df_submission.contact_id.str.split("_", expand=True)[3]

In [9]:
df_submission['nfl_player_id_2'] = df_submission.contact_id.str.split("_", expand=True)[4]

In [10]:
df_submission.head()

Unnamed: 0,contact_id,contact,game_play,step,nfl_player_id_1,nfl_player_id_2
0,58168_003392_0_38590_43854,0,58168_003392,0,38590,43854
1,58168_003392_0_38590_41257,0,58168_003392,0,38590,41257
2,58168_003392_0_38590_41944,0,58168_003392,0,38590,41944
3,58168_003392_0_38590_42386,0,58168_003392,0,38590,42386
4,58168_003392_0_38590_47944,0,58168_003392,0,38590,47944


### Data Merging

In [11]:
data_track.dtypes

game_play         object
nfl_player_id      int64
step               int64
team              object
position          object
jersey_number      int64
x_position       float64
y_position       float64
speed            float64
distance         float64
direction        float64
orientation      float64
acceleration     float64
sa               float64
dtype: object

In [12]:
data_track['nfl_player_id'] = data_track['nfl_player_id'].astype('str')
# data_track['game_key'] = data_track['game_key'].astype('str')
# data_track['play_id'] = data_track['play_id'].astype('str')
data_track['step'] = data_track['step'].astype('str')

In [13]:
data_track.dtypes

game_play         object
nfl_player_id     object
step              object
team              object
position          object
jersey_number      int64
x_position       float64
y_position       float64
speed            float64
distance         float64
direction        float64
orientation      float64
acceleration     float64
sa               float64
dtype: object

In [14]:
df_submission.dtypes

contact_id         object
contact             int64
game_play          object
step               object
nfl_player_id_1    object
nfl_player_id_2    object
dtype: object

In [15]:
df_submission['nfl_player_id_1'] = df_submission['nfl_player_id_1'].astype('str')
df_submission['nfl_player_id_2'] = df_submission['nfl_player_id_2'].astype('str')
# df_submission['game_key'] = df_submission['game_key'].astype('str')
# df_submission['play_id'] = df_submission['play_id'].astype('str')
df_submission['step'] = df_submission['step'].astype('str')

In [16]:
df_submission.dtypes

contact_id         object
contact             int64
game_play          object
step               object
nfl_player_id_1    object
nfl_player_id_2    object
dtype: object

In [17]:
df_submission.shape

(49588, 6)

In [18]:
data_train = df_submission.merge(data_track, left_on = ['game_play', 'nfl_player_id_1', 'step'], \
                 right_on = ['game_play', 'nfl_player_id', 'step'], how ='left',suffixes=[None,'_id_1'])

In [19]:
data_train.head()

Unnamed: 0,contact_id,contact,game_play,step,nfl_player_id_1,nfl_player_id_2,nfl_player_id,team,position,jersey_number,x_position,y_position,speed,distance,direction,orientation,acceleration,sa
0,58168_003392_0_38590_43854,0,58168_003392,0,38590,43854,38590,home,G,70,40.33,25.28,0.52,0.06,141.08,100.37,0.59,0.58
1,58168_003392_0_38590_41257,0,58168_003392,0,38590,41257,38590,home,G,70,40.33,25.28,0.52,0.06,141.08,100.37,0.59,0.58
2,58168_003392_0_38590_41944,0,58168_003392,0,38590,41944,38590,home,G,70,40.33,25.28,0.52,0.06,141.08,100.37,0.59,0.58
3,58168_003392_0_38590_42386,0,58168_003392,0,38590,42386,38590,home,G,70,40.33,25.28,0.52,0.06,141.08,100.37,0.59,0.58
4,58168_003392_0_38590_47944,0,58168_003392,0,38590,47944,38590,home,G,70,40.33,25.28,0.52,0.06,141.08,100.37,0.59,0.58


In [20]:
data_train.shape

(49588, 18)

In [21]:
data_train.isna().sum()

contact_id         0
contact            0
game_play          0
step               0
nfl_player_id_1    0
nfl_player_id_2    0
nfl_player_id      0
team               0
position           0
jersey_number      0
x_position         0
y_position         0
speed              0
distance           0
direction          0
orientation        0
acceleration       0
sa                 0
dtype: int64

In [22]:
# data_train = data_train.merge(data_track, left_on = ['game_play', 'nfl_player_id_2','step'], \
#                  right_on = ['game_play', 'nfl_player_id', 'step'], how ='left',suffixes=['_id_1','_id_2'])

In [23]:
data_train.isna().sum()

contact_id         0
contact            0
game_play          0
step               0
nfl_player_id_1    0
nfl_player_id_2    0
nfl_player_id      0
team               0
position           0
jersey_number      0
x_position         0
y_position         0
speed              0
distance           0
direction          0
orientation        0
acceleration       0
sa                 0
dtype: int64

In [24]:
data_train.head()

Unnamed: 0,contact_id,contact,game_play,step,nfl_player_id_1,nfl_player_id_2,nfl_player_id,team,position,jersey_number,x_position,y_position,speed,distance,direction,orientation,acceleration,sa
0,58168_003392_0_38590_43854,0,58168_003392,0,38590,43854,38590,home,G,70,40.33,25.28,0.52,0.06,141.08,100.37,0.59,0.58
1,58168_003392_0_38590_41257,0,58168_003392,0,38590,41257,38590,home,G,70,40.33,25.28,0.52,0.06,141.08,100.37,0.59,0.58
2,58168_003392_0_38590_41944,0,58168_003392,0,38590,41944,38590,home,G,70,40.33,25.28,0.52,0.06,141.08,100.37,0.59,0.58
3,58168_003392_0_38590_42386,0,58168_003392,0,38590,42386,38590,home,G,70,40.33,25.28,0.52,0.06,141.08,100.37,0.59,0.58
4,58168_003392_0_38590_47944,0,58168_003392,0,38590,47944,38590,home,G,70,40.33,25.28,0.52,0.06,141.08,100.37,0.59,0.58


In [25]:
# data_train = data_train.drop(['nfl_player_id_id_1','nfl_player_id_id_2'],axis=1)

### Feature Engineering

In [26]:
data_track_eng = data_track[['game_play', 'nfl_player_id', 'step'
                            ,'x_position', 'y_position', 'speed', 'distance', 'direction', 'orientation'
                            ,'acceleration', 'sa']]

In [27]:
data_track_eng = data_track_eng.sort_values(['game_play', 'nfl_player_id', 'step'])

In [28]:
features = ['x_position', 'y_position', 'speed', 'distance', 'direction',\
                  'orientation', 'acceleration', 'sa']

features_lag1 = ['x_position_lag1', 'y_position_lag1', 'speed_lag1',\
                'distance_lag1', 'direction_lag1', 'orientation_lag1',\
                'acceleration_lag1', 'sa_lag1']

features_lag2 = ['x_position_lag2', 'y_position_lag2', 'speed_lag2',\
                'distance_lag2', 'direction_lag2', 'orientation_lag2',\
                'acceleration_lag2', 'sa_lag2']

In [29]:
data_track_eng[features_lag1] = \
                data_track_eng.groupby(['game_play', 'nfl_player_id'])\
                [features].shift(1)

In [30]:
data_track_eng[features_lag2] = \
                data_track_eng.groupby(['game_play', 'nfl_player_id'])\
                [features].shift(2)

In [31]:
feats_diff_within = ['speed','distance', 'direction', 'orientation', 'acceleration', 'sa']

for feats in feats_diff_within:
    data_track_eng[feats + '_diff_w' + '_lag1' + '_lag2'] = data_track_eng[feats+'_lag1'] - data_track_eng[feats+'_lag2']
    data_track_eng[feats + '_diff_w' + '_lag1'] = data_track_eng[feats] - data_track_eng[feats+'_lag1']

In [32]:
# columns to keep

keys = ['game_play', 'nfl_player_id', 'step']

feats_diff_w_cols = [col + '_diff_w' + '_lag1' + '_lag2' for col in feats_diff_within] + \
                    [col + '_diff_w' + '_lag1' for col in feats_diff_within]

feats_lag = ['x_position_lag1', 'y_position_lag1', 'x_position_lag2', 'y_position_lag2']

all_cols = keys + feats_diff_w_cols + feats_lag

data_track_eng[all_cols]

Unnamed: 0,game_play,nfl_player_id,step,speed_diff_w_lag1_lag2,distance_diff_w_lag1_lag2,direction_diff_w_lag1_lag2,orientation_diff_w_lag1_lag2,acceleration_diff_w_lag1_lag2,sa_diff_w_lag1_lag2,speed_diff_w_lag1,distance_diff_w_lag1,direction_diff_w_lag1,orientation_diff_w_lag1,acceleration_diff_w_lag1,sa_diff_w_lag1,x_position_lag1,y_position_lag1,x_position_lag2,y_position_lag2
12815,58168_003392,37084,-1,,,,,,,,,,,,,,,,
12624,58168_003392,37084,-10,,,,,,,-0.16,-0.02,-39.17,6.59,-0.68,-0.66,41.96,20.10,,
10637,58168_003392,37084,-100,-0.16,-0.02,-39.17,6.59,-0.68,-0.66,0.33,0.04,63.94,-10.75,0.47,0.06,42.01,20.22,41.96,20.10
10613,58168_003392,37084,-101,0.33,0.04,63.94,-10.75,0.47,0.06,-0.07,-0.01,-13.76,0.00,-0.07,-0.20,43.12,20.05,42.01,20.22
10593,58168_003392,37084,-102,-0.07,-0.01,-13.76,0.00,-0.07,-0.20,-0.01,-0.01,-4.79,1.64,0.08,-0.02,43.18,20.05,43.12,20.05
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8086,58172_003247,52939,95,0.06,0.01,-1.53,4.54,-0.41,-0.07,-0.01,0.00,-1.30,-0.90,-0.28,-0.21,45.69,16.49,46.03,17.02
8103,58172_003247,52939,96,-0.01,0.00,-1.30,-0.90,-0.28,-0.21,0.03,0.00,-0.32,0.00,-0.38,-0.09,45.37,15.96,45.69,16.49
8137,58172_003247,52939,97,0.03,0.00,-0.32,0.00,-0.38,-0.09,-0.04,0.00,-0.61,0.00,-0.03,-0.20,45.04,15.41,45.37,15.96
8146,58172_003247,52939,98,-0.04,0.00,-0.61,0.00,-0.03,-0.20,0.02,0.00,-0.99,1.89,-0.06,0.05,44.73,14.87,45.04,15.41


In [33]:
# merging for player 1

feats_cols_id_1 = feats_diff_w_cols + feats_lag

feats_cols_id_1 = [feats + '_id_1' for feats in feats_cols_id_1]

new_cols_name = keys + feats_cols_id_1

data_track_eng_id_1 = data_track_eng[all_cols]

data_track_eng_id_1.columns = new_cols_name

data_track_eng_id_1.head()

Unnamed: 0,game_play,nfl_player_id,step,speed_diff_w_lag1_lag2_id_1,distance_diff_w_lag1_lag2_id_1,direction_diff_w_lag1_lag2_id_1,orientation_diff_w_lag1_lag2_id_1,acceleration_diff_w_lag1_lag2_id_1,sa_diff_w_lag1_lag2_id_1,speed_diff_w_lag1_id_1,distance_diff_w_lag1_id_1,direction_diff_w_lag1_id_1,orientation_diff_w_lag1_id_1,acceleration_diff_w_lag1_id_1,sa_diff_w_lag1_id_1,x_position_lag1_id_1,y_position_lag1_id_1,x_position_lag2_id_1,y_position_lag2_id_1
12815,58168_003392,37084,-1,,,,,,,,,,,,,,,,
12624,58168_003392,37084,-10,,,,,,,-0.16,-0.02,-39.17,6.59,-0.68,-0.66,41.96,20.1,,
10637,58168_003392,37084,-100,-0.16,-0.02,-39.17,6.59,-0.68,-0.66,0.33,0.04,63.94,-10.75,0.47,0.06,42.01,20.22,41.96,20.1
10613,58168_003392,37084,-101,0.33,0.04,63.94,-10.75,0.47,0.06,-0.07,-0.01,-13.76,0.0,-0.07,-0.2,43.12,20.05,42.01,20.22
10593,58168_003392,37084,-102,-0.07,-0.01,-13.76,0.0,-0.07,-0.2,-0.01,-0.01,-4.79,1.64,0.08,-0.02,43.18,20.05,43.12,20.05


In [38]:
# # merging for player 2

# feats_cols_id_2 = feats_diff_w_cols + feats_lag

# feats_cols_id_2 = [feats + '_id_2' for feats in feats_cols_id_2]

# new_cols_name = keys + feats_cols_id_2

# data_track_eng_id_2 = data_track_eng[all_cols]

# data_track_eng_id_2.columns = new_cols_name

# data_track_eng_id_2.head()

Unnamed: 0,game_play,nfl_player_id,step,speed_diff_w_lag1_lag2_id_2,distance_diff_w_lag1_lag2_id_2,direction_diff_w_lag1_lag2_id_2,orientation_diff_w_lag1_lag2_id_2,acceleration_diff_w_lag1_lag2_id_2,sa_diff_w_lag1_lag2_id_2,speed_diff_w_lag1_id_2,distance_diff_w_lag1_id_2,direction_diff_w_lag1_id_2,orientation_diff_w_lag1_id_2,acceleration_diff_w_lag1_id_2,sa_diff_w_lag1_id_2,x_position_lag1_id_2,y_position_lag1_id_2,x_position_lag2_id_2,y_position_lag2_id_2
12815,58168_003392,37084,-1,,,,,,,,,,,,,,,,
12624,58168_003392,37084,-10,,,,,,,-0.16,-0.02,-39.17,6.59,-0.68,-0.66,41.96,20.1,,
10637,58168_003392,37084,-100,-0.16,-0.02,-39.17,6.59,-0.68,-0.66,0.33,0.04,63.94,-10.75,0.47,0.06,42.01,20.22,41.96,20.1
10613,58168_003392,37084,-101,0.33,0.04,63.94,-10.75,0.47,0.06,-0.07,-0.01,-13.76,0.0,-0.07,-0.2,43.12,20.05,42.01,20.22
10593,58168_003392,37084,-102,-0.07,-0.01,-13.76,0.0,-0.07,-0.2,-0.01,-0.01,-4.79,1.64,0.08,-0.02,43.18,20.05,43.12,20.05


In [34]:
data_train = data_train.merge(data_track_eng_id_1, left_on=['game_play', 'nfl_player_id_1', 'step']\
                 , right_on=keys, how='left')

In [40]:
# data_train = data_train.merge(data_track_eng_id_2, left_on=['game_play', 'nfl_player_id_2', 'step']\
#                  , right_on=keys, how='left')

### P-G Predictions

In [35]:
data_train_pg = data_train[data_train.nfl_player_id_2 =='G']

In [39]:
target = ['contact']

features_current_id_1 = ['speed', 'distance', 'direction', 'orientation', 'acceleration', \
                         'sa']

# features_current_id_2 = ['speed_id_2', 'distance_id_2', 'direction_id_2', 'orientation_id_2', 'acceleration_id_2', \
#                          'sa_id_2']


features_lag_id_1 = ['speed_diff_w_lag1_lag2_id_1', 'distance_diff_w_lag1_lag2_id_1', 'direction_diff_w_lag1_lag2_id_1',\
                     'orientation_diff_w_lag1_lag2_id_1', 'acceleration_diff_w_lag1_lag2_id_1', \
                     'sa_diff_w_lag1_lag2_id_1', 'speed_diff_w_lag1_id_1', 'distance_diff_w_lag1_id_1', \
                     'direction_diff_w_lag1_id_1', 'orientation_diff_w_lag1_id_1', 'acceleration_diff_w_lag1_id_1', \
                     'sa_diff_w_lag1_id_1']

# features_lag_id_2 = ['speed_diff_w_lag1_lag2_id_2', 'distance_diff_w_lag1_lag2_id_2', 'direction_diff_w_lag1_lag2_id_2',\
#                      'orientation_diff_w_lag1_lag2_id_2', 'acceleration_diff_w_lag1_lag2_id_2', \
#                      'sa_diff_w_lag1_lag2_id_2', 'speed_diff_w_lag1_id_2', 'distance_diff_w_lag1_id_2', \
#                      'direction_diff_w_lag1_id_2', 'orientation_diff_w_lag1_id_2', 'acceleration_diff_w_lag1_id_2', \
#                      'sa_diff_w_lag1_id_2']

# features_dist = ['dist_lag2', 'dist_lag1', 'dist', 'dist_lag1_lag2', 'dist_current_lag1']

all_cols = features_current_id_1 + features_lag_id_1 + target

In [40]:
#For Kaggle
# The predictions would be put in contact column in this data frame
data_train_pg.to_csv("train_with_contact_id_pg.csv",index=False)

In [41]:
data_train_pg_m = data_train_pg[all_cols]

### Load the training data

In [42]:
df_train_preprocessed = pd.read_csv('training_pg.csv')

### Train XGB

In [43]:
X = df_train_preprocessed.drop('contact', axis=1)
y = df_train_preprocessed['contact']

In [44]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

In [45]:
model = XGBClassifier()
model.fit(X_train, y_train)

XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=None, num_parallel_tree=None,
              predictor=None, random_state=None, ...)

In [46]:
y_pred_xgb = model.predict(X_test)

In [47]:
print(confusion_matrix(y_test, y_pred_xgb))
print(classification_report(y_test, y_pred_xgb))

[[78377   401]
 [ 2762   587]]
              precision    recall  f1-score   support

           0       0.97      0.99      0.98     78778
           1       0.59      0.18      0.27      3349

    accuracy                           0.96     82127
   macro avg       0.78      0.59      0.63     82127
weighted avg       0.95      0.96      0.95     82127



In [48]:
print(confusion_matrix(y_test, y_pred_xgb))
print(classification_report(y_test, y_pred_xgb))

[[78377   401]
 [ 2762   587]]
              precision    recall  f1-score   support

           0       0.97      0.99      0.98     78778
           1       0.59      0.18      0.27      3349

    accuracy                           0.96     82127
   macro avg       0.78      0.59      0.63     82127
weighted avg       0.95      0.96      0.95     82127



In [49]:
matthews_corrcoef(y_test, y_pred_xgb)

0.30873825856126463

### Probability cutoff to be set at 0.35

In [50]:
probs = model.predict_proba(X_test)

In [51]:
y_pred_035 = np.where(probs >0.35, 1,0)[:,1]

In [52]:
matthews_corrcoef(y_test, y_pred_035)

0.376961231492207

### Predicting on Data

#### Train on complete data

In [53]:
model_complete = XGBClassifier()
model_complete.fit(X, y)

XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=None, num_parallel_tree=None,
              predictor=None, random_state=None, ...)

In [54]:
data_predict = data_train_pg_m

In [55]:
data_predict.columns

Index(['speed', 'distance', 'direction', 'orientation', 'acceleration', 'sa',
       'speed_diff_w_lag1_lag2_id_1', 'distance_diff_w_lag1_lag2_id_1',
       'direction_diff_w_lag1_lag2_id_1', 'orientation_diff_w_lag1_lag2_id_1',
       'acceleration_diff_w_lag1_lag2_id_1', 'sa_diff_w_lag1_lag2_id_1',
       'speed_diff_w_lag1_id_1', 'distance_diff_w_lag1_id_1',
       'direction_diff_w_lag1_id_1', 'orientation_diff_w_lag1_id_1',
       'acceleration_diff_w_lag1_id_1', 'sa_diff_w_lag1_id_1', 'contact'],
      dtype='object')

In [56]:
[True for x,y in zip(data_predict.drop('contact',axis =1).columns,data_predict.columns) if x==y]

[True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True]

In [57]:
### Writing out data_predict for kaggle

data_predict.to_csv('data_predict_pg.csv',index=False)
data_predict = pd.read_csv('data_predict_pg.csv')

In [58]:
probs_predict = model_complete.predict_proba(data_predict.drop('contact',axis =1))

In [59]:
data_predict['contact'] = np.where(probs_predict >0.35, 1,0)[:,1]

In [60]:
data_predict

Unnamed: 0,speed,distance,direction,orientation,acceleration,sa,speed_diff_w_lag1_lag2_id_1,distance_diff_w_lag1_lag2_id_1,direction_diff_w_lag1_lag2_id_1,orientation_diff_w_lag1_lag2_id_1,acceleration_diff_w_lag1_lag2_id_1,sa_diff_w_lag1_lag2_id_1,speed_diff_w_lag1_id_1,distance_diff_w_lag1_id_1,direction_diff_w_lag1_id_1,orientation_diff_w_lag1_id_1,acceleration_diff_w_lag1_id_1,sa_diff_w_lag1_id_1,contact
0,0.52,0.06,141.08,100.37,0.59,0.58,0.06,0.01,3.16,0.00,0.17,-0.16,0.09,0.01,81.07,22.47,-0.23,1.30,1
1,0.74,0.06,263.92,294.74,1.74,1.74,-0.09,-0.01,13.12,2.70,-0.13,-0.23,0.27,0.02,54.80,13.36,0.64,1.08,0
2,0.67,0.07,180.38,270.88,0.34,0.08,0.13,0.02,0.97,-1.38,-0.16,0.16,-0.47,-0.05,161.03,26.01,-0.47,0.88,1
3,0.68,0.08,234.17,282.07,0.81,0.81,-0.03,-0.01,-32.17,-1.58,-0.04,-0.16,0.62,0.07,107.00,-1.97,0.68,0.91,0
4,0.61,0.06,221.19,269.45,1.11,1.10,-0.05,0.00,4.01,-3.06,0.07,0.10,0.21,0.02,208.74,-11.72,0.60,0.61,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4307,1.98,0.20,245.00,255.31,2.07,-0.76,-0.07,0.00,4.83,3.88,-0.31,0.17,-0.14,-0.02,8.05,5.89,0.13,-0.01,0
4308,1.41,0.15,163.22,185.42,0.75,-0.58,-0.15,-0.02,6.18,4.56,0.57,-0.42,0.00,0.01,0.05,1.12,-0.19,0.20,0
4309,2.29,0.24,211.73,215.11,2.43,-2.43,-0.28,-0.02,-0.05,0.00,0.00,0.00,-0.25,-0.03,-0.66,0.00,-0.11,0.11,0
4310,1.51,0.15,141.48,296.90,0.74,0.71,0.20,0.01,-0.33,7.50,0.38,0.68,0.09,0.02,3.60,-2.92,0.11,0.10,0


In [61]:
data_train_pg['contact']=np.where(probs_predict >0.35, 1,0)[:,1]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_train_pg['contact']=np.where(probs_predict >0.35, 1,0)[:,1]


In [62]:
data_train_pg.head()

Unnamed: 0,contact_id,contact,game_play,step,nfl_player_id_1,nfl_player_id_2,nfl_player_id_x,team,position,jersey_number,x_position,y_position,speed,distance,direction,orientation,acceleration,sa,nfl_player_id_y,speed_diff_w_lag1_lag2_id_1,distance_diff_w_lag1_lag2_id_1,direction_diff_w_lag1_lag2_id_1,orientation_diff_w_lag1_lag2_id_1,acceleration_diff_w_lag1_lag2_id_1,sa_diff_w_lag1_lag2_id_1,speed_diff_w_lag1_id_1,distance_diff_w_lag1_id_1,direction_diff_w_lag1_id_1,orientation_diff_w_lag1_id_1,acceleration_diff_w_lag1_id_1,sa_diff_w_lag1_id_1,x_position_lag1_id_1,y_position_lag1_id_1,x_position_lag2_id_1,y_position_lag2_id_1
231,58168_003392_0_38590_G,1,58168_003392,0,38590,G,38590,home,G,70,40.33,25.28,0.52,0.06,141.08,100.37,0.59,0.58,38590,0.06,0.01,3.16,0.0,0.17,-0.16,0.09,0.01,81.07,22.47,-0.23,1.3,39.78,25.15,39.82,25.17
232,58168_003392_0_43854_G,0,58168_003392,0,43854,G,43854,away,OLB,57,41.99,16.79,0.74,0.06,263.92,294.74,1.74,1.74,43854,-0.09,-0.01,13.12,2.7,-0.13,-0.23,0.27,0.02,54.8,13.36,0.64,1.08,42.43,17.47,42.41,17.42
233,58168_003392_0_41257_G,1,58168_003392,0,41257,G,41257,away,CB,21,45.77,15.59,0.67,0.07,180.38,270.88,0.34,0.08,41257,0.13,0.02,0.97,-1.38,-0.16,0.16,-0.47,-0.05,161.03,26.01,-0.47,0.88,46.43,26.84,46.46,26.94
234,58168_003392_0_41944_G,0,58168_003392,0,41944,G,41944,away,DT,92,42.0,22.85,0.68,0.08,234.17,282.07,0.81,0.81,41944,-0.03,-0.01,-32.17,-1.58,-0.04,-0.16,0.62,0.07,107.0,-1.97,0.68,0.91,42.67,23.26,42.67,23.24
235,58168_003392_0_42386_G,0,58168_003392,0,42386,G,42386,away,ILB,55,45.87,23.89,0.61,0.06,221.19,269.45,1.11,1.1,42386,-0.05,0.0,4.01,-3.06,0.07,0.1,0.21,0.02,208.74,-11.72,0.6,0.61,45.2,22.21,45.2,22.25


In [63]:
data_train_pg.contact.value_counts()

0    3779
1     533
Name: contact, dtype: int64

In [65]:
data_train_pg[['contact_id','contact']].to_csv('submissions_pg.csv', index = False)