In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import matthews_corrcoef
from xgboost import XGBClassifier

### Load the training data

In [3]:
# Read the training data from training_pp.csv (path is relative to the working directory).
df_train_preprocessed = pd.read_csv('training_pp.csv')

### Train an XGBoost classifier

In [4]:
# Split the table into the target ('contact') and the feature matrix (every other column).
y = df_train_preprocessed['contact']
X = df_train_preprocessed.drop(columns='contact')

In [5]:
# Hold out 20% of the rows for evaluation.
# random_state pins the shuffle so Restart & Run All reproduces the same split
# (and therefore the same metrics). stratify=y keeps the class ratio identical
# in both parts, which matters because the positive class is rare.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42, stratify=y
)

In [6]:
# Fit an XGBoost classifier with the library's default hyperparameters on the training split.
model = XGBClassifier()
# Left as the cell's last expression so the fitted estimator's repr is displayed.
model.fit(X_train, y_train)

XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=None, num_parallel_tree=None,
              predictor=None, random_state=None, ...)

In [7]:
# Hard class predictions for the held-out split, using the estimator's own decision rule.
y_pred_xgb = model.predict(X_test)

In [8]:
# Confusion matrix followed by per-class precision / recall / F1 for the held-out split.
print(
    confusion_matrix(y_test, y_pred_xgb),
    classification_report(y_test, y_pred_xgb),
    sep="\n",
)

[[850463   2097]
 [  3634   6003]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    852560
           1       0.74      0.62      0.68      9637

    accuracy                           0.99    862197
   macro avg       0.87      0.81      0.84    862197
weighted avg       0.99      0.99      0.99    862197



In [9]:
# NOTE(review): this cell repeats the preceding evaluation cell verbatim and
# prints the same matrix and report; it looks safe to delete.
print(confusion_matrix(y_test, y_pred_xgb))
print(classification_report(y_test, y_pred_xgb))

[[850463   2097]
 [  3634   6003]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    852560
           1       0.74      0.62      0.68      9637

    accuracy                           0.99    862197
   macro avg       0.87      0.81      0.84    862197
weighted avg       0.99      0.99      0.99    862197



In [10]:
# Matthews correlation coefficient of the default-rule predictions on the held-out split.
matthews_corrcoef(y_test, y_pred_xgb)

0.676153838048378

### Set the positive-class probability cutoff to 0.35

In [11]:
# Per-class probabilities for the held-out rows; column 1 is the one thresholded downstream.
probs = model.predict_proba(X_test)

In [12]:
# Label a row 1 when its column-1 probability exceeds 0.35, otherwise 0.
y_pred_035 = np.where(probs[:, 1] > 0.35, 1, 0)

In [13]:
# Matthews correlation coefficient of the 0.35-cutoff predictions on the held-out split.
matthews_corrcoef(y_test, y_pred_035)

0.6988837810231529

In [32]:
# Share of held-out rows assigned to each class under the 0.35 cutoff.
pd.Series(y_pred_035).value_counts(normalize=True)

0    0.987096
1    0.012904
dtype: float64

### Predicting on sample_submission data with features

In [15]:
# Fit a fresh default-parameter classifier on all labelled rows (X, y) rather than
# only the training split; this is the model used for the predictions below.
model_complete = XGBClassifier()
model_complete.fit(X, y)

XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=None, num_parallel_tree=None,
              predictor=None, random_state=None, ...)

In [14]:
# Table of rows to score. It carries a 'contact' column, which is dropped before
# prediction and later overwritten with the predicted labels.
data_predict = pd.read_csv('data_predict.csv')

In [18]:
# Companion table that includes contact_id, used to attach IDs to the predictions.
# NOTE(review): the file is named 'train_...' yet it is used for the rows being
# predicted, and it is assumed to be row-aligned with data_predict.csv — confirm both.
data_predict_contact_id = pd.read_csv('train_with_contact_id.csv')

In [16]:
# Score every row with the full-data model after removing the 'contact' column.
probs_predict = model_complete.predict_proba(data_predict.drop(columns='contact'))

In [17]:
# Overwrite 'contact' with the thresholded prediction: 1 when the column-1 probability exceeds 0.35.
data_predict['contact'] = np.where(probs_predict[:, 1] > 0.35, 1, 0)

In [19]:
# Write the same 0.35-cutoff labels onto the table that carries contact_id.
# The assignment is positional, so it relies on both tables sharing one row order.
data_predict_contact_id['contact'] = np.where(probs_predict[:, 1] > 0.35, 1, 0)

In [20]:
# Preview the first rows to check the attached predictions.
data_predict_contact_id.head()

Unnamed: 0,contact_id,contact,game_play,step,nfl_player_id_1,nfl_player_id_2,team_id_1,position_id_1,jersey_number_id_1,x_position_id_1,...,sa_diff_w_lag1_id_2,x_position_lag1_id_2,y_position_lag1_id_2,x_position_lag2_id_2,y_position_lag2_id_2,dist_lag2,dist_lag1,dist,dist_lag1_lag2,dist_current_lag1
0,58168_003392_0_38590_43854,0,58168_003392,0,38590,43854,home,G,70,40.33,...,1.08,42.43,17.47,42.41,17.42,8.171328,8.12434,8.650763,-0.046988,0.526423
1,58168_003392_0_38590_41257,0,58168_003392,0,38590,41257,home,G,70,40.33,...,0.88,46.43,26.84,46.46,26.94,6.871863,6.861385,11.112592,-0.010478,4.251207
2,58168_003392_0_38590_41944,0,58168_003392,0,38590,41944,home,G,70,40.33,...,0.91,42.67,23.26,42.67,23.24,3.442005,3.453143,2.948525,0.011138,-0.504618
3,58168_003392_0_38590_42386,0,58168_003392,0,38590,42386,home,G,70,40.33,...,0.61,45.2,22.21,45.2,22.25,6.12134,6.166036,5.711716,0.044696,-0.45432
4,58168_003392_0_38590_47944,0,58168_003392,0,38590,47944,home,G,70,40.33,...,1.36,43.11,24.55,43.06,24.66,3.279893,3.383622,2.19611,0.103729,-1.187512


In [33]:
# Share of scored rows assigned to each class.
data_predict_contact_id['contact'].value_counts(normalize=True)

0    0.986306
1    0.013694
Name: contact, dtype: float64

In [25]:
# Persist the contact_id / contact pairs, omitting the index column.
data_predict_contact_id[['contact_id','contact']].to_csv('submissions_pp.csv', index = False)

In [36]:
# Keep only the two submission columns, for combining with the second prediction set.
df_pp = data_predict_contact_id[['contact_id','contact']]

#### Loading Reuben's predicted labels

In [34]:
# Load a second set of predicted labels from parquet.
# NOTE(review): presumably produced by a separate model/pipeline — confirm its source and columns.
data_g = pd.read_parquet('test_data_v5_predicted_labels.parquet')

In [35]:
# Row count per predicted class in the second prediction set.
data_g['contact'].value_counts()

0    4058
1     254
Name: contact, dtype: int64

In [41]:
# Stack both prediction sets into one frame. ignore_index=True renumbers the rows
# 0..n-1; keeping each input's own labels would leave duplicate index values
# wherever the two inputs' indexes overlap.
df_submit = pd.concat([df_pp, data_g], ignore_index=True)

In [52]:
# Save this model's predictions on their own, omitting the index column.
# NOTE(review): appears to hold the same columns already written to
# submissions_pp.csv — confirm both files are needed.
df_pp.to_csv('prediction_pp.csv',index=False)

In [45]:
# Sample submission; its contact_id column is the reference list and order used below.
df_sample = pd.read_csv('sample_submission.csv')

In [42]:
# Write the concatenated predictions, omitting the index column.
# NOTE(review): this writes df_submit as concatenated; its rows are not aligned to
# sample_submission.csv at this point — confirm this is the file meant for submission.
df_submit.to_csv('submission.csv', index=False)

In [48]:
# Re-order the combined predictions to follow the sample submission: start from its
# contact_id column and left-join the predicted 'contact' values onto it.
df_submit_v2 = df_sample.drop(columns='contact').merge(
    df_submit, on='contact_id', how='left'
)

In [49]:
# Preview the re-ordered submission.
df_submit_v2.head()

Unnamed: 0,contact_id,contact
0,58168_003392_0_38590_43854,0
1,58168_003392_0_38590_41257,0
2,58168_003392_0_38590_41944,0
3,58168_003392_0_38590_42386,0
4,58168_003392_0_38590_47944,0


In [51]:
# Count the rows whose contact_id equals the sample submission's at the same position.
(df_submit_v2['contact_id'] == df_sample['contact_id']).sum()

49588