In [3]:
import os
import pandas as pd
from sklearn import datasets, linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report

import matplotlib.pyplot as plt

In [4]:
raw_df = pd.read_csv('integrated_data.csv', low_memory=False)
raw_df

Unnamed: 0,player_name,pitcher,game_date,game_pk,pitch_of_game,at_bat_number,pitch_number,pitch_type,pitch_name,release_speed,...,az,sz_top,sz_bot,effective_speed,release_spin_rate,release_extension,release_pos_y,spin_axis,game_precedes_injury,ID
0,"Scherzer, Max",453286,2020-07-23,630851,1,1,1,FF,4-Seam Fastball,95.3,...,-15.689358,3.85,1.84,95.7,2484.0,6.3,54.21,226.0,False,0
1,"Scherzer, Max",453286,2020-07-23,630851,2,1,2,FF,4-Seam Fastball,96.1,...,-12.476899,3.58,1.75,95.9,2477.0,6.1,54.38,226.0,False,0
2,"Scherzer, Max",453286,2020-07-23,630851,3,1,3,FF,4-Seam Fastball,96.7,...,-14.099707,3.33,1.65,96.4,2421.0,6.2,54.29,224.0,False,0
3,"Scherzer, Max",453286,2020-07-23,630851,4,1,4,CU,Curveball,79.7,...,-39.867212,3.51,1.65,79.6,2846.0,6.2,54.28,60.0,False,0
4,"Scherzer, Max",453286,2020-07-23,630851,5,1,5,FF,4-Seam Fastball,96.1,...,-14.962333,3.41,1.56,96.3,2450.0,6.4,54.09,227.0,False,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2686486,"Pérez, Cionel",672335,2024-05-02,747044,273,71,1,SI,Sinker,96.7,...,-18.790202,3.37,1.62,96.3,2142.0,6.0,54.52,144.0,False,80501
2686487,"Pérez, Cionel",672335,2024-05-02,747044,274,71,2,CU,Curveball,82.8,...,-41.939864,3.36,1.61,82.2,2669.0,6.0,54.53,330.0,False,80501
2686488,"Pérez, Cionel",672335,2024-05-02,747044,275,71,3,SI,Sinker,96.6,...,-23.722864,3.31,1.57,96.2,2111.0,6.0,54.53,146.0,False,80501
2686489,"Pérez, Cionel",672335,2024-05-02,747044,276,71,4,SL,Slider,84.6,...,-41.547310,3.30,1.57,84.0,2628.0,6.0,54.49,331.0,False,80501


In [5]:
speed_df = raw_df[['player_name','pitcher', 'game_date', 'game_pk', 'ID', 'pitch_of_game', 'release_speed', 'game_precedes_injury']]
speed_df

Unnamed: 0,player_name,pitcher,game_date,game_pk,ID,pitch_of_game,release_speed,game_precedes_injury
0,"Scherzer, Max",453286,2020-07-23,630851,0,1,95.3,False
1,"Scherzer, Max",453286,2020-07-23,630851,0,2,96.1,False
2,"Scherzer, Max",453286,2020-07-23,630851,0,3,96.7,False
3,"Scherzer, Max",453286,2020-07-23,630851,0,4,79.7,False
4,"Scherzer, Max",453286,2020-07-23,630851,0,5,96.1,False
...,...,...,...,...,...,...,...,...
2686486,"Pérez, Cionel",672335,2024-05-02,747044,80501,273,96.7,False
2686487,"Pérez, Cionel",672335,2024-05-02,747044,80501,274,82.8,False
2686488,"Pérez, Cionel",672335,2024-05-02,747044,80501,275,96.6,False
2686489,"Pérez, Cionel",672335,2024-05-02,747044,80501,276,84.6,False


In [6]:
count_frame = pd.DataFrame(speed_df[['player_name', 'game_pk']].value_counts())
singles = count_frame[count_frame['count']==1]
singles.reset_index(inplace=True)
singles[['player_name', 'game_pk']]

Unnamed: 0,player_name,game_pk
0,"Record, Joe",747943
1,"Howard, Sam",634483
2,"Brogdon, Connor",633715
3,"McFarland, T.J.",633226
4,"Wilson, Steven",662234
...,...,...
215,"Moore, Matt",661842
216,"Maton, Phil",660899
217,"Garrett, Amir",633182
218,"Pressly, Ryan",632324


In [7]:
speed_df = pd.merge(speed_df, singles[['player_name', 'game_pk']], how='outer', indicator=True).query('_merge != "both"').drop('_merge', axis=1).reset_index(drop=True)
speed_df

# df2 = speed_df.drop(delete_row)
# df2

Unnamed: 0,player_name,pitcher,game_date,game_pk,ID,pitch_of_game,release_speed,game_precedes_injury
0,"Abad, Fernando",472551,2021-10-01,632268,30515,109,93.3,False
1,"Abad, Fernando",472551,2021-10-01,632268,30515,110,93.4,False
2,"Abad, Fernando",472551,2021-10-01,632268,30515,111,76.6,False
3,"Abad, Fernando",472551,2021-10-01,632268,30515,112,80.7,False
4,"Abad, Fernando",472551,2021-10-01,632268,30515,113,79.8,False
...,...,...,...,...,...,...,...,...
2686266,"deGrom, Jacob",594798,2023-03-30,718779,54811,110,97.5,False
2686267,"deGrom, Jacob",594798,2023-03-30,718779,54811,111,96.7,False
2686268,"deGrom, Jacob",594798,2023-03-30,718779,54811,112,97.9,False
2686269,"deGrom, Jacob",594798,2023-03-30,718779,54811,113,97.7,False


In [8]:
speed_df[speed_df['player_name'] == 'Milner, Hoby']

Unnamed: 0,player_name,pitcher,game_date,game_pk,ID,pitch_of_game,release_speed,game_precedes_injury
1628837,"Milner, Hoby",571948,2020-08-07,630920,1650,221,89.2,False
1628838,"Milner, Hoby",571948,2020-08-07,630920,1650,222,88.9,False
1628839,"Milner, Hoby",571948,2020-08-07,630920,1650,223,88.3,False
1628840,"Milner, Hoby",571948,2020-08-07,630920,1650,254,87.4,False
1628841,"Milner, Hoby",571948,2020-08-07,630920,1650,255,77.8,False
...,...,...,...,...,...,...,...,...
1631827,"Milner, Hoby",571948,2023-10-03,748582,75349,234,80.6,False
1631828,"Milner, Hoby",571948,2023-10-03,748582,75349,235,81.8,False
1631829,"Milner, Hoby",571948,2023-10-03,748582,75349,253,89.2,False
1631830,"Milner, Hoby",571948,2023-10-03,748582,75349,254,88.6,False


In [9]:
speed_df[speed_df['game_precedes_injury'] == True]


Unnamed: 0,player_name,pitcher,game_date,game_pk,ID,pitch_of_game,release_speed,game_precedes_injury
5593,"Abreu, Albert",656061,2022-08-20,661240,47858,262,97.1,True
5594,"Abreu, Albert",656061,2022-08-20,661240,47858,263,97.5,True
5595,"Abreu, Albert",656061,2022-08-20,661240,47858,264,97.9,True
5596,"Abreu, Albert",656061,2022-08-20,661240,47858,265,98.3,True
5597,"Abreu, Albert",656061,2022-08-20,661240,47858,266,89.0,True
...,...,...,...,...,...,...,...,...
2685865,"deGrom, Jacob",594798,2023-04-28,718388,57971,110,91.3,True
2685866,"deGrom, Jacob",594798,2023-04-28,718388,57971,111,91.2,True
2685867,"deGrom, Jacob",594798,2023-04-28,718388,57971,112,92.4,True
2685868,"deGrom, Jacob",594798,2023-04-28,718388,57971,113,96.6,True


In [10]:
speed_df['ID'].unique()

array([30515, 29992, 29716, ..., 56061, 55423, 54811])

In [11]:
first = speed_df[speed_df['ID'] == 0]
first



Unnamed: 0,player_name,pitcher,game_date,game_pk,ID,pitch_of_game,release_speed,game_precedes_injury
2112071,"Scherzer, Max",453286,2020-07-23,630851,0,1,95.3,False
2112072,"Scherzer, Max",453286,2020-07-23,630851,0,2,96.1,False
2112073,"Scherzer, Max",453286,2020-07-23,630851,0,3,96.7,False
2112074,"Scherzer, Max",453286,2020-07-23,630851,0,4,79.7,False
2112075,"Scherzer, Max",453286,2020-07-23,630851,0,5,96.1,False
...,...,...,...,...,...,...,...,...
2112165,"Scherzer, Max",453286,2020-07-23,630851,0,170,93.7,False
2112166,"Scherzer, Max",453286,2020-07-23,630851,0,171,94.8,False
2112167,"Scherzer, Max",453286,2020-07-23,630851,0,172,74.9,False
2112168,"Scherzer, Max",453286,2020-07-23,630851,0,173,85.1,False


In [12]:
predictor = first[['pitch_of_game']]
response = first[['release_speed']]

regr = linear_model.LinearRegression()
regr.fit(predictor, response)

print(regr.coef_)

[[-0.01492825]]


In [13]:
injured = speed_df[speed_df['ID'] == 31]
injured


Unnamed: 0,player_name,pitcher,game_date,game_pk,ID,pitch_of_game,release_speed,game_precedes_injury
1096749,"Holmes, Clay",605280,2020-07-24,630989,31,165,79.2,True
1096750,"Holmes, Clay",605280,2020-07-24,630989,31,166,80.2,True
1096751,"Holmes, Clay",605280,2020-07-24,630989,31,167,80.0,True
1096752,"Holmes, Clay",605280,2020-07-24,630989,31,168,80.3,True
1096753,"Holmes, Clay",605280,2020-07-24,630989,31,169,93.9,True
1096754,"Holmes, Clay",605280,2020-07-24,630989,31,194,91.0,True
1096755,"Holmes, Clay",605280,2020-07-24,630989,31,195,78.3,True
1096756,"Holmes, Clay",605280,2020-07-24,630989,31,196,91.4,True
1096757,"Holmes, Clay",605280,2020-07-24,630989,31,197,91.7,True
1096758,"Holmes, Clay",605280,2020-07-24,630989,31,198,79.3,True


In [14]:
predictor = injured[['pitch_of_game']]
response = injured[['release_speed']]

regr = linear_model.LinearRegression()
regr.fit(predictor, response)

regr.coef_[0][0]

0.021836199821745284

In [15]:
final_df = pd.DataFrame()
i = 0
for id in speed_df['ID'].unique():
    id_df = speed_df[speed_df['ID'] == id]
    
    predictor = id_df[['pitch_of_game']]
    response = id_df[['release_speed']]
    
    regr = linear_model.LinearRegression()
    regr.fit(predictor, response)
    
    slope = regr.coef_[0][0]
    mean = id_df[['release_speed']].mean(axis=1)
    std = id_df[['release_speed']].std()['release_speed']
    
    to_add_df = id_df[['player_name', 'pitcher', 'game_date', 'game_pk', 'ID', 'game_precedes_injury']].drop_duplicates()
    to_add_df['slope'] = slope
    to_add_df['mean'] = mean
    to_add_df['std'] = std
    
    final_df = pd.concat([final_df, to_add_df])
    
final_df

Unnamed: 0,player_name,pitcher,game_date,game_pk,ID,game_precedes_injury,slope,mean,std
0,"Abad, Fernando",472551,2021-10-01,632268,30515,False,-0.269091,93.3,7.589311
11,"Abad, Fernando",472551,2021-09-26,632334,29992,False,-0.249173,91.3,7.417653
31,"Abad, Fernando",472551,2021-09-24,632369,29716,False,-0.077019,93.0,8.359948
42,"Abad, Fernando",472551,2021-09-22,632382,29374,False,-0.455882,92.2,7.857046
58,"Abad, Fernando",472551,2021-09-21,632402,29277,False,1.908571,94.1,9.351506
...,...,...,...,...,...,...,...,...,...
2685870,"deGrom, Jacob",594798,2023-04-23,718453,57419,False,-0.010425,98.0,3.733292
2685950,"deGrom, Jacob",594798,2023-04-17,718539,56790,False,0.021726,98.2,4.030971
2686008,"deGrom, Jacob",594798,2023-04-11,718615,56061,False,-0.011180,99.1,4.072381
2686106,"deGrom, Jacob",594798,2023-04-05,718694,55423,False,-0.002812,98.5,4.025574


In [16]:
final_df2 = final_df[['slope', 'mean', 'std', 'game_precedes_injury']]
final_df2

Unnamed: 0,slope,mean,std,game_precedes_injury
0,-0.269091,93.3,7.589311,False
11,-0.249173,91.3,7.417653,False
31,-0.077019,93.0,8.359948,False
42,-0.455882,92.2,7.857046,False
58,1.908571,94.1,9.351506,False
...,...,...,...,...
2685870,-0.010425,98.0,3.733292,False
2685950,0.021726,98.2,4.030971,False
2686008,-0.011180,99.1,4.072381,False
2686106,-0.002812,98.5,4.025574,False


In [17]:
final_df2.to_csv('release_speed.csv', index=False)

In [18]:
final_df[final_df.isna().any(axis=1)]

Unnamed: 0,player_name,pitcher,game_date,game_pk,ID,game_precedes_injury,slope,mean,std


In [19]:
final_df

Unnamed: 0,player_name,pitcher,game_date,game_pk,ID,game_precedes_injury,slope,mean,std
0,"Abad, Fernando",472551,2021-10-01,632268,30515,False,-0.269091,93.3,7.589311
11,"Abad, Fernando",472551,2021-09-26,632334,29992,False,-0.249173,91.3,7.417653
31,"Abad, Fernando",472551,2021-09-24,632369,29716,False,-0.077019,93.0,8.359948
42,"Abad, Fernando",472551,2021-09-22,632382,29374,False,-0.455882,92.2,7.857046
58,"Abad, Fernando",472551,2021-09-21,632402,29277,False,1.908571,94.1,9.351506
...,...,...,...,...,...,...,...,...,...
2685870,"deGrom, Jacob",594798,2023-04-23,718453,57419,False,-0.010425,98.0,3.733292
2685950,"deGrom, Jacob",594798,2023-04-17,718539,56790,False,0.021726,98.2,4.030971
2686008,"deGrom, Jacob",594798,2023-04-11,718615,56061,False,-0.011180,99.1,4.072381
2686106,"deGrom, Jacob",594798,2023-04-05,718694,55423,False,-0.002812,98.5,4.025574


In [20]:


predictor = final_df2[['slope', 'mean', 'std']]
response = final_df2['game_precedes_injury']

X_train, X_test, y_train, y_test = train_test_split(predictor, response, test_size=0.20, random_state=42)

log_regr = linear_model.LogisticRegression()

log_regr.fit(X_train, y_train)

y_pred = log_regr.predict(X_test)
y_train_pred = log_regr.predict(X_train)

print(confusion_matrix(y_train, y_train_pred))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))


[[63659     0]
 [  601     0]]
[[15913     0]
 [  153     0]]
              precision    recall  f1-score   support

       False       0.99      1.00      1.00     15913
        True       0.00      0.00      0.00       153

    accuracy                           0.99     16066
   macro avg       0.50      0.50      0.50     16066
weighted avg       0.98      0.99      0.99     16066



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [21]:
from sklearn.naive_bayes import GaussianNB

gnb = GaussianNB()
gnb.fit(X_train, y_train)

y_pred = gnb.predict(X_test)
y_train_pred = gnb.predict(X_train)

print(confusion_matrix(y_train, y_train_pred))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[63659     0]
 [  601     0]]
[[15913     0]
 [  153     0]]
              precision    recall  f1-score   support

       False       0.99      1.00      1.00     15913
        True       0.00      0.00      0.00       153

    accuracy                           0.99     16066
   macro avg       0.50      0.50      0.50     16066
weighted avg       0.98      0.99      0.99     16066



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [22]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
svc = make_pipeline(StandardScaler(), SVC(gamma='auto'))
svc.fit(X_train, y_train)

y_pred = svc.predict(X_test)
y_train_pred = svc.predict(X_train)

print(confusion_matrix(y_train, y_train_pred))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[63659     0]
 [  601     0]]
[[15913     0]
 [  153     0]]
              precision    recall  f1-score   support

       False       0.99      1.00      1.00     15913
        True       0.00      0.00      0.00       153

    accuracy                           0.99     16066
   macro avg       0.50      0.50      0.50     16066
weighted avg       0.98      0.99      0.99     16066



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
