#### Imports and helper functions

In [2]:
from package.package import *

def scale_data(train, test=''):
    """Standardize features using a scaler fitted on the training data only.

    Parameters
    ----------
    train : array-like
        Training features. The ``StandardScaler`` is fitted on these and
        they are returned transformed.
    test : array-like, optional
        Held-out features, transformed with the scaler that was fitted on
        ``train`` (it is never re-fitted on ``test``). Leave it out, or pass
        ``None`` or a string (``''`` is the legacy "not provided" default),
        to scale ``train`` alone.

    Returns
    -------
    train_data
        Returned on its own when no test set is supplied.
    (train_data, test_data)
        Returned as a tuple when a test set is supplied.
    """
    scaler = StandardScaler()
    train_data = scaler.fit_transform(train)

    # '' is kept as the default for backward compatibility; None is accepted
    # too so callers can use the conventional "no value" sentinel.
    if test is None or isinstance(test, str):
        return train_data

    return train_data, scaler.transform(test)


Unnamed: 0.1,Unnamed: 0,Team,League,Year,OBP,SLG,BA,Playoffs,ERA,ERA+,FIP,HR,RBIs,Ks
0,1231,WSA,AL,1962,0.308,0.373,0.250,0,4.04,101,4.18,132,566,771
1,1230,STL,NL,1962,0.335,0.394,0.271,0,3.55,121,3.81,137,707,914
2,1229,SFG,NL,1962,0.341,0.441,0.278,1,3.79,101,3.81,204,807,886
3,1228,PIT,NL,1962,0.321,0.394,0.268,0,3.37,117,3.46,108,655,897
4,1227,PHI,NL,1962,0.330,0.390,0.260,0,4.28,91,4.13,142,658,863
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1467,1467,WSN,,2021,0.337,0.417,0.258,1,4.80,84,4.87,182,686,1346
1468,1468,PIT,,2021,0.309,0.364,0.236,1,5.08,84,4.74,124,570,1312
1469,1469,TEX,,2021,0.294,0.375,0.232,1,4.79,92,4.76,167,598,1239
1470,1470,BAL,,2021,0.304,0.402,0.239,1,5.84,77,5.15,195,632,1234


#### Prepare data for simulation

In [3]:
# ---- COLUMN GLOSSARY ----
# Playoffs : binary label (whether the team made the playoffs or not)
# OBP      : on-base percentage
# SLG      : slugging percentage
# BA       : batting average
# ERA      : earned run average
# ERA+     : adjusted earned run average (ERA+)
# FIP      : fielding independent pitching
# Ks       : strikeouts
# HR       : home runs
# RBIs     : runs batted in

# Order matters: it fixes the column order of the `data_all` array below.
feature_columns = ['OBP', 'SLG', 'BA', 'ERA', 'ERA+', 'FIP', 'Ks', 'HR', 'RBIs']

playoffs_series = dataset['Playoffs']
features_frame = dataset[feature_columns]

# Show the label column and the feature table while they are still pandas
# objects, so the rich display keeps the index and column names.
display(playoffs_series)
display(features_frame)

# Downstream cells work on plain NumPy arrays.
target = playoffs_series.to_numpy()
data_all = features_frame.to_numpy()


0       0
1       0
2       1
3       0
4       0
       ..
1467    1
1468    1
1469    1
1470    1
1471    1
Name: Playoffs, Length: 1472, dtype: int64

Unnamed: 0,OBP,SLG,BA,ERA,ERA+,FIP,Ks,HR,RBIs
0,0.308,0.373,0.250,4.04,101,4.18,771,132,566
1,0.335,0.394,0.271,3.55,121,3.81,914,137,707
2,0.341,0.441,0.278,3.79,101,3.81,886,204,807
3,0.321,0.394,0.268,3.37,117,3.46,897,108,655
4,0.330,0.390,0.260,4.28,91,4.13,863,142,658
...,...,...,...,...,...,...,...,...,...
1467,0.337,0.417,0.258,4.80,84,4.87,1346,182,686
1468,0.309,0.364,0.236,5.08,84,4.74,1312,124,570
1469,0.294,0.375,0.232,4.79,92,4.76,1239,167,598
1470,0.304,0.402,0.239,5.84,77,5.15,1234,195,632


#### Split and scale data

In [4]:
# Fixed seed so the split, and every metric computed from it, is the same on
# each Restart & Run All.
SPLIT_SEED = 42

# Stratify on the label so the train and test partitions keep the same
# playoff / non-playoff proportions.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    data_all, target, stratify=target, random_state=SPLIT_SEED
)

# scale_data fits the scaler on the training partition only and reuses it for
# the test partition. The unscaled partitions stay available under *_raw.
X_train, X_test = scale_data(X_train_raw, X_test_raw)

print(f'X_train.shape: {X_train.shape}; y_train.shape: {y_train.shape}')
print(f'X_test.shape: {X_test.shape}; y_test.shape: {y_test.shape}')


X_train.shape: (1104, 9); y_train.shape: (1104,)
X_test.shape: (368, 9); y_test.shape: (368,)


#### Create model

In [5]:
# Very large iteration cap, presumably so the solver is never stopped before
# it converges.
MAX_ITER = 99999999

model = LogisticRegression(max_iter=MAX_ITER)
model.fit(X_train, y_train)

# Mean accuracy on each partition, as returned by model.score.
train_accuracy = model.score(X_train, y_train)
test_accuracy = model.score(X_test, y_test)

print(f'Training Accuracy: {train_accuracy*100:.2f}%')
print(f'Testing Accuracy: {test_accuracy*100:.2f}%')    # Slightly better than moneyball sim.

Training Accuracy: 74.73%
Testing Accuracy: 72.83%


#### Check feature weight importances

In [6]:
# First (and, for a binary target, presumably only) row of the fitted
# coefficient matrix: one weight per input column.
weights = model.coef_[0]

# Must list the columns in the same order that was used to build `data_all`.
features = ['OBP', 'SLG', 'BA', 'ERA', 'ERA+', 'FIP', 'Ks', 'HR', 'RBIs']

# zip() silently drops the surplus on a length mismatch, which would pair
# names with the wrong weights without any error, so fail loudly instead.
if len(features) != len(weights):
    raise ValueError(
        f'Expected {len(features)} coefficients (one per feature), got {len(weights)}'
    )

# Rank by magnitude: the sign only gives the direction of the effect.
# float(...) converts the NumPy scalar to a builtin float so the printed list
# stays readable under NumPy >= 2 (which would otherwise show np.float64(...)).
features_plus_weights = sorted(
    ((feature, float(abs(round(weight, 3)))) for feature, weight in zip(features, weights)),
    key=lambda pair: pair[1],
)

print('\nWeight importance (sorted least to greatest):')
print(features_plus_weights)


Weight importance (sorted least to greatest):
[('BA', 0.046), ('HR', 0.189), ('SLG', 0.197), ('ERA+', 0.263), ('OBP', 0.296), ('FIP', 0.742), ('Ks', 0.752), ('RBIs', 0.858), ('ERA', 1.321)]
