In [2]:
!pip install statsmodels
from statsmodels.formula.api import ols

import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.impute import KNNImputer
from sklearn.preprocessing import MinMaxScaler



In [3]:
# Gathering Data

filepath = 'datasets/Teams.csv'
df_teams = pd.read_csv(filepath, header = 0)

In [4]:
# retain relevant features for the model

df_teams_red = df_teams.drop(['yearID', 'lgID', 'teamID', 'franchID',
                              'Rank', 'DivWin', 'WCWin', 'name', 'park', 'BPF',
                              'PPF', 'teamIDBR', 'teamIDlahman45', 'teamIDretro'], axis = 1).copy()
# check for multicollinearity (example: Wins, Losses, G, GHome...)

df_teams_red

Unnamed: 0,divID,G,Ghome,W,L,LgWin,WSWin,R,AB,H,...,SV,IPouts,HA,HRA,BBA,SOA,E,DP,FP,attendance
0,,31,,20,10,N,,401,1372,426,...,3,828,367,2,42,23,243,24,0.834,
1,,28,,19,9,N,,302,1196,323,...,1,753,308,6,28,22,229,16,0.829,
2,,29,,10,19,N,,249,1186,328,...,0,762,346,13,53,34,234,15,0.818,
3,,19,,7,12,N,,137,746,178,...,0,507,261,5,21,17,163,8,0.803,
4,,33,,16,17,N,,302,1404,403,...,0,879,373,7,42,22,235,14,0.840,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2980,C,162,81.0,90,72,N,N,706,5351,1303,...,50,4251,1234,152,608,1225,84,137,0.986,2102530.0
2981,E,162,81.0,100,62,N,N,857,5507,1336,...,42,4367,1264,184,436,1478,80,130,0.986,761072.0
2982,W,162,81.0,60,102,N,N,625,5405,1254,...,31,4273,1402,232,513,1239,83,146,0.986,2110258.0
2983,E,162,80.0,91,71,N,N,846,5476,1455,...,34,4216,1257,209,473,1468,90,122,0.984,805901.0


In [5]:
# Remove data where the division ID does not exist

df_teams_no_NA = df_teams_red[df_teams_red['divID'].isna() == False].copy()
df_teams_no_NA

Unnamed: 0,divID,G,Ghome,W,L,LgWin,WSWin,R,AB,H,...,SV,IPouts,HA,HRA,BBA,SOA,E,DP,FP,attendance
1517,W,162,81.0,93,69,N,N,691,5460,1411,...,42,4335,1334,144,438,893,115,114,0.981,1458320.0
1518,E,162,81.0,109,53,Y,N,779,5518,1465,...,36,4419,1194,117,498,897,101,145,0.984,1062069.0
1519,E,162,81.0,87,75,N,N,743,5494,1381,...,41,4398,1423,155,685,935,157,178,0.975,1833246.0
1520,W,163,81.0,71,91,N,N,528,5316,1221,...,39,4314,1294,126,517,885,135,164,0.978,758388.0
1521,W,162,81.0,68,94,N,N,625,5450,1346,...,25,4311,1470,146,564,810,122,163,0.981,589546.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2980,C,162,81.0,90,72,N,N,706,5351,1303,...,50,4251,1234,152,608,1225,84,137,0.986,2102530.0
2981,E,162,81.0,100,62,N,N,857,5507,1336,...,42,4367,1264,184,436,1478,80,130,0.986,761072.0
2982,W,162,81.0,60,102,N,N,625,5405,1254,...,31,4273,1402,232,513,1239,83,146,0.986,2110258.0
2983,E,162,80.0,91,71,N,N,846,5476,1455,...,34,4216,1257,209,473,1468,90,122,0.984,805901.0


In [6]:
# Unique values of categorical target variable

set(df_teams_no_NA['divID'])

{'C', 'E', 'W'}

In [7]:
# Check for NA values
df_teams_no_NA.isna().sum()

divID          0
G              0
Ghome          0
W              0
L              0
LgWin         28
WSWin         28
R              0
AB             0
H              0
2B             0
3B             0
HR             0
BB             0
SO             0
SB             0
CS             0
HBP           24
SF            24
RA             0
ER             0
ERA            0
CG             0
SHO            0
SV             0
IPouts         0
HA             0
HRA            0
BBA            0
SOA            0
E              0
DP             0
FP             0
attendance     0
dtype: int64

In [8]:
# Train Test Split

X = df_teams_no_NA.drop('divID', axis=1).copy()
y = df_teams_no_NA['divID'].copy()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [10]:
# Normalize training data

X_train_scaled = pd.DataFrame()

# save means and standard deviations when given a testing example

column_names = []
X_train_means = []
X_train_standard_deviations = []

for col in X_train.columns:
    column_names.append(col)
    X_train_scaled[col] = (X_train[col] - np.mean(X_train[col])) / np.std(X_train[col])
    means.append(np.mean(X_train[col]))
    standard_deviations.append(np.std(X_train[col]))
X_train_scaled

NameError: name 'means' is not defined

In [95]:
knn = KNeighborsClassifier(n_neighbors=7)
  
knn.fit(X_train_scaled, y_train)
  
# Calculate the accuracy of the model

# Normalize testing data

X_test_scaled = pd.DataFrame()

column_names = []
X_test_means = []
X_test_standard_deviations = []

for col in X_test.columns:
    column_names.append(col)
    X_test_scaled[col] = (X_test[col] - np.mean(X_test[col])) / np.std(X_test[col])
    means.append(np.mean(X_test[col]))
    standard_deviations.append(np.std(X_test[col]))

# Model Score

print(knn.score(X_test_scaled, y_test))

0.4897959183673469


In [136]:
# Predicting the division ID given an input of features

# testing with NaN value - have not done this yet

df_temp = X_test_scaled.iloc[0].to_frame().T
print(df_temp)
print(knn.predict(df_temp))

             G         R       AB         H        HR        CG        HA  \
2709  0.271901  0.117233  0.31642  0.529281 -1.113618 -0.680471 -0.099225   

          HRA  attendance  
2709 -0.17131    1.578372  
['W']


In [4]:

def predict_team_division():
    # KNN
    
    
def predict_player_salary():
    

def predict hit zone () # area ball lands in (home run, left, right, foul, etc...)

IndentationError: expected an indented block after function definition on line 1 (2307934486.py, line 5)