In [72]:
!pip install statsmodels
from statsmodels.formula.api import ols

import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.impute import KNNImputer
from sklearn.preprocessing import MinMaxScaler



In [88]:
# Gathering Data

filepath = 'datasets/Teams.csv'
df_teams = pd.read_csv(filepath, header = 0)

In [89]:
df_teams_red = df_teams[['divID', 'G', 'R', 'AB', 'H', 'HR', 'CG', 'HA', 'HRA', 'attendance']].copy()
df_teams_red

Unnamed: 0,divID,G,R,AB,H,HR,CG,HA,HRA,attendance
0,,31,401,1372,426,3,22,367,2,
1,,28,302,1196,323,10,25,308,6,
2,,29,249,1186,328,7,23,346,13,
3,,19,137,746,178,2,19,261,5,
4,,33,302,1404,403,1,32,373,7,
...,...,...,...,...,...,...,...,...,...,...
2980,C,162,706,5351,1303,198,3,1234,152,2102530.0
2981,E,162,857,5507,1336,222,1,1264,184,761072.0
2982,W,162,625,5405,1254,167,0,1402,232,2110258.0
2983,E,162,846,5476,1455,262,1,1257,209,805901.0


In [90]:
# Remove data where the division ID does not exist

df_teams_no_NA = df_teams_red[df_teams_red['divID'].isna() == False].copy()
df_teams_no_NA

Unnamed: 0,divID,G,R,AB,H,HR,CG,HA,HRA,attendance
1517,W,162,691,5460,1411,141,38,1334,144,1458320.0
1518,E,162,779,5518,1465,175,50,1194,117,1062069.0
1519,E,162,743,5494,1381,197,30,1423,155,1833246.0
1520,W,163,528,5316,1221,88,25,1294,126,758388.0
1521,W,162,625,5450,1346,112,29,1470,146,589546.0
...,...,...,...,...,...,...,...,...,...,...
2980,C,162,706,5351,1303,198,3,1234,152,2102530.0
2981,E,162,857,5507,1336,222,1,1264,184,761072.0
2982,W,162,625,5405,1254,167,0,1402,232,2110258.0
2983,E,162,846,5476,1455,262,1,1257,209,805901.0


In [91]:
# Unique values of categorical target variable

set(df_teams_no_NA['divID'])

{'C', 'E', 'W'}

In [92]:
df_teams_no_NA.isna().sum() # Check for NA values (none were found)

divID         0
G             0
R             0
AB            0
H             0
HR            0
CG            0
HA            0
HRA           0
attendance    0
dtype: int64

In [93]:
# Train Test Split

X = df_teams_no_NA.drop('divID', axis=1).copy()
y = df_teams_no_NA['divID'].copy()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [94]:
# Normalize training data

X_train_scaled = pd.DataFrame()

# save means and standard deviations when given a testing example

column_names = []
X_train_means = []
X_train_standard_deviations = []

for col in X_train.columns:
    column_names.append(col)
    X_train_scaled[col] = (X_train[col] - np.mean(X_train[col])) / np.std(X_train[col])
    means.append(np.mean(X_train[col]))
    standard_deviations.append(np.std(X_train[col]))
X_train_scaled

Unnamed: 0,G,R,AB,H,HR,CG,HA,HRA,attendance
2171,-2.734754,-1.752216,-2.643924,-2.520513,-0.725227,-0.565867,-1.872072,-0.841958,-1.009276
2855,0.260324,-0.114559,0.271307,-0.069693,0.560083,-0.941370,0.435988,1.847445,-0.100964
1771,0.260324,1.318390,0.477788,1.191065,-0.747776,1.624569,0.470436,0.399305,0.328553
2587,0.260324,-0.586276,0.329052,-0.022562,0.109097,-0.816202,0.407280,0.399305,0.528183
2797,0.260324,-0.470572,0.134820,-0.411394,-0.026199,-0.941370,-0.138157,0.554463,0.566988
...,...,...,...,...,...,...,...,...,...
2647,0.260324,-0.337067,0.094573,-0.328914,-0.386987,-0.878786,-0.517092,-0.273046,0.171837
2811,0.260324,0.116849,0.187315,0.054026,0.334590,-0.878786,0.355608,2.804252,-0.114541
2377,0.260324,-0.408269,0.031579,-0.358371,0.266942,-0.503283,0.562299,0.451024,0.573166
2976,0.260324,-0.835484,-0.076911,-0.782552,-0.567382,-1.003954,0.028345,1.640568,-1.361058


In [95]:
knn = KNeighborsClassifier(n_neighbors=7)
  
knn.fit(X_train_scaled, y_train)
  
# Calculate the accuracy of the model

# Normalize testing data

X_test_scaled = pd.DataFrame()

column_names = []
X_test_means = []
X_test_standard_deviations = []

for col in X_test.columns:
    column_names.append(col)
    X_test_scaled[col] = (X_test[col] - np.mean(X_test[col])) / np.std(X_test[col])
    means.append(np.mean(X_test[col]))
    standard_deviations.append(np.std(X_test[col]))

# Model Score

print(knn.score(X_test_scaled, y_test))

0.4897959183673469


In [136]:
# Predicting the division ID given an input of features

# testing with NaN value

df_temp = X_test_scaled.iloc[0].to_frame().T
print(df_temp)
print(knn.predict(df_temp))

             G         R       AB         H        HR        CG        HA  \
2709  0.271901  0.117233  0.31642  0.529281 -1.113618 -0.680471 -0.099225   

          HRA  attendance  
2709 -0.17131    1.578372  
['W']


In [4]:

def predict_team_division():
    # KNN
    
    
def predict_player_salary():
    


IndentationError: expected an indented block after function definition on line 1 (2307934486.py, line 5)