In [211]:
import pandas as pd
import sklearn as sk
import plotly
import plotly.graph_objects as go
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score

In [260]:
all_stars = pd.read_csv('All-Star Selections.csv')
statistics = pd.read_csv("Player Per Game.csv")
statistics.drop_duplicates(inplace=True, subset=["player", "season"])
def knn_classify(number_of_neighbors, training_year, test_year, feature_1, feature_2, feature_3):
    training_year_all_stars = (all_stars[all_stars["season"] == training_year]["player"]).tolist()
    test_year_all_stars = (all_stars[all_stars["season"] == test_year]["player"]).tolist()
    
    training_year_all_star_stats = statistics[statistics["season"] == training_year].loc[statistics[statistics["season"] == training_year]["player"].isin(training_year_all_stars)][["season", "player", feature_1, feature_2, feature_3]]
    training_year_non_all_star_stats = statistics[statistics["season"] == training_year].loc[~statistics[statistics["season"] == training_year]["player"].isin(training_year_all_stars)][["season", "player", feature_1, feature_2, feature_3]]
    test_year_all_star_stats = statistics[statistics["season"] == test_year].loc[statistics[statistics["season"] == test_year]["player"].isin(test_year_all_stars)][["season", "player", feature_1, feature_2, feature_3]]
    test_year_non_all_star_stats = statistics[statistics["season"] == test_year].loc[~statistics[statistics["season"] == test_year]["player"].isin(test_year_all_stars)][["season", "player", feature_1, feature_2, feature_3]]
    
    fig = go.Figure()
    fig.add_trace(go.Scatter3d(x=training_year_all_star_stats[feature_1], y=training_year_all_star_stats[feature_2], z=training_year_all_star_stats[feature_3],
                                       mode='markers', text=training_year_all_star_stats['player'], name='all-stars'))
    fig.add_trace(go.Scatter3d(x=training_year_non_all_star_stats[feature_1], y=training_year_non_all_star_stats[feature_2], z=training_year_non_all_star_stats[feature_3],
                                       mode='markers', text=training_year_non_all_star_stats['player'], name='non-all-stars'))
    fig.update_layout()
    fig.layout.title.text = str(feature_1 + " vs. " + feature_2 + " vs. " + feature_3)
    fig.show()
    print("x: " + feature_1)
    print("y: " + feature_2)
    print("z: " + feature_3)
    print("\n")
    
    tagged_training_year_statistics = statistics[statistics["season"] == training_year][["season", "player", feature_1, feature_2, feature_3]]
    tagged_training_year_statistics['all_star'] = tagged_training_year_statistics['player'].apply(lambda x: 1 if x in training_year_all_stars else 0)
    tagged_test_year_statistics = statistics[statistics["season"] == test_year][["season", "player", feature_1, feature_2, feature_3]]
    tagged_test_year_statistics['all_star'] = tagged_test_year_statistics['player'].apply(lambda x: 1 if x in test_year_all_stars else 0)
    
    X_train = list(zip(tagged_training_year_statistics[feature_1], tagged_training_year_statistics[feature_2], tagged_training_year_statistics[feature_3]))
    y_train = tagged_training_year_statistics['all_star']
    
    X_test = list(zip(tagged_test_year_statistics[feature_1], tagged_test_year_statistics[feature_2], tagged_test_year_statistics[feature_3]))
    y_test = tagged_test_year_statistics['all_star']
    
    knn = KNeighborsClassifier(n_neighbors=number_of_neighbors)
    knn.fit(X_train, y_train)
    
    y_pred = knn.predict(X_test)
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Precision:", precision_score(y_test, y_pred))
    print("Recall:", recall_score(y_test, y_pred))
    
    return (accuracy_score(y_test, y_pred), recall_score(y_test, y_pred))
    

In [261]:
knn_classify(5, 1992, 1993, "pts_per_game", "ast_per_game", "trb_per_game")

x: pts_per_game
y: ast_per_game
z: trb_per_game


Accuracy: 0.9615384615384616
Precision: 0.8666666666666667
Recall: 0.5


(0.9615384615384616, 0.8666666666666667)

In [259]:
all_stars = pd.read_csv('All-Star Selections.csv')
statistics = pd.read_csv("Player Per Game.csv")
statistics.drop_duplicates(inplace=True, subset=["player", "season"])
def logreg_classify(training_year, test_year, feature_1, feature_2, feature_3):
    training_year_all_stars = (all_stars[all_stars["season"] == training_year]["player"]).tolist()
    test_year_all_stars = (all_stars[all_stars["season"] == test_year]["player"]).tolist()
    
    training_year_all_star_stats = statistics[statistics["season"] == training_year].loc[statistics[statistics["season"] == training_year]["player"].isin(training_year_all_stars)][["season", "player", feature_1, feature_2, feature_3]]
    training_year_non_all_star_stats = statistics[statistics["season"] == training_year].loc[~statistics[statistics["season"] == training_year]["player"].isin(training_year_all_stars)][["season", "player", feature_1, feature_2, feature_3]]
    test_year_all_star_stats = statistics[statistics["season"] == test_year].loc[statistics[statistics["season"] == test_year]["player"].isin(test_year_all_stars)][["season", "player", feature_1, feature_2, feature_3]]
    test_year_non_all_star_stats = statistics[statistics["season"] == test_year].loc[~statistics[statistics["season"] == test_year]["player"].isin(test_year_all_stars)][["season", "player", feature_1, feature_2, feature_3]]
    
    fig = go.Figure()
    fig.add_trace(go.Scatter3d(x=training_year_all_star_stats[feature_1], y=training_year_all_star_stats[feature_2], z=training_year_all_star_stats[feature_3],
                                       mode='markers', text=training_year_all_star_stats['player'], name='all-stars'))
    fig.add_trace(go.Scatter3d(x=training_year_non_all_star_stats[feature_1], y=training_year_non_all_star_stats[feature_2], z=training_year_non_all_star_stats[feature_3],
                                       mode='markers', text=training_year_non_all_star_stats['player'], name='non-all-stars'))
    fig.update_layout()
    fig.layout.title.text = str(feature_1 + " vs. " + feature_2 + " vs. " + feature_3)
    fig.show()
    print("x: " + feature_1)
    print("y: " + feature_2)
    print("z: " + feature_3)
    print("\n")
    
    tagged_training_year_statistics = statistics[statistics["season"] == training_year][["season", "player", feature_1, feature_2, feature_3]]
    tagged_training_year_statistics['all_star'] = tagged_training_year_statistics['player'].apply(lambda x: 1 if x in training_year_all_stars else 0)
    tagged_test_year_statistics = statistics[statistics["season"] == test_year][["season", "player", feature_1, feature_2, feature_3]]
    tagged_test_year_statistics['all_star'] = tagged_test_year_statistics['player'].apply(lambda x: 1 if x in test_year_all_stars else 0)
    
    X_train = list(zip(tagged_training_year_statistics[feature_1], tagged_training_year_statistics[feature_2], tagged_training_year_statistics[feature_3]))
    y_train = tagged_training_year_statistics['all_star']
    
    X_test = list(zip(tagged_test_year_statistics[feature_1], tagged_test_year_statistics[feature_2], tagged_test_year_statistics[feature_3]))
    y_test = tagged_test_year_statistics['all_star']
    
    logreg = LogisticRegression(solver="saga", max_iter=10000)
    logreg.fit(X_train, y_train)
    
    y_pred = logreg.predict(X_test)
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Precision:", precision_score(y_test, y_pred))
    print("Recall:", recall_score(y_test, y_pred))
    
    return (accuracy_score(y_test, y_pred), recall_score(y_test, y_pred))


In [257]:
logreg_classify(1992, 1993, "pts_per_game", "ast_per_game", "trb_per_game")

x: pts_per_game
y: ast_per_game
z: trb_per_game


Accuracy: 0.9666666666666667
Precision: 0.8823529411764706
Recall: 0.5769230769230769


allstar_selections_data = pd.read_csv('All-Star Selections.csv')
allstar_2022 = allstar_selections_data[allstar_selections_data["season"] == 2022]
allstar_2022_list = allstar_2022["player"].tolist()
allstar_2023 = allstar_selections_data[allstar_selections_data["season"] == 2023]
allstar_2023_list = allstar_2023["player"].tolist()

In [9]:
# loading player statistics
player_per_game_data = pd.read_csv("Player Per Game.csv")
allstar_2022_player_data = player_per_game_data[player_per_game_data["season"] == 2022]
not_allstar_2022_player_data = allstar_2022_player_data.loc[~allstar_2022_player_data["player"].isin(allstar_2022_list)]
allstar_2022_player_data = allstar_2022_player_data.loc[allstar_2022_player_data["player"].isin(allstar_2022_list)]
allstar_2022_player_data = allstar_2022_player_data[["season", "player", "pos", "pts_per_game", "ast_per_game", "trb_per_game"]]


In [14]:
allstar_2022_player_data

Unnamed: 0,season,player,pos,pts_per_game,ast_per_game,trb_per_game
706,2022,Andrew Wiggins,SF,17.2,2.2,4.5
792,2022,Chris Paul,PG,14.7,10.8,4.4
836,2022,Darius Garland,PG,21.7,8.6,3.3
860,2022,Dejounte Murray,PG,21.1,9.2,8.3
862,2022,DeMar DeRozan,PF,27.9,4.9,5.2
882,2022,Devin Booker,SG,26.8,4.8,5.0
896,2022,Donovan Mitchell,SG,25.9,5.3,4.2
902,2022,Draymond Green,PF,7.5,7.0,7.3
931,2022,Fred VanVleet,PG,20.3,6.7,4.4
950,2022,Giannis Antetokounmpo,PF,29.9,5.8,11.6


In [6]:
allstar_2023_player_data = player_per_game_data[player_per_game_data["season"] == 2023]
not_allstar_2023_player_data = allstar_2023_player_data.loc[~allstar_2023_player_data["player"].isin(allstar_2023_list)]
allstar_2023_player_data = allstar_2023_player_data.loc[allstar_2023_player_data["player"].isin(allstar_2023_list)]
allstar_2023_player_data = allstar_2023_player_data[["season", "player", "pos", "pts_per_game", "ast_per_game", "trb_per_game"]]

In [8]:
tagged_2022_data = player_per_game_data[player_per_game_data["season"] == 2022]
tagged_2022_data = tagged_2022_data.drop_duplicates(subset=['player'], keep='first')
tagged_2022_data = tagged_2022_data[["season", "player", "pos", "pts_per_game", "ast_per_game", "trb_per_game"]]
tagged_2022_data['all-star'] = tagged_2022_data['player'].apply(lambda x: 1 if x in allstar_2022_list else 0)
tagged_2023_data = player_per_game_data[player_per_game_data["season"] == 2022]
tagged_2023_data = tagged_2023_data[["season", "player", "pos", "pts_per_game", "ast_per_game", "trb_per_game"]]
tagged_2023_data = tagged_2023_data.drop_duplicates(subset=['player'], keep='first')
tagged_2023_data['all-star'] = tagged_2023_data['player'].apply(lambda x: 1 if x in allstar_2023_list else 0)
x = tagged_2022_data['pts_per_game']
y = tagged_2022_data['ast_per_game']
z = tagged_2022_data['trb_per_game']
X_train = list(zip(x,y,z))
y_train = tagged_2022_data['all-star']

In [9]:
new_x = tagged_2023_data['pts_per_game'].tolist()
new_y = tagged_2023_data['ast_per_game'].tolist()
new_z = tagged_2023_data['trb_per_game'].tolist()
actual_allstars_2023 = tagged_2023_data['all-star'].tolist()
actual_allstars_names_2023 = tagged_2023_data['player'].tolist()
X_test = list(zip(new_x, new_y, new_z))

In [70]:
log_reg = LogisticRegression(solver="lbfgs")
log_reg.fit(X_train, y_train)

LogisticRegression()

In [67]:
predictions_2022 = log_reg.predict(X_train)
predictions_2023 = log_reg.predict(X_test)

In [68]:
total_allstar = len(allstar_2023_list)
total_non_allstar = len(tagged_2023_data) - total_allstar
true_allstar = len([(actual_allstars_names_2023[i], predictions_2023[i]) for i in range(len(predictions_2023)) if (predictions_2023[i] == actual_allstars_2023[i]) and predictions_2023[i] == 1])
false_allstar = len([(actual_allstars_names_2023[i], predictions_2023[i]) for i in range(len(predictions_2023)) if (predictions_2023[i] != actual_allstars_2023[i]) and predictions_2023[i] == 1])
true_non_allstar = len([(actual_allstars_names_2023[i], predictions_2023[i]) for i in range(len(predictions_2023)) if (predictions_2023[i] == actual_allstars_2023[i]) and predictions_2023[i] == 0])
false_non_allstar = len([(actual_allstars_names_2023[i], predictions_2023[i]) for i in range(len(predictions_2023)) if (predictions_2023[i] != actual_allstars_2023[i]) and predictions_2023[i] == 0])

In [69]:
print("Total Accuracy for 2023 all-stars", round(true_allstar/total_allstar, 2))

Total Accuracy for 2023 all-stars 0.59
