In [1]:
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LinearRegression
import numpy as np

# Read in Data

In [2]:
# Runs the data-preparation notebook before anything else in this notebook.
# NOTE(review): later cells read `df_merged_train_data`, which is not defined in
# this notebook, so it is presumably created by this %run -- TODO confirm.
%run "Data Collection and Cleaning.ipynb"

<class 'pandas.core.frame.DataFrame'>
Int64Index: 120 entries, 0 to 119
Data columns (total 54 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   abbreviation                           120 non-null    object 
 1   assists                                120 non-null    int64  
 2   blocks                                 120 non-null    int64  
 3   defensive_rebounds                     120 non-null    int64  
 4   field_goal_attempts                    120 non-null    int64  
 5   field_goal_percentage                  120 non-null    float64
 6   field_goals                            120 non-null    int64  
 7   free_throw_attempts                    120 non-null    int64  
 8   free_throw_percentage                  120 non-null    float64
 9   free_throws                            120 non-null    int64  
 10  games_played                           120 non-null    int64  
 11  minute



In [3]:
# Work on a copy so the merged frame itself is never modified by this notebook.
# Column selection and the train/test split are done in the next cell.
all_var_df = df_merged_train_data.copy()

# NOTE: an earlier, commented-out version of this cell selected the feature
# columns here and split the rows with train_test_split(random_state=42).
# It was superseded by the per-team season hold-out in the next cell.



In [4]:
# Columns kept for modelling: the two identifier columns (which become the index),
# the feature columns, and the target column "W".
relevant_cols = [
    "abbreviation", "Season", "assists", 'defensive_rebounds', 'field_goal_percentage',
    'field_goals', 'opp_assists', 'opp_blocks', 'opp_defensive_rebounds', 'opp_field_goal_percentage',
    'opp_points', 'opp_three_point_field_goal_percentage', 'opp_total_rebounds',
    'opp_two_point_field_goal_percentage',
    'points', 'three_point_field_goal_percentage', 'total_rebounds', 'two_point_field_goal_percentage', 'W',
]

# Fixed seed so that Restart & Run All reproduces the same split and the same scores.
SPLIT_SEED = 42
split_rng = np.random.RandomState(SPLIT_SEED)

# For every team, draw one season from 2017, 2018 or 2019 (upper bound is exclusive).
# That team's row for the drawn season is held out for testing; its other rows are
# used for training.
held_out_season = {
    team: int(split_rng.randint(2017, 2020))
    for team in all_var_df.abbreviation.unique()
}
is_test_row = all_var_df.Season == all_var_df.abbreviation.map(held_out_season)

# A boolean mask replaces the per-team DataFrame.append calls: append was removed in
# pandas 2.0 and re-copied the growing frame on every iteration.
# sample(frac=1) returns every row in shuffled order.
train = all_var_df[~is_test_row].sample(frac=1, random_state=split_rng)
test = all_var_df[is_test_row].sample(frac=1, random_state=split_rng)

# Features are indexed by (team, season); the target is a column vector of wins.
x_train = train[relevant_cols].set_index(['abbreviation', 'Season']).drop("W", axis = 1)
y_train = train["W"].values.reshape(-1, 1)

x_test = test[relevant_cols].set_index(['abbreviation', 'Season']).drop("W", axis = 1)
y_test = test["W"].values.reshape(-1, 1)

In [5]:
# Fit a linear regression on the training split, then predict on both splits
# so that each one can be scored in the cells below.
regr = LinearRegression()
regr.fit(x_train, y_train)

y_predict_train, y_predict_test = regr.predict(x_train), regr.predict(x_test)

Because sklearn's `accuracy_score` counts only exact matches, it cannot be applied directly here: it would compare a decimal prediction to an integer win total. So we test in two different ways. The first test rounds each prediction to the nearest integer and scores the rounded values with `accuracy_score`. The second test measures the rate of predictions that fall within X wins of the true value, where X is user-defined.

Test #1

In [6]:
def rounded_accuracy(y_actual, y_pred):
    """Score predictions by exact match after rounding them to whole numbers.

    Args:
        y_actual: True target values; passed unchanged to ``accuracy_score``.
        y_pred: Predicted values, either a flat sequence or a column vector
            such as an ``(n, 1)`` array.

    Returns:
        The value returned by ``accuracy_score`` for the rounded predictions.
    """
    # Flatten first: iterating an (n, 1) array yields 1-element arrays, and
    # calling float() on those is deprecated since NumPy 1.25.
    flat_pred = np.asarray(y_pred, dtype=float).ravel()

    # np.rint rounds halves to the nearest even integer, like built-in round().
    round_y_predict_arr = np.rint(flat_pred).astype(int)

    return accuracy_score(y_actual, round_y_predict_arr)

In [7]:
# Exact-match accuracy of the rounded predictions on each split.
train_accuracy = rounded_accuracy(y_actual=y_train, y_pred=y_predict_train)
test_accuracy = rounded_accuracy(y_actual=y_test, y_pred=y_predict_test)

print("Training Accuracy based on rounded: ", train_accuracy)
print("Test Accuracy based on rounded: ", test_accuracy)

Training Accuracy based on rounded:  0.06666666666666667
Test Accuracy based on rounded:  0.06666666666666667


Test #2

In [8]:
def calc_accuracy_rate(match_bool):
    """Return the fraction of truthy entries in ``match_bool``.

    Args:
        match_bool: Iterable of match flags (any iterable, including a
            generator); each entry is interpreted by its truth value.

    Returns:
        Number of truthy entries divided by the total number of entries, or
        0.0 when ``match_bool`` is empty (instead of raising
        ``ZeroDivisionError``).
    """
    # Materialise once so generators work and the length is known.
    outcomes = [bool(match) for match in match_bool]
    if not outcomes:
        return 0.0

    return sum(outcomes) / len(outcomes)

In [9]:
#Read linear Regression Model's Accuracy
def get_accuracy(num_wins_window, y_predict, y_actual):
    """Return the share of predictions within ``num_wins_window`` of the actual value.

    A prediction counts as a success when ``abs(prediction - actual)`` is at
    most ``num_wins_window``. For example, with a window of 5 and an actual
    value of 65, any prediction from 60 to 70 is a success. Widen or narrow
    the window to match the desired level of accuracy.

    Args:
        num_wins_window: Largest absolute difference still counted as a success.
        y_predict: Predicted values; a flat sequence or a column vector.
        y_actual: Actual values; same number of entries as ``y_predict``.

    Returns:
        Fraction of successes as a float, or 0.0 when there are no predictions.

    Raises:
        ValueError: If ``y_predict`` and ``y_actual`` differ in number of entries.
    """
    # Flatten both inputs so (n,) and (n, 1) inputs are compared entry by entry.
    predicted = np.asarray(y_predict, dtype=float).ravel()
    actual = np.asarray(y_actual, dtype=float).ravel()

    # Fail loudly instead of silently ignoring unmatched entries.
    if predicted.size != actual.size:
        raise ValueError(
            "y_predict and y_actual must have the same number of entries "
            "(got {} and {})".format(predicted.size, actual.size)
        )
    if predicted.size == 0:
        return 0.0

    within_window = np.abs(predicted - actual) <= num_wins_window
    return int(np.count_nonzero(within_window)) / within_window.size




In [10]:
# Training-split hit rate for progressively wider win windows.
train_accuracy_0, train_accuracy_3, train_accuracy_5, train_accuracy_10 = (
    get_accuracy(window, y_predict_train, y_train) for window in (0, 3, 5, 10)
)

print("Exact Accuracy:", train_accuracy_0)
print("Rate of correct wins within 3:", train_accuracy_3)
print("Rate of correct wins within 5:", train_accuracy_5)
print("Rate of correct wins within 10:", train_accuracy_10)


Exact Accuracy: 0.0
Rate of correct wins within 3: 0.2833333333333333
Rate of correct wins within 5: 0.45
Rate of correct wins within 10: 0.7666666666666667


In [11]:
# Test-split hit rate for progressively wider win windows.
(
    test_accuracy_0,
    test_accuracy_1,
    test_accuracy_3,
    test_accuracy_5,
    test_accuracy_10,
) = (get_accuracy(window, y_predict_test, y_test) for window in (0, 1, 3, 5, 10))

print("Exact Accuracy:", test_accuracy_0)
print("Rate of correct wins within 1:", test_accuracy_1)
print("Rate of correct wins within 3:", test_accuracy_3)
print("Rate of correct wins within 5:", test_accuracy_5)
print("Rate of correct wins within 10:", test_accuracy_10)

Exact Accuracy: 0.0
Rate of correct wins within 1: 0.06666666666666667
Rate of correct wins within 3: 0.3
Rate of correct wins within 5: 0.4
Rate of correct wins within 10: 0.7333333333333333
