In [93]:
import pandas as pd 
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
import math

In [94]:
def add_win_ratio(dataset_name):
  """Load a CSV dataset and append the target win-to-game ratio column.

  Args:
    dataset_name: Path of the CSV file (or any other source accepted by
      ``pd.read_csv``). The file must contain the columns ``"W"`` (wins) and
      ``"G"`` (games played).

  Returns:
    The loaded DataFrame with an additional ``"W:G"`` column holding
    ``W / G`` rounded to three decimal places.

  Raises:
    KeyError: If the ``"W"`` or ``"G"`` column is missing.
  """
  data = pd.read_csv(dataset_name)
  # One vectorized division replaces the per-row Python loop; it also does not
  # depend on the frame having a 0..n-1 integer index.
  data["W:G"] = (data["W"] / data["G"]).round(3)
  return data

In [95]:
def remove_WGS(data):
  """Strip the W, G, SEED, POSTSEASON and YEAR columns from ``data``.

  The columns are deleted from the passed DataFrame itself (in place), one
  after another, and that same object is returned.

  Raises:
    KeyError: If one of the columns is not present. Columns listed before
      the missing one have already been deleted at that point.
  """
  for column in ("W", "G", "SEED", "POSTSEASON", "YEAR"):
    del data[column]
  return data

In [96]:
def weight_PS(data):
  """Convert the postseason round labels to weighted numeric values.

  The ``"POSTSEASON"`` column of ``data`` is rewritten in place using the
  weights R68=1, R64=2, R32=3, S16=4, E8=5, F4=6, 2ND=7, Champions=8.
  Missing entries (float NaN / None) are mapped to 0, as is the literal
  string ``"NaN"``. Any other value is left unchanged.

  Args:
    data: DataFrame containing a ``"POSTSEASON"`` column.

  Returns:
    The same DataFrame object, with ``"POSTSEASON"`` replaced by the weights.
  """
  round_weights = {
    "NaN": 0,
    "R68": 1,
    "R64": 2,
    "R32": 3,
    "S16": 4,
    "E8": 5,
    "F4": 6,
    "2ND": 7,
    "Champions": 8,
  }

  def to_weight(label):
    # A real missing value never compares equal to the string "NaN", so it has
    # to be detected explicitly to receive the "no postseason" weight.
    if pd.isna(label):
      return 0
    # Unknown labels fall through untouched.
    return round_weights.get(label, label)

  # Mapping the whole column works for any index and avoids element-wise
  # assignment on a slice of the frame.
  data["POSTSEASON"] = data["POSTSEASON"].map(to_weight)
  return data


In [97]:
def forward_selection(data, significance_level=0.01):
  """Select significant features for predicting W:G by forward selection.

  Every column of ``data`` except ``"TEAM"``, ``"CONF"`` and the target
  ``"W:G"`` is a candidate. In each round every remaining candidate is added
  in turn to the already selected features, an OLS model with an intercept is
  fitted against ``"W:G"``, and the candidate whose own coefficient has the
  lowest p-value wins the round (the earliest candidate wins ties).

  The winner of the first round is always selected. In later rounds the winner
  is only selected while its p-value is below ``significance_level``; the
  search stops at the first round that fails this check or when no candidates
  are left.

  Args:
    data: DataFrame with a ``"W:G"`` column; all candidate columns must be
      convertible to float.
    significance_level: p-value threshold a candidate has to beat from the
      second round on.

  Returns:
    List of selected column names, in the order they were selected. It never
    contains ``"TEAM"``, ``"CONF"`` or ``"W:G"``.
  """
  excluded = {"TEAM", "CONF", "W:G"}
  target = data["W:G"].to_numpy()
  # Filtering the excluded columns out up front keeps them from ever being
  # popped into the selection once the real candidates are exhausted.
  candidates = [column for column in data.columns.tolist() if column not in excluded]
  selected = []

  def newest_p_value(columns):
    # Fit OLS on `columns` plus an intercept and return the p-value of the
    # last column, i.e. the candidate currently being evaluated.
    design = sm.add_constant(data.loc[:, columns])
    result = sm.OLS(target, design.astype(float)).fit()
    # Read the value by position through NumPy: the p-values are labelled by
    # column name, so integer indexing on the Series itself is ambiguous.
    return np.asarray(result.pvalues)[-1]

  while candidates:
    best_p_value = None
    best_position = -1
    for position, candidate in enumerate(candidates):
      p_value = newest_p_value(selected + [candidate])
      if best_p_value is None or p_value < best_p_value:
        best_p_value = p_value
        best_position = position

    # The first feature is taken unconditionally; later ones must be significant.
    if selected and best_p_value >= significance_level:
      break
    selected.append(candidates.pop(best_position))

  return selected

In [98]:
def standardize_data(data):
  """Standardize the forward-selected feature columns of ``data``.

  Args:
    data: DataFrame accepted by ``forward_selection`` (it must contain the
      ``"W:G"`` target column).

  Returns:
    A new DataFrame holding the ``StandardScaler``-transformed values of the
    selected features, with the selected feature names as columns and a fresh
    default index. The ``"W:G"`` column itself is not part of the result.
  """
  # Run the selection once and reuse it for both the column subset and the
  # output labels; every call refits a large number of OLS models.
  selected = forward_selection(data)
  scaler = StandardScaler()
  standardized = scaler.fit_transform(data[selected].to_numpy(), data["W:G"])
  return pd.DataFrame(standardized, columns=selected)

In [99]:
def visualization(data):
  """Plot the first (at most five) selected features against W:G.

  For each of those features the name is printed, the standardized feature
  values are scattered against the unscaled ``"W:G"`` values together with a
  degree-1 least-squares fit line, and the figure is saved under the name
  ``<feature>_W:G``.

  Args:
    data: DataFrame accepted by ``forward_selection`` and
      ``standardize_data``.
  """
  selected = forward_selection(data)
  win_ratio = data["W:G"].to_numpy()
  standardized = standardize_data(data)

  # Only the five earliest selected features are plotted.
  for feature in selected[:5]:
    print(feature)
    feature_values = standardized[feature].to_numpy()

    fig, ax = plt.subplots(figsize=(9, 9))
    ax.scatter(win_ratio, feature_values, alpha=0.7, edgecolors="k")
    ax.set_title(f'{feature} VS W:G')
    ax.set_xlabel("W:G")
    ax.set_ylabel(feature)

    # polyfit returns the highest-degree coefficient first: slope, then intercept.
    slope, intercept = np.polyfit(win_ratio, feature_values, deg=1)
    ax.plot(win_ratio, intercept + slope * win_ratio, color="k", lw=2.5)
    fig.savefig(f'{feature}_W:G')

In [111]:
# Load the raw CSV and append the "W:G" target column.
data = add_win_ratio("cbb.csv")
# Drop the W, G, SEED, POSTSEASON and YEAR columns.
data = remove_WGS(data)
# Standardized frame of the forward-selected features, with the unscaled W:G
# target attached as an extra column.
# NOTE(review): `stand` is not referenced again in the cells visible here.
stand = standardize_data(data)
W_G = data["W:G"]
stand["W:G"] = W_G
# Write the unstandardized selected features plus the target to disk.
fwd_sel = forward_selection(data)
fwd_sel.append("W:G")
data[fwd_sel].to_csv("cleaned_cbb.csv")
# Print, plot and save a scatter figure for at most five selected features.
visualization(data)

  x = pd.concat(x[::order], 1)
  x = pd.concat(x[::order], 1)


In [None]:
# Request a download of /content/file.zip through google.colab's `files` helper.
# NOTE(review): the identical path is requested five times, and nothing in the
# visible cells creates /content/file.zip — confirm both are intentional.
from google.colab import files
for i in range(5):
  files.download("/content/file.zip")