In [17]:
"""
Will be utilizing feature selection on the record data dataset "league_combined_with_chall_cleaned.csv"

Objective: to find the most relevant variables to improve Naive Bayes Classification model
"""

#import packages
import pandas as pd
import numpy as np
import scipy
from sklearn.model_selection import train_test_split
import itertools

In [70]:
#import the record dataset
record_data_path = '../../data/cleaned_riot_data/league_combined_with_chall_cleaned.csv'
combined_league_ranks = pd.read_csv(record_data_path, index_col=None)

In [71]:
"""
Should only have label with record data in the dataframe
"""

#modify the variables a bit to remove time and include it as a rate for other variables
#somewhat a feature extraction method :)
#do this for variables: kills, deaths, dmgObj, dmgTurr, vision_score, totalDmg, totalDmgTaken, totalMinions, gold
combined_league_ranks['kills/min'] = combined_league_ranks['kills']/combined_league_ranks['time']
combined_league_ranks['deaths/min'] = combined_league_ranks['deaths']/combined_league_ranks['time']
combined_league_ranks['dmgObj/min'] = combined_league_ranks['dmgObj']/combined_league_ranks['time']
combined_league_ranks['dmgTurr/min'] = combined_league_ranks['dmgTurr']/combined_league_ranks['time']
combined_league_ranks['vision_score/min'] = combined_league_ranks['vision_score']/combined_league_ranks['time']
combined_league_ranks['dmg/min'] = combined_league_ranks['totalDmg']/combined_league_ranks['time']
combined_league_ranks['dmgTaken/min'] = combined_league_ranks['totalDmgTaken']/combined_league_ranks['time']
combined_league_ranks['minions/min'] = combined_league_ranks['totalMinions']/combined_league_ranks['time']
combined_league_ranks['gold/min'] = combined_league_ranks['gold']/combined_league_ranks['time']


#remove the columns that did not include time as a rate
combined_league_ranks = combined_league_ranks.drop('kills', axis=1)
combined_league_ranks = combined_league_ranks.drop('deaths', axis=1)
combined_league_ranks = combined_league_ranks.drop('dmgObj', axis=1)
combined_league_ranks = combined_league_ranks.drop('dmgTurr', axis=1)
combined_league_ranks = combined_league_ranks.drop('vision_score', axis=1)
combined_league_ranks = combined_league_ranks.drop('totalDmg', axis=1)
combined_league_ranks = combined_league_ranks.drop('totalDmgTaken', axis=1)
combined_league_ranks = combined_league_ranks.drop('totalMinions', axis=1)
combined_league_ranks = combined_league_ranks.drop('gold', axis=1)

#also remove time column as its included in time dependent variables
#also remove position and win column as its not record data
combined_league_ranks = combined_league_ranks.drop('time', axis=1)
combined_league_ranks = combined_league_ranks.drop('position', axis=1)
combined_league_ranks = combined_league_ranks.drop('win', axis=1)

In [72]:
"""
Before feature selecting, we want to first split the original dataset into training, validation, and testing sets with a 80%/10%/10% split.
This is done with scikit-learn's train_test_split function.
This is important so that the model doesn't overfit.
"""

#split combined_league_ranks first with sklearn function into training and validation/testing (80%/20%)
# Use train_test_split function from scikit-learn
# Use random_state = 42 for static seed
combined_train_df, combined_vali_test_df = train_test_split(combined_league_ranks, test_size=0.2, random_state=42)

#split combined_vali_test_df to split into validation and testing sets (50%/50%)
combined_vali_df, combined_test_df = train_test_split(combined_vali_test_df, test_size=0.5, random_state=42)


In [66]:
# adding the merit and maximize merit functions from lab 3.2

def merit(x,y,correlation="pearson"):
    # x=matrix of features
    # y=matrix (or vector) of targets
    # correlation="pearson" or "spearman"

    # INSERT CODE HERE
    k = x.shape[1]
    if correlation == "spearman":
      spearman_correlations = np.array([scipy.stats.spearmanr(x[:, i], y.flatten()).correlation for i in range(x.shape[1])])
      r_xy = np.mean(spearman_correlations)
      correlation_matrix, _ = scipy.stats.spearmanr(x, axis=0)
      r_xx = np.mean(correlation_matrix[~np.eye(correlation_matrix.shape[0], dtype=bool)])
      num = k * r_xy
      denom = np.sqrt(k + k * (k - 1) * r_xx)
      merit = num/denom
      return merit
    else:
      pearson_correlations = np.array([scipy.stats.pearsonr(x[:, i], y.flatten()).correlation for i in range(x.shape[1])])
      r_xy = np.mean(pearson_correlations)
      pearson_correlation_matrix = np.zeros((x.shape[1], x.shape[1]))
      for i in range(x.shape[1]):
          for j in range(x.shape[1]):
              corr, _ = scipy.stats.pearsonr(x[:, i], x[:, j])
              pearson_correlation_matrix[i, j] = corr
      r_xx = np.mean(pearson_correlation_matrix[~np.eye(pearson_correlation_matrix.shape[0], dtype=bool)])
      num = k * r_xy
      denom = np.sqrt(k + k * (k - 1) * r_xx)
      merit = num/denom
      return merit

def maximize_CFS(x,y):
     top_merit = 0
     count = 0
     current_subset = 0
     # INSERT CODE HERE
     list1 = [*range(1,x.shape[1])]; #print(list1)
     for L in range(1,len(list1) + 1):
          for subset in itertools.combinations(list1, L):
               new_x = x[:, [idx - 1 for idx in subset]]
               current_merit = merit(new_x,y,correlation="pearson")
               if current_merit > top_merit:
                    top_merit = current_merit
                    count += 1
                    current_subset = list(subset)
                    print("found new max merit: ", top_merit)
                    print("optimal features = ", current_subset)
                    print("iteration = ", count, current_subset, top_merit)
               else:
                    top_merit
                    count
     return top_merit, count, current_subset

In [73]:
"""
After splitting the dataset into training, validation, and testing I make my target array y_test containing the ranks of each index of match data.
"""

#since there are multiple string labels used for ranks, they should be converted into numericals
#iron = 0, bronze = 1, silver = 2, gold = 3, platinum = 4, emerald = 5, diamond = 6, challenger = 7
y_train = []
for i in range(0,combined_train_df.shape[0]):
    #convert strings to int tags
    #row i and column 0 (rank)
    if (combined_train_df.iloc[i,0] == 'iron'):
        y_train.append(0)
    if (combined_train_df.iloc[i,0] == 'bronze'):
        y_train.append(1)
    if (combined_train_df.iloc[i,0] == 'silver'):
        y_train.append(2)
    if (combined_train_df.iloc[i,0] == 'gold'):
        y_train.append(3)
    if (combined_train_df.iloc[i,0] == 'platinum'):
        y_train.append(4)
    if (combined_train_df.iloc[i,0] == 'emerald'):
        y_train.append(5)
    if (combined_train_df.iloc[i,0] == 'diamond'):
        y_train.append(6)
    if (combined_train_df.iloc[i,0] == 'challenger'):
        y_train.append(7)


In [74]:
"""
Now we want to subset the training dataset and typecast them into arrays to input into custom function called merit to test it out.
"""

#subset combined_league_ranks to include everything but the label column "rank"
combined_train_features = combined_train_df.iloc[:, 1:]
#typecast new features subset into array for merit function
x_train = combined_train_features.values
#also typecast target list into array
y_train = np.array(y_train)

#test merit function on x_record and y_record to find merit of using all features
#compare "spearman" and "pearson" correlations
print(merit(x_train, y_train, correlation='spearman'))
print(merit(x_train, y_train, correlation='pearson'))


0.1811654963201674
0.20199468597917908


In [75]:
"""
Since merit function seems to work, I now will run through the training set features to see which subset of features maximizes merit. 
This will feature select which training set features are most important in correlating with the target array (ranks)
"""

#use the maximize CFS to iterrate through all combinations of features then outputs subset of features that maximize merit
maximize_CFS(x_train, y_train)

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


found new max merit:  0.01497226167743923
optimal features =  [1, 2]
iteration =  1 [1, 2] 0.01497226167743923
found new max merit:  0.08760598416913734
optimal features =  [1, 3]
iteration =  2 [1, 3] 0.08760598416913734
found new max merit:  0.15295848815963234
optimal features =  [1, 5]
iteration =  3 [1, 5] 0.15295848815963234
found new max merit:  0.1804032826105836
optimal features =  [3, 5]
iteration =  4 [3, 5] 0.1804032826105836
found new max merit:  0.21731546518230835
optimal features =  [3, 8]
iteration =  5 [3, 8] 0.21731546518230835
found new max merit:  0.25191565673706656
optimal features =  [5, 6]
iteration =  6 [5, 6] 0.25191565673706656
found new max merit:  0.30437172388655104
optimal features =  [5, 8]
iteration =  7 [5, 8] 0.30437172388655104
found new max merit:  0.33380613045134044
optimal features =  [3, 5, 8]
iteration =  8 [3, 5, 8] 0.33380613045134044


(0.33380613045134044, 8, [3, 5, 8])

#### Conclusion part 1
From the "maximize_CSF" function to see which features maximizes the performance metrics, we see that the subset of features numbers [3,5,6] outputs the highest merit. Thus after correlation-based feature selection we can conclude that features 3, 5, and 8 (Objective Damage per minute, vision score per minute, and minions farmed per minute) give the highest merit and thus should be used for Naive Bayes classification.<br>
I also want to test what features are outputted if I were to remove the positions "jungle" and "utility" as they have the more odd values compared to the other positions (top, middle, and bottom lanes) that show more consistent values with each other.

In [76]:
#repeat everything but this time after removing jungle and utility positions
#re-import data to redo the process
combined_league_ranks = pd.read_csv(record_data_path, index_col=None)

#re-clean
combined_league_ranks['kills/min'] = combined_league_ranks['kills']/combined_league_ranks['time']
combined_league_ranks['deaths/min'] = combined_league_ranks['deaths']/combined_league_ranks['time']
combined_league_ranks['dmgObj/min'] = combined_league_ranks['dmgObj']/combined_league_ranks['time']
combined_league_ranks['dmgTurr/min'] = combined_league_ranks['dmgTurr']/combined_league_ranks['time']
combined_league_ranks['vision_score/min'] = combined_league_ranks['vision_score']/combined_league_ranks['time']
combined_league_ranks['dmg/min'] = combined_league_ranks['totalDmg']/combined_league_ranks['time']
combined_league_ranks['dmgTaken/min'] = combined_league_ranks['totalDmgTaken']/combined_league_ranks['time']
combined_league_ranks['minions/min'] = combined_league_ranks['totalMinions']/combined_league_ranks['time']
combined_league_ranks['gold/min'] = combined_league_ranks['gold']/combined_league_ranks['time']

combined_league_ranks = combined_league_ranks.drop('kills', axis=1)
combined_league_ranks = combined_league_ranks.drop('deaths', axis=1)
combined_league_ranks = combined_league_ranks.drop('dmgObj', axis=1)
combined_league_ranks = combined_league_ranks.drop('dmgTurr', axis=1)
combined_league_ranks = combined_league_ranks.drop('vision_score', axis=1)
combined_league_ranks = combined_league_ranks.drop('totalDmg', axis=1)
combined_league_ranks = combined_league_ranks.drop('totalDmgTaken', axis=1)
combined_league_ranks = combined_league_ranks.drop('totalMinions', axis=1)
combined_league_ranks = combined_league_ranks.drop('gold', axis=1)

combined_league_ranks = combined_league_ranks.drop('time', axis=1)
combined_league_ranks = combined_league_ranks.drop('win', axis=1)


#remove jungle and utility positions
combined_league_ranks = combined_league_ranks[combined_league_ranks['position'] != 'jungle']
combined_league_ranks = combined_league_ranks[combined_league_ranks['position'] != 'utility']
#then remove position column
combined_league_ranks = combined_league_ranks.drop('position', axis=1)


#split process
combined_train_df, combined_vali_test_df = train_test_split(combined_league_ranks, test_size=0.2, random_state=42)
#split again for vali and testing
combined_vali_df, combined_test_df = train_test_split(combined_vali_test_df, test_size=0.5, random_state=42)


#remake y_train
#iron = 0, bronze = 1, silver = 2, gold = 3, platinum = 4, emerald = 5, diamond = 6, challenger = 7
y_train = []
for i in range(0,combined_train_df.shape[0]):
    #convert strings to int tags
    #row i and column 0 (rank)
    if (combined_train_df.iloc[i,0] == 'iron'):
        y_train.append(0)
    if (combined_train_df.iloc[i,0] == 'bronze'):
        y_train.append(1)
    if (combined_train_df.iloc[i,0] == 'silver'):
        y_train.append(2)
    if (combined_train_df.iloc[i,0] == 'gold'):
        y_train.append(3)
    if (combined_train_df.iloc[i,0] == 'platinum'):
        y_train.append(4)
    if (combined_train_df.iloc[i,0] == 'emerald'):
        y_train.append(5)
    if (combined_train_df.iloc[i,0] == 'diamond'):
        y_train.append(6)
    if (combined_train_df.iloc[i,0] == 'challenger'):
        y_train.append(7)


#typecast again and get x_train
#subset combined_league_ranks to include everything but the label column "rank"
combined_train_features = combined_train_df.iloc[:, 1:]
#typecast new features subset into array for merit function
x_train = combined_train_features.values
#also typecast target list into array
y_train = np.array(y_train)


#finally feature select again with maximize_CFS function
maximize_CFS(x_train, y_train)

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


found new max merit:  0.018211033515251038
optimal features =  [1, 2]
iteration =  1 [1, 2] 0.018211033515251038
found new max merit:  0.04726896631776993
optimal features =  [1, 3]
iteration =  2 [1, 3] 0.04726896631776993
found new max merit:  0.06600750656808484
optimal features =  [1, 4]
iteration =  3 [1, 4] 0.06600750656808484
found new max merit:  0.10733537084682764
optimal features =  [1, 5]
iteration =  4 [1, 5] 0.10733537084682764
found new max merit:  0.2983176267940218
optimal features =  [1, 8]
iteration =  5 [1, 8] 0.2983176267940218
found new max merit:  0.3445614331024441
optimal features =  [2, 8]
iteration =  6 [2, 8] 0.3445614331024441
found new max merit:  0.36711165733632656
optimal features =  [5, 8]
iteration =  7 [5, 8] 0.36711165733632656


(0.36711165733632656, 7, [5, 8])

#### Conclusion part 2
We see that still features 5 and 8 (vision score per minute and minions farmed per minute) are the most defining features in maximizing merit after removing "jungle" and "utility" positions. So I will continue to use those features in Naive Bayes. As for feature 3 (objective damage per minute), I will include it still for Naive Bayes to first examine how well it works in the model when including the "jungle" position.