# Monster ranking system
## A Hunter College machine learning course project
---

## §0 Preliminaries

### 0.1 Import libraries

In [1]:
# It is nice to remove the deprecation warnings.
# They really distract from the important output!
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)


# Can't start without Python's triumvirate...
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Grab some nice utilities.
import functools, itertools, operator

# For the useful visualizations of the feature set.
import seaborn as sns

# The domain specific dependencies.
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, cross_val_predict
from sklearn.preprocessing import LabelBinarizer
from sklearn import metrics
from scipy import stats

# A random seed, but fixed seed to use in randomized function calls.
STATIC_SEED = 0xf90b36c2

### 0.2  Define some useful functions

In [2]:
###
# Define the utility function I wrote as "My Library"
###


# Given a Pandas data frame, partition the data frame into two segements.
# The first segment contains all but the last column.
# The second segment contains only the last column.
# The partitioned data frame represents the feature observation matrix
def seperate_data(data_frame):
    # Shuffle the input data to ensure 
    # there are no ordering biases
    data_frame  = data_frame.sample(frac=1, random_state=STATIC_SEED)
    labelColumn = data_frame.columns[-1]
    X = data_frame.loc[:, data_frame.columns != labelColumn]
    Y = data_frame[labelColumn]
    return X, Y

# Takes a feature observation matrix and a label vector along with
# a specification for the relative sizes of the requested partitions.
# Returns the inputs partitioned into 4 sets, respectively:
#   - Full Training
#   - Partial Training
#   - Validation
#   - Testing
#
# The partitions have the following relationships:
#   - Full Training ∪ Testing = Input
#   - Full Training = Partial Training ∪ Validation
#
# Intended to be convieient for model selection and tuning.
def train_valid_test(X_in, Y_in, validation_size, test_size):
    splitter = lambda x, y, n: train_test_split(x, y, test_size=n, random_state=STATIC_SEED)
    X_full,  X_test,  Y_full,  Y_test  = splitter(X_in, Y_in, test_size)
    X_train, X_valid, Y_train, Y_valid = splitter(X_full, Y_full, validation_size)
    return X_full, X_train, X_valid, X_test, Y_full, Y_train, Y_valid, Y_test

# Define a reusable descriptor for data sets.
# Nicely renders the dimensions of the provided data set.
def describe_data_set(X, label):
    rStr = str(X.shape[0])
    cStr = str(X.shape[1])
    mLen = max(len(rStr),len(cStr))
    print(label)
    print(" ",rStr.rjust(mLen), "observations")
    print(" ",cStr.rjust(mLen), "features")

# I defined this function to perform binary classification using
# the computed threshhold value from ROC curve analysis.
# However it performs worse than the default "predict" function.
# It remains here for reference.
def predict_with_threshold(clf, X, t):
    # First we define a transformation;
    # taking a classification probability and
    # returning a binary classification
    # based on the supplied optimal threshold.
    exceeds_threshold = lambda x: 1 if (max(x) - min(x)) >= t else 0

    # Prediction probabilities of the test data set.
    Y = clf.predict_proba(X)

    # Binary classifiactions
    return np.fromiter(map(exceeds_threshold, Y), np.int32)

# Nicely render a confusion matrix.
# Much better than looking at directly at the numpy array.
def inspect_confusion_matrix(Y_true, Y_pred):
    matrix = metrics.confusion_matrix(Y_true, Y_pred)
    maxVal = max(np.concatenate(matrix).flat, key=lambda x: x)
    padLen = len(str(maxVal))
    
    print("Confusion matrix:")
    for row in matrix:
        print("  ", sep='', end='')
        sum = 0
        for col in row:
            sum += col
            print(str(col).rjust(padLen), " ", sep='', end='')
        print("|=", sum)
                
# We tune the model by determining which hyperparamaters perform best.
def model_selection(classifier, param_grid, X_train_part, Y_train_part):
    result_classifier = GridSearchCV(classifier, param_grid, scoring='accuracy', cv=4, verbose=1, n_jobs=-1)
    result_classifier.fit(X_train_part, Y_train_part)
    best_hyperparameters = result_classifier.best_params_
    print("Best accuracy score found:  ", round(result_classifier.best_score_,4))
    return best_hyperparameters


def dropColumn(df, colName):
    if colName in df.columns:
        df.drop(colName, 1, inplace=True)


def setType(df, colName, colType):
    df[colName] = df[colName].astype(colType)
        

def oneHotBitEncodeColumn(df, colName, prefix=None):
    if colName not in df.columns:
        return
    spot = df.columns.get_loc(colName)
    cols = pd.get_dummies(df[colName], prefix)
    ordered = sorted(cols.columns)
    ordered.reverse()
    for name in ordered:
        df.insert(spot, name, cols[name])
    df.drop(colName, 1, inplace=True)


def inclusionBitEncodeColumn(df, colName, prefix=None):
    if colName not in df.columns:
        return
    values  = pd.DataFrame(df[colName].values.tolist()).stack().values
    uniques = functools.reduce(lambda a,b: set(b.split(',')).union(a), values, set())
    colSpot = df.columns.get_loc(colName)
    ordered = sorted(uniques)
    ordered.reverse()
    for val in ordered:
        col = df[colName].map(lambda x: val in set(str(x).split(',')))
        pref = colName
        if prefix is not None:
            pref = prefix
        df.insert(colSpot, pref + "_" + val, col.astype(np.uint8))
    df.drop(colName, 1, inplace=True)

### 0.3  Load data.

In [3]:
item_data = pd.read_csv('data/dnd-5e-monsters.csv',sep=',')

### 0.4 Do a quick check of the data and print out the following information about the data.

### 0.4.1 Get dimension of data

In [4]:
describe_data_set(item_data,'Raw Data')

Raw Data
  1386 observations
    72 features


### 0.4.2 Look at top 5 rows

In [5]:
item_data.head()

Unnamed: 0,Name,Type,Size,Armor,Hit Points,Move Burrow,Move Climb,Move Fly,Move Swim,Move Walk,...,Cause Prone,Cause Restrained,Cause Stunned,Cause Unconscious,Multiattack,Spellcasting,Damage Tags,Spellcasting Tags,Trait Tags,Elo Rank
0,Ahmaergo,humanoid,2,18,143,0,0,0,0,25,...,0,0,0,0,1,0,"P,S",,,2762
1,Ammalia Cassalanter,humanoid,2,15,45,0,0,0,0,30,...,0,0,0,0,0,1,B,CW,,1957
2,Awakened Rat,beast,0,10,1,0,0,0,0,20,...,0,0,0,0,0,0,P,,Keen Senses,-1625
3,Barnibus Blastwind,humanoid,2,13,24,0,0,0,0,30,...,0,0,0,0,0,1,P,CW,,-468
4,Bepis Honeymaker,humanoid,1,10,4,0,0,0,0,25,...,0,0,0,0,0,0,,,,-1710


### 0.4.3  Get a description of the measurements

In [6]:
item_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1386 entries, 0 to 1385
Data columns (total 72 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Name                 1386 non-null   object
 1   Type                 1386 non-null   object
 2   Size                 1386 non-null   int64 
 3   Armor                1386 non-null   int64 
 4   Hit Points           1386 non-null   int64 
 5   Move Burrow          1386 non-null   int64 
 6   Move Climb           1386 non-null   int64 
 7   Move Fly             1386 non-null   int64 
 8   Move Swim            1386 non-null   int64 
 9   Move Walk            1386 non-null   int64 
 10  Stat Str             1386 non-null   int64 
 11  Stat Dex             1386 non-null   int64 
 12  Stat Con             1386 non-null   int64 
 13  Stat Int             1386 non-null   int64 
 14  Stat Wis             1386 non-null   int64 
 15  Stat Cha             1386 non-null   int64 
 16  Save S

### 0.4.4 Get the class distribution.

In [7]:
item_data.describe()

Unnamed: 0,Size,Armor,Hit Points,Move Burrow,Move Climb,Move Fly,Move Swim,Move Walk,Stat Str,Stat Dex,...,Cause Paralyzed,Cause Petrified,Cause Poisoned,Cause Prone,Cause Restrained,Cause Stunned,Cause Unconscious,Multiattack,Spellcasting,Elo Rank
count,1386.0,1386.0,1386.0,1386.0,1386.0,1386.0,1386.0,1386.0,1386.0,1386.0,...,1386.0,1386.0,1386.0,1386.0,1386.0,1386.0,1386.0,1386.0,1386.0,1386.0
mean,2.258297,14.559163,79.018759,0.551948,2.489177,9.238817,4.61039,29.336219,14.575758,13.352092,...,0.076479,0.013709,0.070707,0.123377,0.108225,0.051227,0.038961,0.551948,0.311688,1464.601732
std,0.968653,2.902961,80.71099,3.889518,8.358499,22.744412,13.543342,11.574256,5.771119,3.370522,...,0.265859,0.11632,0.256427,0.328988,0.310777,0.220539,0.193572,0.497474,0.46335,1471.981762
min,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1903.0
25%,2.0,13.0,22.0,0.0,0.0,0.0,0.0,30.0,11.0,11.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,373.25
50%,2.0,15.0,52.0,0.0,0.0,0.0,0.0,30.0,15.0,14.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1427.0
75%,3.0,16.0,110.0,0.0,0.0,0.0,0.0,30.0,18.0,15.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,2445.75
max,5.0,25.0,676.0,50.0,40.0,150.0,120.0,120.0,30.0,28.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,6147.0


## §1 Feature Extraction & Selection

### 1.1 Generate a bar plot to display the class distribution.

In [8]:
%%time

skip = True

# Calculate correlation coefficient
def corrfunc(x, y, **kws):
    r, _ = stats.pearsonr(x, y)
    ax = plt.gca()
    ax.annotate("r = {:.2f}".format(r),
                xy=(.1, .6), xycoords=ax.transAxes,
               size = 24)

if not skip:
    cmap = sns.cubehelix_palette(light=1, dark = 0.1, hue = 0.5, as_cmap=True)
    sns.set_context(font_scale=2)
    # Pair grid set up
    g = sns.PairGrid(item_data)
    # Scatter plot on the upper triangle
    g.map_upper(plt.scatter, s=10, color = 'red')
    # Distribution on the diagonal
    g.map_diag(sns.displot, kde=False, color = 'red')
    # Density Plot and Correlation coefficients on the lower triangle
    g.map_lower(sns.kdeplot, cmap = cmap)
    g.map_lower(corrfunc);

CPU times: user 1e+03 ns, sys: 1 µs, total: 2 µs
Wall time: 5.25 µs


### 1.5 Transform 'Weapon Properties' measurment

In [9]:
inclusionBitEncodeColumn(item_data, 'weaponProperty')

## §2 Feature Selection

### 2.1 Drop null dominated columns

In [10]:
# These columns are are over 99.9% null
#dropColumn(item_data, 'conditionImmunities')
#dropColumn(item_data, 'damageImmunities'   )
#dropColumn(item_data, 'damage1Type_O'      )
#dropColumn(item_data, 'damage1Type_N'      )
#dropColumn(item_data, 'damage1Type_R'      )

### 2.2 Drop string labeling columns

In [11]:
dropColumn(item_data, 'name'          )
#dropColumn(item_data, 'baseItem'      )
#dropColumn(item_data, 'itemType'      )
#dropColumn(item_data, 'attachedSpells')

## §3 Data Preparation

### 3.1 Impute data

In [12]:
nanCols = {'armorClassFixed', 'damage1', 'damage2', 'range_normal', 'range_long'}

for col in nanCols:
    item_data[col] = item_data[col].fillna(0)


KeyError: 'range_long'

### 3.2 Reduce bit widths

In [None]:
ubits16 =   { ''
            , ''
            }

for col in ubits8:
    setType(item_data, col, np.uint8)

for col in ubits16:
    setType(item_data, col, np.uint16)

In [None]:
item_data.info()

### 3.1 Create Train and Test Dataset.

In [None]:
# Split off the last column as the label vector.
# NOTE: See implementation of 'seperate_data' in first python cell.
X, Y = seperate_data(item_data)

### 3.2 Create Train and Test Dataset.

In [None]:
# Partition out data into training, validation and testing sets.
# NOTE: See implementation of 'train_valid_test' in first python cell.
X_train_full, X_train_part, X_valid, X_test, Y_train_full, Y_train_part, Y_valid, Y_test = train_valid_test(X, Y, 0.2, 0.2)

## §4 Naive Bayes

### 4.1 Multinomial Naive Bayes

In [None]:
%%time

# Define a reusable descriptor of a data set
def describe_data_set(X, label):
    rStr = str(X.shape[0])
    cStr = str(X.shape[1])
    mLen = max(len(rStr),len(cStr))
    print(label)
    print(" ",rStr.rjust(mLen), "observations")
    print(" ",cStr.rjust(mLen), "features")

# First, we want to determine the best hyperparameters.
# To do so we generate a list of potential values.
param_grid =    { 'alpha'     : [10**(i - 4) for i in range(0,9)]
                , 'fit_prior' : [False, True]
                }

best_hyperparameters =  { 'alpha': 0.0001
                        , 'fit_prior': True
                        }

# If we don't already have best parameters...
# Let's go find them!
if best_hyperparameters == None:
    best_hyperparameters = model_selection(MultinomialNB(), param_grid, X_train_part, Y_train_part)

classifier_NBM = MultinomialNB(**best_hyperparameters)
Y_score = classifier_NBM.fit(X_train_part, Y_train_part) 

print("Built the Multinomial Naïve Bayes model")
print("Using hyperparameters:")
for k,v in best_hyperparameters.items():
    print(" ",k,"=",v)
describe_data_set(X_train_part, "Partial training set containing:")
print()

In [None]:
import dataset_transforms as datum                                                                                                                                                                                                                                                                                            
import numpy              as np                                                                                                                                                                                                                                                                                               
import matplotlib.pyplot as plt                                                                                                                                                                                                                                                                                               
import seaborn as sn                                                                                                                                                                                                                                                                                                          
                                                                                                                                                                                                                                                                                                                              
                                                                                                                                                                                                                                                                                                                              
                                                                                                                                                                                                                                                                                                                              
# The domain specific dependencies.                                                                                                                                                                                                                                                                                           
from sklearn.naive_bayes import MultinomialNB                                                                                                                                                                                                                                                                                 
#from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, cross_val_predict                                                                                                                                                                                                                       
from sklearn.preprocessing import LabelBinarizer                                                                                                                                                                                                                                                                              
from sklearn import metrics                                                                                                                                                                                                                                                                                                   
from scipy   import stats                                                                                                                                                                                                                                                                                                     
                                                                                                                                                                                                                                                                                                                              
                                                                                                                                                                                                                                                                                                                              
monster_data = datum.retreive_monster_dataset(tagged_damage=True, tagged_trait=True, standardized_label_classes=5)                                                                                                                                                                                                            
                                                                                                                                                                                                                                                                                                                              
#print(monster_data.iloc[: ,20:40].describe())                                                                                                                                                                                                                                                                                 
                                                                                                                                                                                                                                                                                                                            