# Magic Item Ranking Wizard
## A Hunter College machine learning course project
---

## §0 Preliminaries

### 0.1 Import libraries

In [1]:
# It is nice to remove the deprecation warnings.
# They really distract from the important output!
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)


# Can't start without Python's triumvirate...
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Grab some nice utilities.
import functools, itertools, operator

# For the useful visualizations of the feature set.
import seaborn as sns

# The domain specific dependencies.
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, cross_val_predict
from sklearn.preprocessing import LabelBinarizer
from sklearn import metrics
from scipy import stats

# A fixed (though arbitrarily chosen) seed, so that randomized function calls are reproducible.
STATIC_SEED = 0xf90b36c2

### 0.2  Define some useful functions

In [2]:
###
# Define the utility function I wrote as "My Library"
###


# Split a Pandas data frame into a feature observation matrix and a label
# vector, shuffling the rows first.
#   - The feature matrix holds every column except the last one.
#   - The label vector holds only the last column.
# Returns the pair (features, labels).
def seperate_data(data_frame):
    # Sampling the whole frame (frac=1) yields a row permutation; the fixed
    # seed keeps that permutation reproducible while removing ordering biases.
    shuffled = data_frame.sample(frac=1, random_state=STATIC_SEED)
    label_name = shuffled.columns[-1]
    is_feature = shuffled.columns != label_name
    return shuffled.loc[:, is_feature], shuffled[label_name]

# Partition a feature observation matrix and its label vector into four
# sets, convenient for model selection and tuning:
#   - Full Training
#   - Partial Training
#   - Validation
#   - Testing
#
# The sets relate to each other as follows:
#   - Full Training ∪ Testing = Input
#   - Full Training = Partial Training ∪ Validation
#
# 'test_size' is the share of the input held out for testing.
# 'validation_size' is the share of the *full training* set (not of the
# input) held out for validation, because the second split is applied to
# the result of the first.
#
# Returns the four X sets followed by the four matching Y sets.
def train_valid_test(X_in, Y_in, validation_size, test_size):
    # Both splits share the same fixed seed so the partitions are reproducible.
    def split(features, labels, held_out):
        return train_test_split(features, labels, test_size=held_out, random_state=STATIC_SEED)

    X_full, X_test, Y_full, Y_test = split(X_in, Y_in, test_size)
    X_train, X_valid, Y_train, Y_valid = split(X_full, Y_full, validation_size)
    return X_full, X_train, X_valid, X_test, Y_full, Y_train, Y_valid, Y_test

# Print a short summary of a data set's dimensions.
# 'label' is printed first as a heading, followed by the row count and the
# column count of 'X' (read from 'X.shape'), right-aligned to a common width.
def describe_data_set(X, label):
    row_text = str(X.shape[0])
    col_text = str(X.shape[1])
    width = max(len(row_text), len(col_text))
    print(label)
    print(f"  {row_text:>{width}} observations")
    print(f"  {col_text:>{width}} features")

# Perform binary classification with a custom decision threshold, e.g. the
# optimal threshold value computed from ROC curve analysis.
#
# Parameters:
#   clf - a fitted binary classifier exposing 'predict_proba'
#   X   - the observations to classify
#   t   - the decision threshold; an observation is labelled 1 when its
#         predicted positive-class probability is at least 't'
#
# Returns a numpy int32 array with one 0/1 label per observation.
# Raises ValueError when 'predict_proba' does not yield exactly two columns.
#
# NOTE: An earlier revision compared 'max(p) - min(p)' against the threshold.
# That quantity measures how confident the classifier is in *either* class,
# so confidently negative observations were labelled 1. That mismatch is the
# likely reason it performed worse than the default "predict" function.
def predict_with_threshold(clf, X, t):
    # Prediction probabilities, one row per observation.
    probabilities = np.asarray(clf.predict_proba(X))

    # Nothing to classify.
    if probabilities.size == 0:
        return np.empty(0, dtype=np.int32)

    if probabilities.ndim != 2 or probabilities.shape[1] != 2:
        raise ValueError("predict_with_threshold requires a binary classifier "
                         "(predict_proba must return exactly two columns)")

    # NOTE(review): assumes column 1 holds the positive class probability,
    # i.e. that the positive label is 'clf.classes_[1]' - confirm for the
    # classifier in use.
    positive = probabilities[:, 1]

    # Binary classifications
    return (positive >= t).astype(np.int32)

# Pretty-print the confusion matrix of the given true and predicted labels.
# Every count is right-aligned to the width of the largest count, and each
# row is followed by "|=" and the row's total.
# Much easier to read than the raw numpy array.
def inspect_confusion_matrix(Y_true, Y_pred):
    counts = metrics.confusion_matrix(Y_true, Y_pred)
    # Width of the widest cell, so that the columns line up.
    cell_width = len(str(max(counts.flat)))

    print("Confusion matrix:")
    for row in counts:
        cells = "".join(str(cell).rjust(cell_width) + " " for cell in row)
        print("  " + cells + "|=", sum(row))
                
# Tune a model by grid-searching 'param_grid' for the hyperparameters that
# give the best accuracy under 4-fold cross validation on the supplied
# training data. Prints the best score found (rounded to 4 decimal places)
# and returns the winning hyperparameters as a dictionary.
def model_selection(classifier, param_grid, X_train_part, Y_train_part):
    search = GridSearchCV(
        classifier,
        param_grid,
        scoring='accuracy',
        cv=4,
        verbose=1,
        n_jobs=-1,
    )
    search.fit(X_train_part, Y_train_part)
    print("Best accuracy score found:  ", round(search.best_score_, 4))
    return search.best_params_


# Remove the column named 'colName' from 'df' in place.
# Does nothing when the column is absent, so a cell calling this can be
# re-run without raising.
def dropColumn(df, colName):
    if colName in df.columns:
        # The axis is selected by keyword: passing it positionally
        # ('df.drop(colName, 1)') is rejected by pandas 2.x.
        df.drop(columns=colName, inplace=True)


# Cast the column 'colName' of 'df' to 'colType', replacing the column in place.
def setType(df, colName, colType):
    converted = df[colName].astype(colType)
    df[colName] = converted
        

# Replace the column 'colName' of 'df', in place, by one 0/1 indicator
# column per distinct value (one-hot encoding).
#
# The indicator columns take the position of the original column, sorted by
# name. When 'prefix' is given the new columns are named '<prefix>_<value>',
# otherwise they are named after the value alone.
# Does nothing when 'colName' is absent, so the calling cell can be re-run.
def oneHotBitEncodeColumn(df, colName, prefix=None):
    if colName not in df.columns:
        return
    spot = df.columns.get_loc(colName)
    # Request uint8 explicitly: newer pandas versions default to bool
    # indicators, while the rest of the notebook expects compact integers.
    indicators = pd.get_dummies(df[colName], prefix=prefix, dtype=np.uint8)
    # Inserting in ascending order at increasing offsets keeps the new
    # columns sorted and pushes the original column to the right.
    for offset, name in enumerate(sorted(indicators.columns)):
        df.insert(spot + offset, name, indicators[name])
    # Keyword axis: the positional form 'df.drop(colName, 1)' is rejected by
    # pandas 2.x.
    df.drop(columns=colName, inplace=True)


# Replace the column 'colName' of 'df', in place, by one 0/1 (uint8)
# membership column per distinct value found in its comma separated cells.
#
# A cell such as "a,b" sets the columns '<prefix>_a' and '<prefix>_b' to 1
# for that row; missing cells set none. 'prefix' defaults to the column's
# own name. The new columns take the position of the original column,
# sorted by value.
# Does nothing when 'colName' is absent, so the calling cell can be re-run.
def inclusionBitEncodeColumn(df, colName, prefix=None):
    if colName not in df.columns:
        return
    pref = colName if prefix is None else prefix

    # Split every cell exactly once. Missing cells are handled explicitly
    # rather than relying on 'stack()' to discard them.
    def tokens(cell):
        return set(str(cell).split(',')) if pd.notna(cell) else set()

    tokens_per_row = df[colName].map(tokens)
    uniques = set().union(*tokens_per_row)

    colSpot = df.columns.get_loc(colName)
    # Inserting in ascending order at increasing offsets keeps the new
    # columns sorted and pushes the original column to the right.
    for offset, val in enumerate(sorted(uniques)):
        col = tokens_per_row.map(lambda present: val in present)
        df.insert(colSpot + offset, pref + "_" + val, col.astype(np.uint8))
    # Keyword axis: the positional form 'df.drop(colName, 1)' is rejected by
    # pandas 2.x.
    df.drop(columns=colName, inplace=True)

### 0.3  Load data.

In [3]:
item_data = pd.read_csv('magic-items.csv',sep=',')

### 0.4 Do a quick check of the data and print out the following information about the data.

### 0.4.1 Get dimension of data

In [4]:
describe_data_set(item_data,'Raw Data')

Raw Data
  1446 observations
    34 features


### 0.4.2 Look at top 5 rows

In [5]:
item_data.head()

Unnamed: 0,name,rarity,attunementRequired,cursed,sentient,weight,baseItem,itemType,conditionImmunities,damageImmunities,...,requiredStrength,stealth,poison,tattoo,savingThrowBonus,spellAttackBonus,spellSaveDCBonus,charges,recharge,attachedSpells
0,Antimatter Rifle,none,0,0,0,10.0,,R,,,...,0,0,0,0,0,0,0,0,,
1,Arrow,none,0,0,0,0.05,,A,,,...,0,0,0,0,0,0,0,0,,
2,Arrows (20),none,0,0,0,1.0,,A,,,...,0,0,0,0,0,0,0,0,,
3,Automatic Pistol,none,0,0,0,3.0,,R,,,...,0,0,0,0,0,0,0,0,,
4,Automatic Rifle,none,0,0,0,8.0,,R,,,...,0,0,0,0,0,0,0,0,,


### 0.4.3  Get a description of the measurements

In [6]:
item_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1446 entries, 0 to 1445
Data columns (total 34 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   name                 1446 non-null   object 
 1   rarity               1446 non-null   object 
 2   attunementRequired   1446 non-null   int64  
 3   cursed               1446 non-null   int64  
 4   sentient             1446 non-null   int64  
 5   weight               1446 non-null   float64
 6   baseItem             118 non-null    object 
 7   itemType             905 non-null    object 
 8   conditionImmunities  3 non-null      object 
 9   damageImmunities     7 non-null      object 
 10  damageResistances    87 non-null     object 
 11  firearm              1446 non-null   int64  
 12  range                61 non-null     object 
 13  weapon               1446 non-null   int64  
 14  weaponBonus          1446 non-null   int64  
 15  weaponCategory       184 non-null    o

### 0.4.4 Get summary statistics of the numeric measurements.

In [7]:
item_data.describe()

Unnamed: 0,attunementRequired,cursed,sentient,weight,firearm,weapon,weaponBonus,grantsProficiency,armor,armorClassBonus,armorClassFixed,requiredStrength,stealth,poison,tattoo,savingThrowBonus,spellAttackBonus,spellSaveDCBonus,charges
count,1446.0,1446.0,1446.0,1446.0,1446.0,1446.0,1446.0,1446.0,1446.0,1446.0,60.0,1446.0,1446.0,1446.0,1446.0,1446.0,1446.0,1446.0,1446.0
mean,0.366528,0.014523,0.021438,5.192067,0.006916,0.059474,0.140387,0.017981,0.00899,0.039419,11.933333,0.142462,0.019364,0.01314,0.03112,0.019364,0.045643,0.030429,0.659751
std,0.482023,0.119674,0.144891,29.850221,0.082901,0.236592,0.554629,0.132927,0.094423,0.245024,5.662444,1.442953,0.137848,0.113912,0.173703,0.161004,0.312361,0.263935,2.99395
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,-2.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,11.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,14.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,1.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,15.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,600.0,1.0,1.0,3.0,1.0,1.0,3.0,18.0,15.0,1.0,1.0,1.0,2.0,3.0,3.0,50.0


## §1 Feature Extraction & Selection

### 1.1 Generate a pair plot of the features and their correlations (skipped by default).

In [8]:
%%time

skip = True

# Annotate the current axes with the Pearson correlation coefficient of
# 'x' and 'y', formatted to two decimal places.
# Additional keyword arguments are accepted but not used.
def corrfunc(x, y, **kws):
    coefficient, _p_value = stats.pearsonr(x, y)
    axes = plt.gca()
    # The position is expressed in axes coordinates, so the text lands in
    # the same relative spot on every panel.
    axes.annotate(
        f"r = {coefficient:.2f}",
        xy=(.1, .6),
        xycoords=axes.transAxes,
        size=24,
    )

# The pair grid is only drawn when 'skip' is switched off.
if not skip:
    cmap = sns.cubehelix_palette(light=1, dark=0.1, hue=0.5, as_cmap=True)
    sns.set_context(font_scale=2)
    # Pair grid set up
    g = sns.PairGrid(item_data)
    # Scatter plot on the upper triangle
    g.map_upper(plt.scatter, s=10, color='red')
    # Distribution on the diagonal.
    # 'map_diag' needs an axes-level function that draws onto the currently
    # active axes; 'sns.displot' is figure-level and would open a new figure
    # for every panel, so the axes-level 'sns.histplot' is used instead.
    g.map_diag(sns.histplot, kde=False, color='red')
    # Density Plot and Correlation coefficients on the lower triangle
    g.map_lower(sns.kdeplot, cmap=cmap)
    g.map_lower(corrfunc);

CPU times: user 2 µs, sys: 2 µs, total: 4 µs
Wall time: 5.48 µs


### 1.2 Transform 'Rarity' measurement

In [9]:
# Collapse the rarity labels "unknown (magic)" and "varies" into the single
# label "unknown"; every other value is returned unchanged.
def rarity_transform(x):
    return "unknown" if x in ("unknown (magic)", "varies") else x

item_data['rarity'] = item_data['rarity'].map(rarity_transform)

print(pd.DataFrame(item_data.rarity.values.tolist()).stack().value_counts())

oneHotBitEncodeColumn(item_data, 'rarity')

none         530
rare         254
uncommon     193
very rare    163
legendary    114
common        82
unknown       68
artifact      42
dtype: int64


### 1.3 Transform 'Range' measurement

In [10]:
# Split the "<normal>/<long>" range strings into two nullable integer columns.
# The guard makes the cell safe to re-run, like the other transformation cells.
if 'range' in item_data.columns:
    # 'n' and the drop axis are passed by keyword: pandas 2.x no longer
    # accepts them positionally.
    item_data[['range_normal', 'range_long']] = item_data['range'].str.split('/', n=1, expand=True)
    item_data.drop(columns='range', inplace=True)
    # Go through float first so that missing values survive the conversion
    # to the nullable 'Int64' dtype.
    item_data['range_normal'] = item_data['range_normal'].astype('float').astype('Int64')
    item_data['range_long'  ] = item_data['range_long'  ].astype('float').astype('Int64')

### 1.4 Transform 'Weapon Category' measurement

In [11]:
oneHotBitEncodeColumn(item_data, 'weaponCategory', 'weaponCategory')

### 1.5 Transform 'Weapon Properties' measurement

In [12]:
inclusionBitEncodeColumn(item_data, 'weaponProperty')

### 1.6 Transform 'Damage' measurements

In [13]:
# Convert a damage description in dice notation into its average roll.
#   "2D6" -> 2 * (6 + 1) / 2 = 7.0   (two six-sided dice)
#   "5"   -> 5.0                     (flat damage, no dice)
#   ""    -> 0
# The dice separator is matched case-insensitively ("2d6" works too).
# Anything that is not a string (e.g. the NaN of a missing value) is
# returned unchanged, so it can be imputed later.
# Raises ValueError when the dice count or the number of sides is not numeric.
def averageDamage(s):
    if not isinstance(s, str):
        return s
    text = s.strip()
    if not text:
        return 0
    toks = text.upper().split('D')
    if len(toks) == 1:
        return float(toks[0])
    # The average of one die with n sides is (n + 1) / 2.
    return float(toks[0]) * (float(toks[1]) + 1) / 2

# Replace, in place, every entry of the column 'colName' of 'df' by the
# result of 'averageDamage'. Columns whose dtype is not 'object' are left
# untouched.
def averageDamageEncode(df, colName):
    column = df[colName]
    if column.dtypes != object:
        return
    df[colName] = column.map(averageDamage)

averageDamageEncode(item_data, 'damage1')
averageDamageEncode(item_data, 'damage2')

### 1.7 Transform 'Damage Type' measurements

In [14]:
oneHotBitEncodeColumn(item_data, 'damage1Type', 'damage1Type')

### 1.8 Transform 'Recharge' measurements

In [15]:
item_data['recharge'] = item_data['recharge'].notnull().astype(np.uint8)

### 1.9 Transform 'Damage Resistances' measurements

In [16]:
inclusionBitEncodeColumn(item_data, 'damageResistances', 'resist')
item_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1446 entries, 0 to 1445
Data columns (total 72 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   name                    1446 non-null   object 
 1   artifact                1446 non-null   uint8  
 2   common                  1446 non-null   uint8  
 3   legendary               1446 non-null   uint8  
 4   none                    1446 non-null   uint8  
 5   rare                    1446 non-null   uint8  
 6   uncommon                1446 non-null   uint8  
 7   unknown                 1446 non-null   uint8  
 8   very rare               1446 non-null   uint8  
 9   attunementRequired      1446 non-null   int64  
 10  cursed                  1446 non-null   int64  
 11  sentient                1446 non-null   int64  
 12  weight                  1446 non-null   float64
 13  baseItem                118 non-null    object 
 14  itemType                905 non-null    

## §2 Feature Selection

### 2.1 Drop null dominated columns

In [17]:
# These columns are over 99% null in the raw data
# (conditionImmunities: 3 of 1446 non-null, damageImmunities: 7 of 1446 non-null).
# NOTE(review): the damage1Type_* columns are one-hot indicators produced in §1.7;
# presumably they are dropped for being almost entirely zero — confirm.
dropColumn(item_data, 'conditionImmunities')
dropColumn(item_data, 'damageImmunities'   )
dropColumn(item_data, 'damage1Type_O'      )
dropColumn(item_data, 'damage1Type_N'      )
dropColumn(item_data, 'damage1Type_R'      )

### 2.2 Drop string labeling columns

In [18]:
dropColumn(item_data, 'name'          )
dropColumn(item_data, 'baseItem'      )
dropColumn(item_data, 'itemType'      )
dropColumn(item_data, 'attachedSpells')

### 2.3 Filter out rows with a negative saving throw bonus

In [19]:
item_data.drop( item_data[ item_data['savingThrowBonus'] < 0 ].index , inplace=True)

## §3 Data Preparation

### 3.1 Impute data

In [20]:
# Columns whose missing values are replaced by 0.
# 'armorClassFixed' has only 60 non-null values in the raw data, and the two
# range columns were split from 'range', which has only 61 non-null values.
# NOTE(review): presumably 'damage1' and 'damage2' are likewise missing for
# most items — confirm.
nanCols = {'armorClassFixed', 'damage1', 'damage2', 'range_normal', 'range_long'}

# Fill every listed column; 'armorClassFixed' and the range columns are cast
# to unsigned integer types in the next step.
for col in nanCols:
    item_data[col] = item_data[col].fillna(0)


### 3.2 Reduce bit widths

In [21]:
# Columns to be stored as unsigned 8-bit integers.
# NOTE: 'savingThrowBonus' has a minimum of -2 in the raw data; it is only
# representable as an unsigned type because the rows with a negative bonus
# were dropped in §2.3.
ubits8 =    { 'attunementRequired'
            , 'armorClassFixed'
            , 'cursed'
            , 'sentient'
            , 'firearm'
            , 'weapon'
            , 'weaponBonus'
            , 'grantsProficiency'
            , 'armor'
            , 'armorClassBonus'
            , 'requiredStrength'
            , 'stealth'
            , 'poison'
            , 'tattoo'
            , 'savingThrowBonus'
            , 'spellAttackBonus'
            , 'spellSaveDCBonus'
            , 'charges'
            }

# Columns to be stored as unsigned 16-bit integers.
# NOTE(review): presumably some ranges exceed 255 and so do not fit in 8 bits — confirm.
ubits16 =   { 'range_normal'
            , 'range_long'
            }

# Apply the narrower dtypes column by column.
for col in ubits8:
    setType(item_data, col, np.uint8)

for col in ubits16:
    setType(item_data, col, np.uint16)

In [22]:
item_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1445 entries, 0 to 1445
Data columns (total 63 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   artifact                1445 non-null   uint8  
 1   common                  1445 non-null   uint8  
 2   legendary               1445 non-null   uint8  
 3   none                    1445 non-null   uint8  
 4   rare                    1445 non-null   uint8  
 5   uncommon                1445 non-null   uint8  
 6   unknown                 1445 non-null   uint8  
 7   very rare               1445 non-null   uint8  
 8   attunementRequired      1445 non-null   uint8  
 9   cursed                  1445 non-null   uint8  
 10  sentient                1445 non-null   uint8  
 11  weight                  1445 non-null   float64
 12  resist_acid             1445 non-null   uint8  
 13  resist_bludgeoning      1445 non-null   uint8  
 14  resist_cold             1445 non-null   

### 3.3 Separate the features from the labels.

In [23]:
# Split off the last column as the label vector.
# NOTE: See implementation of 'seperate_data' in first python cell.
X, Y = seperate_data(item_data)

### 3.4 Create Train, Validation and Test Datasets.

In [24]:
# Partition out data into training, validation and testing sets.
# NOTE: See implementation of 'train_valid_test' in first python cell.
X_train_full, X_train_part, X_valid, X_test, Y_train_full, Y_train_part, Y_valid, Y_test = train_valid_test(X, Y, 0.2, 0.2)

## §4 Naive Bayes

### 4.1 Multinomial Naive Bayes

In [31]:
%%time

# Print a short summary of a data set's dimensions.
# 'label' is printed first as a heading, followed by the row count and the
# column count of 'X' (read from 'X.shape'), right-aligned to a common width.
def describe_data_set(X, label):
    row_text = str(X.shape[0])
    col_text = str(X.shape[1])
    width = max(len(row_text), len(col_text))
    print(label)
    print(f"  {row_text:>{width}} observations")
    print(f"  {col_text:>{width}} features")

# First, we want to determine the best hyperparameters.
# To do so we generate a list of potential values:
# nine powers of ten for 'alpha' (1e-4 through 1e4), each tried with and
# without fitted class priors.
param_grid =    { 'alpha'     : [10**(i - 4) for i in range(0,9)]
                , 'fit_prior' : [False, True]
                }

# NOTE(review): presumably the result of an earlier grid search, hard-coded
# to avoid repeating it. Set this to None to run the search again.
best_hyperparameters =  { 'alpha': 0.0001
                        , 'fit_prior': True
                        }

# If we don't already have best parameters...
# Let's go find them!
# ('is None' is the identity test; '== None' can be overridden by the operand.)
if best_hyperparameters is None:
    best_hyperparameters = model_selection(MultinomialNB(), param_grid, X_train_part, Y_train_part)

classifier_NBM = MultinomialNB(**best_hyperparameters)
# NOTE: 'fit' returns the fitted estimator itself, so despite its name
# 'Y_score' is another reference to 'classifier_NBM', not a score.
# The name is kept in case later cells refer to it.
Y_score = classifier_NBM.fit(X_train_part, Y_train_part)

print("Built the Multinomial Naïve Bayes model")
print("Using hyperparameters:")
for k,v in best_hyperparameters.items():
    print(" ",k,"=",v)
describe_data_set(X_train_part, "Partial training set containing:")
print()

Built the Multinomial Naïve Bayes model
Using hyperparameters:
  alpha = 0.0001
  fit_prior = True
Partial training set containing:
  924 observations
   62 features

CPU times: user 14.1 ms, sys: 23.5 ms, total: 37.6 ms
Wall time: 8.86 ms
