In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# DnD 5e Monsters CR Analysis
DnD 5e has a plethora of monsters in its universe. A DM who is designing a combat encounter for their players usually builds the encounter by using monsters with a **Challenge Rating (CR)** that is similar to the adventuring party's average level. However, many DMs like to create their own monsters to bring something unique to their encounter. But how do you decide a custom monster's CR? To answer this question and to better understand which characteristics are driving factors I will be building a machine learning model to predict a monster's CR.

# Data Files
I'll be using and modifying the Dataset uploaded by [mrpantherson][1]. This data was scraped from [AideDD][2] and additional information on the dataset can be found [here][3]. In addition a second anonymous dataset was used to update some missing information and it can be found [here][4]

[1]: https://www.kaggle.com/mrpantherson
[2]: https://www.aidedd.org/
[3]: https://www.kaggle.com/mrpantherson/dnd-5e-monsters
[4]: https://docs.google.com/spreadsheets/d/1FIjaz6S0JXrXaCVhHEDeq-nH7xHzlqAx6inuRbDjhjU/edit?usp=sharing

# Formatting the Data
The 'Name' columns in both datasets need to be reformatted so that they can merge correctly. Additionally, some of the columns need to have their data type changed to reflect their numerical nature. The target column 'cr' needs to have certain values modified before its data type can be changed (some of the values are fractions denoted as strings like '1/4'). 'cr' also needs to be separated from the dataframe since it is the target.

In [None]:
# read in datasets
monsters = pd.read_csv('/kaggle/input/dnd-5e-monsters/dnd_monsters.csv')
ability_scores = pd.read_csv('/kaggle/input/dnd-monsters-ability-scores/DnD Monster Ability Scores.csv')

# reformat the name column in both datasets so dataframes can merge properly on 'Name'.
# Both sides get the same normalisation (lower case, hyphens -> spaces); the ability
# score names are lower-cased, so any upper-case character left in the monster names
# could never match. regex=False because '-' is a literal character, not a pattern.
ability_scores['Name'] = ability_scores['Name'].str.lower().str.replace('-', ' ', regex=False)
monsters = monsters.rename(columns={'name': 'Name'})
monsters['Name'] = monsters['Name'].str.lower().str.replace('-', ' ', regex=False)

# remove obsolete columns that will be filled from other dataset
monsters = monsters.drop(columns=['str', 'dex', 'con', 'int', 'wis', 'cha'])

# set Int64 (nullable integer) data type to reflect the numerical nature of these columns
ability_scores = ability_scores.astype(
    {score: 'Int64' for score in ['STR', 'DEX', 'CON', 'INT', 'WIS', 'CHA']})

# convert the string fractions to decimal and set 'cr' as the float64 data type
for fraction, value in {'1/8': .125, '1/4': .25, '1/2': .5}.items():
    monsters.loc[monsters['cr'] == fraction, 'cr'] = value
monsters['cr'] = monsters['cr'].astype('float64')

# remove rows with 0 or null for 'cr' (they are considered errors) and separate target
# from predictors. The index is reset so that 'cr' and 'monsters_full' carry the same
# 0..n-1 labels as the merged predictor frame created below.
monsters = monsters[monsters['cr'].notna() & (monsters['cr'] != 0)].reset_index(drop=True)
monsters_full = monsters.copy()  # for later use
cr = monsters['cr']
monsters = monsters.drop(columns=['cr'])

# convert binary categorical variable to 1's and 0's.
# Assigning the whole column (instead of a chained `fillna(..., inplace=True)` on a
# column selection) is guaranteed to update the frame and yields a real integer dtype.
# NOTE(review): assumes 'Legendary' is the only non-null value in this column;
# any other value is mapped to 0.
monsters['legendary'] = (monsters['legendary'] == 'Legendary').astype('int64')

# merge data sets on 'Name' column
monsters = monsters.merge(ability_scores, on='Name', how='left')

# the predictors must still line up row-for-row with the target; duplicate names in
# the ability score data would silently add rows here
assert len(monsters) == len(cr), "merge changed the number of rows (duplicate 'Name' in ability_scores?)"

## Features
Important information for each column from the monsters dataframe is listed below:

In [None]:
# per-column overview: number of distinct values, number of missing values, and dtype
column_summary = pd.DataFrame({
    'Unique_Values': monsters.nunique(),
    'Missing_Values': monsters.isna().sum(),
    'Data_Type': monsters.dtypes,
})
print(column_summary)
print('Total Rows:', monsters.shape[0])

## Columns to ignore
'name', 'url', 'align', and 'source' will not be useful in this model. Correlations could be identified between both 'align' and 'source' to 'cr', but they do not serve as useful predictors.

In [None]:
# identifier / provenance columns are not used as predictors
monsters = monsters.drop(columns=['Name', 'url', 'align', 'source'])

## Columns to adjust
'type' could be a useful categorical attribute to the model. However, there is a problem: subtypes are included in parentheses in certain rows, causing the column to have too high a cardinality for effective one-hot encoding. See below:

In [None]:
# number of distinct (non-null) values in 'type'
monsters['type'].nunique()

The cardinality can be greatly reduced by removing the subtypes in parentheses from each string value.

In [None]:
# keep only the text before the first space, which discards the "(subtype)" suffix
base_type = monsters['type'].str.split(' ', n=1).str.get(0)
monsters['type'] = base_type
monsters['type'].value_counts()

The types of monsters have lost some specificity but they can now be used as a feature in the model.

## Columns that remain unchanged
'ac', 'hp', 'speed', the ability scores ('STR', 'DEX', 'CON', 'INT', 'WIS', 'CHA'), and 'legendary' were unchanged besides data type formatting.



# Missing Values
Some of the columns in the dataframe are missing values. The columns with categorical data are not actually missing data, but the missing numerical values in 'STR', 'DEX', 'CON', 'INT', 'WIS', 'CHA' need to be addressed.

In [None]:
# for each ability score: how many values are missing, and what share of all rows that is
score_names = ['STR', 'DEX', 'CON', 'INT', 'WIS', 'CHA']
missing_counts = monsters[score_names].isna().sum()
missing_share = missing_counts.divide(len(monsters.index)).map('{:.2%}'.format)
missing_summary = pd.concat([missing_counts, missing_share], axis=1).rename(
    columns={0: 'Missing_Values', 1: 'Percent_of_Data_Missing'})
print('Original:', missing_summary)
print('Total Rows:', monsters.shape[0])

## K-NN Imputation of missing values

To fill these missing values KNNImputer will be utilized. All features other than the ones removed previously will be utilized to help improve the imputation. For the categorical features, drop_first will be enabled when encoding to avoid Dummy Variable Trap. The Dataframe will also be normalized via MinMaxScaler to reduce bias during imputation.

In [None]:
# import imputer and scaler from sklearn
from sklearn.impute import KNNImputer
from sklearn.preprocessing import MinMaxScaler

# encode categorical variables, dropping the first level of each
cat_variables = monsters[['type', 'size', 'speed']]
cat_dummies = pd.get_dummies(cat_variables, drop_first=True)

# create the dataframe for imputation: numeric columns plus the dummy columns
imp_df = monsters.drop(columns=['type', 'size', 'speed'])
imp_df = pd.concat([imp_df, cat_dummies], axis=1)

# sklearn transformers return bare arrays, so every re-wrap below passes the original
# row index along; the final concat aligns on index and would otherwise depend on
# `monsters` happening to have a default 0..n-1 index.
row_index = imp_df.index

# apply scaler to imputation dataframe so no feature dominates the neighbour distances
scaler = MinMaxScaler()
imp_df = pd.DataFrame(scaler.fit_transform(imp_df), columns=imp_df.columns, index=row_index)

# apply imputation to missing values in dataframe
imputer = KNNImputer(n_neighbors=5)
imp_df = pd.DataFrame(imputer.fit_transform(imp_df), columns=imp_df.columns, index=row_index)

# revert scaler, round imputed values to integers and change the data type to int64
imp_df = pd.DataFrame(scaler.inverse_transform(imp_df), columns=imp_df.columns, index=row_index)
imp_df[score_names] = imp_df[score_names].round(0)
imputed_scores = imp_df[['STR', 'DEX', 'CON', 'INT', 'WIS', 'CHA']].astype('int64', errors='ignore')

# update main dataframe with imputed values and check for success:
# the Missing_Values column for the ability scores should now be 0
monsters = pd.concat([monsters.drop(columns=score_names), imputed_scores], axis=1)
print(pd.concat([monsters.nunique(), monsters.isna().sum(), monsters.dtypes],
                axis=1).rename(columns={0: 'Unique_Values', 1: 'Missing_Values', 2: 'Data_Type'}))
print('Total Rows:', len(monsters.index))

# The Model
A gradient boosting model (XGBoost) will be used to predict cr. The model will be evaluated using k-fold cross validation MAE as the small size of the data prevents a test set being extracted. 

In [None]:
# import XGBoost, sklearn functions, and numpy sort
from xgboost import XGBRegressor
from xgboost import plot_importance
from numpy import sort
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import cross_val_score
from sklearn.feature_selection import SelectFromModel


# number of cross-validation folds; also used in the printed summary so the two cannot drift apart
N_FOLDS = 10

# Specify features (currently all are included besides those mentioned in 'Columns to Ignore')
feature_cols = ['ac', 'hp', 'legendary', 'STR', 'DEX', 'CON', 'INT', 'WIS', 'CHA',  'type', 'size', 'speed']

# encode data from desired features
features = pd.get_dummies(monsters[feature_cols])

# split validation set from training data
m_train, m_val, cr_train, cr_val = train_test_split(features, cr, train_size=.8, test_size=.2, random_state=0)

# define model and fit it with early stopping monitored on the validation split
# NOTE(review): passing early_stopping_rounds to fit() was deprecated in xgboost 1.6 and
# removed in 2.0. On those versions pass it to the XGBRegressor constructor instead, and
# use a separate estimator without it for cross_val_score (which refits clones with no eval_set).
model = XGBRegressor(n_estimators=1000, learning_rate=.01)
model.fit(m_train, cr_train, early_stopping_rounds=50, eval_set=[(m_val, cr_val)], verbose=False)

# generate MAEs (cross_val_score refits clones of the estimator on each fold of the full data)
mae = -1*cross_val_score(model, features, cr, cv=N_FOLDS, scoring='neg_mean_absolute_error')

# score on the validation split; for a regressor `score` is the R^2 coefficient of
# determination, not a classification accuracy. The same split was used for early
# stopping, so this figure is optimistic.
acc = model.score(m_val, cr_val)

# print average MAE and validation R^2
print("Average MAE across {} folds:".format(N_FOLDS), mae.mean(), '\nValidation R^2:', "{:.3f}".format(acc))



# Feature Importance and Selection
The feature importances will be identified and then used for feature selection using SelectFromModel

In [None]:
# identify feature importances: one row per model input column, most important first.
# The table is built directly from the importance array, so nothing is assigned into a
# slice of m_train and no pandas warning has to be silenced globally.
feature_importance = pd.DataFrame(
    {'feature importance': model.feature_importances_},
    index=m_train.columns,
).sort_values(by='feature importance', ascending=False)
print(feature_importance)

# plot F-scores of each feature
plt.rcParams["figure.figsize"] = (14, 7)
plot_importance(model)
plt.show()

# iterate by thresholds of feature importance to determine their effect on the model ## NOTE: this section currently does not work as intended and needs to be revisited
# thresholds = sort(model.feature_importances_)
# for thresh in thresholds:
    
#     # select features using threshold
#     selection = SelectFromModel(model, threshold=thresh, prefit=True)
#     select_m_train = selection.transform(m_train)
    
#     # train selection model
#     selection_model = XGBRegressor(n_estimators=1000, learning_rate=.01)
#     selection_model.fit(select_m_train, cr_train)

#     # genearte and display mae of selection model
#     mae = -1*cross_val_score(selection_model, features, cr, cv=10, scoring='neg_mean_absolute_error')
#     print("Thresh=%.3f, n=%d, mae: %.3f" % (thresh, select_m_train.shape[1], mae.mean()))
    

# Results
The results of the model were mostly unsurprising. Many have anecdotally claimed that "CR is measured only by ac and hp", and it seems these claims have some merit. While ac, interestingly, does not have much importance to the model, hp stands out as the dominating feature. A potential reason for such high feature importance is that hp tends to scale linearly with the cr of most monsters, while other features do not have anywhere near as strong a relationship. cr and ac also have a linear correlation, as seen below, but it is not as strong as that between cr and hp.

In [None]:
import seaborn as sns

# one regression plot per stat, each plotted against challenge rating on the x axis
for stat, title in (('hp', "hp and cr"), ('ac', "ac and cr")):
    grid = sns.lmplot(x='cr', y=stat, data=monsters_full, fit_reg=True)
    grid.fig.suptitle(title)