In [None]:
import os # accessing directory structure
from fractions import Fraction # exact parsing of fractional challenge ratings

import matplotlib.pyplot as plt # plotting
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from mpl_toolkits.mplot3d import Axes3D
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler


# Exploration

In [None]:
# Load the monster dataset from the Kaggle input directory.
df = pd.read_csv('/kaggle/input/dnd_monsters.csv')

# Print the per-column summary, then display the first five rows.
df.info(verbose=True)
df.head()

# Preprocessing / Feature Engineering

Adopting the following strategy:

* Drop all rows with missing values
* Encode the "size" column - we can safely do this using standard label encoding as the sizes have a very clear order / are all related to one another.

In [None]:
def _cr_to_float(value):
    """Convert a challenge rating such as '1/8', '1/2', '3' or 3 to a float.

    Missing values are passed through as NaN. Anything that is not a plain
    number or an 'a/b' fraction raises ValueError.
    """
    if pd.isna(value):
        return np.nan
    return float(Fraction(str(value).strip()))


# Convert 'cr' column to float in a separate column (used as the regression target).
# Going through `object` keeps this working when the cell is re-run after 'cr'
# has already been turned into a categorical below.
df['cr_float'] = df['cr'].astype(object).map(_cr_to_float).astype('float')
df["cr"] = df["cr"].astype('category')

# Change the "legendary" column to an integer 0/1 flag.
# `isin` gives an explicit integer dtype instead of an object column, and also
# accepts an already-encoded 1 so that re-running the cell does not reset the flag.
# Any value other than 'Legendary' / 1 (including missing) becomes 0.
df["legendary"] = df["legendary"].isin(["Legendary", 1]).astype(int)

# Change the "size" column to an integer (0 = Tiny ... 5 = Gargantuan)
sizes = ["Tiny", "Small", "Medium", "Large", "Huge", "Gargantuan"]
if not pd.api.types.is_numeric_dtype(df["size"]):  # skip if already encoded by an earlier run
    size_codes = df["size"].astype(pd.CategoricalDtype(sizes, ordered=True)).cat.codes
    # `cat.codes` marks missing / unrecognised sizes with -1, which `dropna` would
    # not catch; turn those into NaN so the rows are dropped below.
    df["size"] = size_codes.where(size_codes >= 0)

# These are the features we'll use to train
features = [
    'str', 'dex', 'con', 'int', 'wis', 'cha', 'hp', 'ac', 'size', 'legendary'
]

# Drop any rows with missing features or that are missing the target value (cr),
# then restore an integer dtype for "size" (NaN placeholders may have made it float).
df = df.dropna(subset=features + ['cr']).astype({"size": int})
df.info(verbose=True)
df[features].head(5)

# Classification approach
Is this a classification problem? Evidently not. The best accuracy I was able to achieve (without any hyperparameter tuning) was around 0.3 - so only about one in every three creatures was being given the correct challenge rating.

Getting even that close is pretty good; however, I think this is almost definitely more of a regression problem - if we get "close" to the challenge rating we want, then that still counts as a win.

In [None]:
from sklearn import svm
from sklearn import tree
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Feature matrix and categorical target for the classification experiment.
X = df[features]
y = df["cr"]

# Candidate classifiers, keyed by the label printed in the results.
# SVC and SGD are sensitive to feature scale, so they are wrapped in a pipeline
# that standardises the features first (the scaler is fitted on the training
# split only, because the whole pipeline is fitted inside train_and_evaluate).
# Stochastic estimators get a fixed random_state so re-runs give the same scores.
models = {
    "SVM": make_pipeline(StandardScaler(), svm.SVC()),
    "SGD": make_pipeline(
        StandardScaler(),
        SGDClassifier(loss="hinge", penalty="l2", max_iter=500, random_state=0),
    ),
    "Tree": tree.DecisionTreeClassifier(random_state=0),
    "RandomForest": RandomForestClassifier(max_depth=2, random_state=0)
}

def train_and_evaluate(model_type, test_size=0.3, seed=42):
    """Fit a classifier on a train split of the global ``X``/``y`` and report test accuracy.

    Args:
        model_type: An estimator exposing ``fit(X, y)`` and ``predict(X)``.
            The object returned by ``fit`` is used for prediction.
        test_size: Fraction of the rows held out for evaluation.
        seed: ``random_state`` passed to ``train_test_split`` so that every
            model is scored on the same split.

    Returns:
        The accuracy on the held-out rows (the same value that is printed).
    """
    # Train
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=seed
    )
    model = model_type.fit(X_train, y_train)

    # Evaluate
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy:", accuracy)
    return accuracy

# Evaluate each candidate in turn, printing a spacer and an underlined heading
# with the model's label before its score.
for name, estimator in models.items():
    print("\n")
    print(name)
    print("==========")
    train_and_evaluate(estimator)

# Regression approach

Some ridiculously impressive scores for a few regression models.

In [None]:
from sklearn import svm
from sklearn import linear_model
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import explained_variance_score
from sklearn.metrics import max_error
from sklearn.model_selection import train_test_split

# Feature matrix and numeric target for the regression experiment.
X = df[features]
y = df["cr_float"]

# Candidate regressors, keyed by the label printed in the results.
# SVR, SGD and Passive-Aggressive are sensitive to feature scale, so they are
# wrapped in a pipeline that standardises the features first (the scaler is
# fitted on the training split only, because the whole pipeline is fitted
# inside train_and_evaluate).
# Stochastic estimators get a fixed random_state so re-runs give the same scores.
models = {
    "SVR": make_pipeline(StandardScaler(), svm.SVR()),  # was mislabelled "SVC"
    "SGD": make_pipeline(StandardScaler(), linear_model.SGDRegressor(random_state=0)),
    "Bayesian Ridge": linear_model.BayesianRidge(),
    "Lasso": linear_model.LassoLars(),
    "ARD": linear_model.ARDRegression(),
    "PassAggr": make_pipeline(
        StandardScaler(), linear_model.PassiveAggressiveRegressor(random_state=0)
    ),
    "Theil": linear_model.TheilSenRegressor(random_state=0),
    "Linear": linear_model.LinearRegression()
}

def train_and_evaluate(model_type, test_size=0.3, seed=42):
    """Fit a regressor on a train split of the global ``X``/``y`` and report test errors.

    Args:
        model_type: An estimator exposing ``fit(X, y)`` and ``predict(X)``.
            The object returned by ``fit`` is used for prediction.
        test_size: Fraction of the rows held out for evaluation.
        seed: ``random_state`` passed to ``train_test_split`` so that every
            model is scored on the same split.

    Returns:
        A dict with the keys ``"mean_absolute_error"``, ``"mean_squared_error"``
        and ``"explained_variance"`` holding the values that are printed.
    """
    # Train
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=seed
    )
    model = model_type.fit(X_train, y_train)

    # Evaluate
    y_pred = model.predict(X_test)
    scores = {
        "mean_absolute_error": mean_absolute_error(y_test, y_pred),
        "mean_squared_error": mean_squared_error(y_test, y_pred),
        "explained_variance": explained_variance_score(y_test, y_pred),
    }

    print("Mean absolute error:", scores["mean_absolute_error"])
    print("Mean squared error:", scores["mean_squared_error"])
    print("Explained Variance:", scores["explained_variance"])
    return scores

# Evaluate each candidate in turn, printing a spacer and an underlined heading
# with the model's label before its error metrics.
for name, estimator in models.items():
    print("\n")
    print(name)
    print("==========")
    train_and_evaluate(estimator)