# Data

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    paths = (os.path.join(root, name) for name in names)
    for path in paths:
        print(path)

In [None]:
# Load the monster table; read_csv already returns a DataFrame.
df = pd.read_csv('/kaggle/input/dnd-5e-monster-manual-stats/cleaned_monsters_basic.csv')

# Keep only the columns whose name does not start with "Unnamed".
is_unnamed = df.columns.str.contains('^Unnamed')
df = df.loc[:, ~is_unnamed]
df.head()

In [None]:
# Replace missing values with 0 in every numeric column; other columns
# are left untouched.
numeric_columns = df.select_dtypes(include='number').columns
filled_numeric = df[numeric_columns].fillna(value=0)
df[numeric_columns] = filled_numeric
df.head()

# Explore

In [None]:
# Bar chart of the mean 'ac' value within each 'size' group.
mean_ac_by_size = df.groupby('size')['ac'].mean()
mean_ac_by_size.plot.bar(title='AC by creature size')

In [None]:
# convert 'cr' to float
from fractions import Fraction


def _cr_to_float(value):
    """Return one 'cr' cell as a float.

    Strings are parsed with ``Fraction`` so that both fractional ('1/8',
    '1/4', '1/2', ...) and plain ('5', '0.5') spellings are accepted.
    Missing values become NaN and values that are already numeric pass
    through unchanged, which keeps this cell safe to re-run after 'cr'
    has been converted.

    Raises:
        ValueError: if a string is not a valid number or fraction.
    """
    if isinstance(value, str):
        return float(Fraction(value))
    return float('nan') if pd.isna(value) else float(value)


df['cr'] = df['cr'].map(_cr_to_float).astype('float')

# Bar chart of the mean converted 'cr' within each 'size' group.
df.groupby('size')['cr'].mean().plot.bar(title='Challenge by creature size')

In [None]:
# Mean of every numeric column per 'cr' value, rounded to 2 decimals.
# The former positional `0` was the `axis` argument: 0 is already the
# default, and `axis` is deprecated in recent pandas, so it is omitted.
df.groupby('cr')[numeric_columns].mean().round(2)

In [None]:
# Bar chart of how many rows fall into each 'cr' group.
rows_per_cr = df.groupby('cr').size()
rows_per_cr.plot.bar(title='Number of creatures by CR')

In [None]:
# Median of every numeric column per 'cr' value, rounded to 2 decimals.
# The former positional `0` was the `axis` argument: 0 is already the
# default, and `axis` is deprecated in recent pandas, so it is omitted.
df.groupby('cr')[numeric_columns].median().round(2)

# Basic ML

In [None]:
# Separate the prediction target ('cr') from the remaining columns,
# working on copies so that `df` itself is left unchanged.
features_df = df.drop(columns='cr')
target = df['cr'].copy()
selected_features = features_df

print(selected_features.shape, target.shape)

In [None]:
# one hot encoder and scaler for pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_transformer
from sklearn.compose import make_column_selector

# Route columns by dtype: numeric columns are standardized, object columns
# are one-hot encoded (categories unseen during fit are ignored).
numeric_selector = make_column_selector(dtype_include=np.number)
object_selector = make_column_selector(dtype_include=object)

numeric_step = (StandardScaler(), numeric_selector)
object_step = (OneHotEncoder(handle_unknown='ignore'), object_selector)
ct = make_column_transformer(numeric_step, object_step)

# Fit on the full feature table and display the transformed output.
ct.fit_transform(selected_features)

In [None]:
# split data
from sklearn.model_selection import train_test_split

# Split features and target into train/test parts using the default
# proportions; the fixed seed keeps the split reproducible.
split_parts = train_test_split(selected_features, target, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# build pipeline
from sklearn.linear_model import LinearRegression, ElasticNet, Ridge, Lasso
from sklearn.pipeline import make_pipeline

# Chain the column transformer with a plain linear regression.
pipe = make_pipeline(ct, LinearRegression())

# Keep a direct handle on the regressor, which is the pipeline's final step.
model = pipe.steps[-1][1]

In [None]:
# Fit on the training split; fit() returns the pipeline itself.
fitted = pipe.fit(X_train, y_train)

# Predictions for the held-out rows.
preds = fitted.predict(X_test)

# Score the fitted pipeline on both splits with its default score() metric.
train_score = fitted.score(X_train, y_train)
test_score = fitted.score(X_test, y_test)

print(f'Train score: {train_score}')
print(f'Test score: {test_score}')