# Preprocessing Data for ML Models

In [1]:
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

import joblib

In [2]:
# read the two raw CSV files (crimes first, then suspects) into DataFrames

crimes_df, suspects_df = (
    pd.read_csv(csv_name)
    for csv_name in ('Crimes_Dataset.csv', 'Suspects_Dataset.csv')
)

In [3]:
# convert everything to lowercase

def lower(df):
    """Return a lower-cased copy of ``df`` without modifying the input.

    Column labels are lower-cased, and so is every value in string-typed
    columns (``object`` dtype or a pandas string dtype). All other columns
    are passed through unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column labels are strings.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``df`` keeps its original labels and values.
    """
    # apply() builds a new frame, so relabelling the result below cannot leak
    # back into the caller's object (assigning to df.columns directly would).
    lowered = df.apply(
        lambda col: col.str.lower() if pd.api.types.is_string_dtype(col.dtype) else col
    )
    lowered.columns = lowered.columns.str.lower()
    return lowered

# normalise both frames: lower-case column labels and text values
crimes_df, suspects_df = lower(crimes_df), lower(suspects_df)

In [4]:
# join the Crimes_Dataset.csv and Suspects_Dataset.csv frames: rows pair up
# where 'index_crimes' (crimes) equals 'importtindex_monster' (suspects);
# with pandas' default inner join, rows without a match are dropped

df = crimes_df.merge(
    suspects_df,
    left_on='index_crimes',
    right_on='importtindex_monster',
)

In [5]:
# feature frame: the merged data minus the columns not used as model inputs
# (the two join keys, the date, and the two monster columns)

X = df.drop(columns=[
    'monster involved',
    'index_crimes',
    'date',
    'monster',
    'importtindex_monster',
])
X.head()

Unnamed: 0,days of investigation,region,crime type,crime weapon,time of day,evidence found,criminal record,age,gender,height in cm,speed level,strength level,allergy,favorite food
0,77.0,mountain,nightly disturbance,,night,bones,yes,1.0,f,152.0,26.0,8.0,silver,lasagna
1,48.0,forest,assault,pistol,night,teeth,no,53.0,m,171.0,87.0,7.0,silver,pesto pasta
2,31.0,castle,nightly disturbance,,night,potions,no,1634.0,m,185.0,59.0,2.0,sunlight,humans
3,29.0,swamp,arson,pistol,dawn,potions,no,322.0,m,169.0,23.0,3.0,pumpkin,lasagna
4,59.0,village,nightly disturbance,pistol,night,potions,no,757.0,f,170.0,36.0,10.0,garlic,rats


In [6]:
# group the feature columns by dtype: numeric columns vs object (text) columns

numerical_features = X.select_dtypes(include='number').columns
categorical_features = X.select_dtypes(include='object').columns

In [7]:
# target labels: the 'monster' column of the merged frame

Y = df.loc[:, 'monster']
Y.head()

0    skeleton
1    werewolf
2       ghost
3       witch
4       ghost
Name: monster, dtype: object

In [8]:
# missing-value strategy

# a missing 'crime weapon' becomes its own explicit category, 'unknown'
X = X.assign(**{'crime weapon': X['crime weapon'].fillna('unknown')})

# numeric columns: fill gaps with the column mean
numerical_transformer = SimpleImputer(strategy='mean')

# categorical columns: fill gaps with the most frequent value, then one-hot
# encode; handle_unknown='ignore' keeps transform from raising on categories
# that were not present when the encoder was fitted
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
])

In [9]:
# send each column group through its own transformer; the outputs of the
# 'numerical' and 'categorical' branches are combined into one feature matrix

preprocessor = ColumnTransformer([
    ('numerical', numerical_transformer, numerical_features),
    ('categorical', categorical_transformer, categorical_features),
])

In [10]:
# hold out 20% of the rows as a test set; random_state pins the shuffle so the
# same split is produced on every run

X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

In [11]:
# bundle the preprocessor with the full and train/test-split features and targets

data_and_preprocessor_for_ml = dict(
    preprocessor=preprocessor,
    X=X,
    X_train=X_train,
    X_test=X_test,
    Y=Y,
    Y_train=Y_train,
    Y_test=Y_test,
)

In [12]:
# persist the whole bundle (preprocessor plus data splits) to a joblib file
# for the machine learning models

joblib.dump(
    value=data_and_preprocessor_for_ml,
    filename='data_and_preprocessor_for_ml.joblib',
)

['data_and_preprocessor_for_ml.joblib']