In [29]:
import pandas as pd
import numpy as np

In [30]:
# 1. LOAD THE DATA
df = pd.read_csv('Suspects_Dataset.csv')


In [31]:
# 2. INITIAL EXPLORATION (what you already ran)
print("=== INITIAL DATA OVERVIEW ===")
df.info()
print("\n=== INITIAL MISSING VALUES ===")
print(df.isnull().sum())

=== INITIAL DATA OVERVIEW ===
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 10 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   importtIndex_Monster  10000 non-null  int64  
 1   Monster               10000 non-null  object 
 2   Criminal record       10000 non-null  object 
 3   Age                   9995 non-null   float64
 4   Gender                10000 non-null  object 
 5   Height in cm          10000 non-null  float64
 6   Speed Level           9999 non-null   float64
 7   Strength Level        9996 non-null   float64
 8   Allergy               10000 non-null  object 
 9   Favorite Food         9998 non-null   object 
dtypes: float64(4), int64(1), object(5)
memory usage: 781.4+ KB

=== INITIAL MISSING VALUES ===
importtIndex_Monster    0
Monster                 0
Criminal record         0
Age                     5
Gender                  0
Height in cm            

In [32]:
# 3 FIND SPECIFIC ROWS WITH MISSING DATA (NEW)
print("\n=== MONSTERS WITH MISSING DATA ===")
missing_rows = df[df.isnull().any(axis=1)]
print(f"Number of monsters with missing data: {len(missing_rows)}")
print(missing_rows[['Monster', 'Age', 'Speed Level', 'Strength Level', 'Favorite Food']])


=== MONSTERS WITH MISSING DATA ===
Number of monsters with missing data: 12
       Monster    Age  Speed Level  Strength Level Favorite Food
278   Skeleton    NaN         30.0             6.0   pesto pasta
439     Zombie    NaN         12.0             1.0       lasagna
1083   Vampire  752.0         32.0             4.0           NaN
1643     Witch    NaN         24.0             5.0         brain
1873  Werewolf  123.0         62.0             6.0           NaN
1918  Werewolf   58.0         95.0             NaN          rats
2194    Zombie    NaN         17.0             1.0          rats
3903   Vampire  440.0         37.0             NaN          rats
4027   Vampire  611.0          NaN             6.0       lasagna
4565     Witch  722.0         17.0             NaN          rats
5068  Werewolf  137.0         77.0             NaN       lasagna
7216    Zombie    NaN         23.0             5.0         brain


In [33]:
# 4. samrt filling missing values
df['Age'].fillna(df['Age'].median(), inplace=True)
df['Speed Level'].fillna(df['Speed Level'].median(), inplace=True)
df['Strength Level'].fillna(df['Strength Level'].median(), inplace=True)
df['Favorite Food'].fillna(df['Favorite Food'].mode()[0], inplace=True)

df

Unnamed: 0,importtIndex_Monster,Monster,Criminal record,Age,Gender,Height in cm,Speed Level,Strength Level,Allergy,Favorite Food
0,2101,Witch,No,526.0,F,197.0,16.0,1.0,hazelnut,brain
1,9299,Zombie,Yes,46.0,M,184.0,20.0,7.0,cat hair,humans
2,6824,Vampire,No,442.0,M,192.0,21.0,7.0,sunlight,pesto pasta
3,6773,Vampire,No,551.0,F,167.0,43.0,6.0,garlic,humans
4,9624,Zombie,Yes,150.0,F,176.0,17.0,1.0,pumpkin,brain
...,...,...,...,...,...,...,...,...,...,...
9995,4633,Skeleton,Yes,1836.0,F,152.0,1.0,2.0,silver,rats
9996,6713,Werewolf,No,77.0,F,192.0,74.0,4.0,silver,rats
9997,5535,Witch,No,484.0,M,175.0,18.0,2.0,garlic,pesto pasta
9998,8635,Vampire,No,824.0,F,152.0,31.0,5.0,silver,lasagna


In [34]:
# make everything lower case

object_cols = df.select_dtypes(include='object').columns
for attribute in object_cols:
    df[attribute] = df[attribute].str.lower()
    print(df[attribute].unique())

['witch' 'zombie' 'vampire' 'skeleton' 'werewolf' 'ghost']
['no' 'yes']
['f' 'm']
['hazelnut' 'cat hair' 'sunlight' 'garlic' 'pumpkin' 'silver']
['brain' 'humans' 'pesto pasta' 'rats' 'lasagna']


In [35]:
#
columns = df.columns.values

def encode(col):
    code, unique = pd.factorize(df[col])
    df[col] = code
    mapping = dict(enumerate(unique))
    print(mapping)

encode('Gender')
encode('Criminal record')


{0: 'f', 1: 'm'}
{0: 'no', 1: 'yes'}


In [36]:
df['Age'].max()
df['Age'] = pd.qcut(df['Age'], 6, labels = ['baby','teen', 'young adult', 'adult', 'middle age', 'old'])

In [37]:
print(df['Speed Level'].unique())
print(df['Speed Level'].min())
print(df['Speed Level'].max())

[ 16.  20.  21.  43.  17.  12.  38.  64.  22.  83.  78.  19.   7.   1.
  49.   9.   3.  32.  31.  11.  41.  67.  18.  46.  79.  28.  26.  29.
  90.  15.  13.  51.  55.  36. 100.  74.  60.  59.  69.  71.  52.  40.
  73.  30.  23.  39.  70.  24.   6.  37.  57.  93.  89.  48.  27.  14.
  97.  75.  33.  42.  54.  25.  10.   2.  58.   4.  50.  47.  35.  62.
  63.  44.  53.  34.  76.   8.  72.  66.  45.  65.  77.  68.  80.  61.
  92.   5.  96.  86.  85.  82.  88.  98.  94.  56.  95.  91.  99.  81.
  84.  87.]
1.0
100.0


In [38]:
df['Speed Level'] = df['Speed Level'] / df['Speed Level'].max()
df

Unnamed: 0,importtIndex_Monster,Monster,Criminal record,Age,Gender,Height in cm,Speed Level,Strength Level,Allergy,Favorite Food
0,2101,witch,0,adult,0,197.0,0.16,1.0,hazelnut,brain
1,9299,zombie,1,baby,1,184.0,0.20,7.0,cat hair,humans
2,6824,vampire,0,adult,1,192.0,0.21,7.0,sunlight,pesto pasta
3,6773,vampire,0,adult,0,167.0,0.43,6.0,garlic,humans
4,9624,zombie,1,young adult,0,176.0,0.17,1.0,pumpkin,brain
...,...,...,...,...,...,...,...,...,...,...
9995,4633,skeleton,1,old,0,152.0,0.01,2.0,silver,rats
9996,6713,werewolf,0,baby,0,192.0,0.74,4.0,silver,rats
9997,5535,witch,0,adult,1,175.0,0.18,2.0,garlic,pesto pasta
9998,8635,vampire,0,middle age,0,152.0,0.31,5.0,silver,lasagna


In [39]:
print(df['Strength Level'].min())
print(df['Strength Level'].max())

1.0
10.0


In [40]:
df['Strength Level'] = df['Strength Level'] / df['Strength Level'].max()
df

Unnamed: 0,importtIndex_Monster,Monster,Criminal record,Age,Gender,Height in cm,Speed Level,Strength Level,Allergy,Favorite Food
0,2101,witch,0,adult,0,197.0,0.16,0.1,hazelnut,brain
1,9299,zombie,1,baby,1,184.0,0.20,0.7,cat hair,humans
2,6824,vampire,0,adult,1,192.0,0.21,0.7,sunlight,pesto pasta
3,6773,vampire,0,adult,0,167.0,0.43,0.6,garlic,humans
4,9624,zombie,1,young adult,0,176.0,0.17,0.1,pumpkin,brain
...,...,...,...,...,...,...,...,...,...,...
9995,4633,skeleton,1,old,0,152.0,0.01,0.2,silver,rats
9996,6713,werewolf,0,baby,0,192.0,0.74,0.4,silver,rats
9997,5535,witch,0,adult,1,175.0,0.18,0.2,garlic,pesto pasta
9998,8635,vampire,0,middle age,0,152.0,0.31,0.5,silver,lasagna


In [41]:
# create separate columns for each category
df_encoded = pd.get_dummies(df, columns=['Monster', 'Age', 'Allergy', 'Favorite Food'])
df


Unnamed: 0,importtIndex_Monster,Monster,Criminal record,Age,Gender,Height in cm,Speed Level,Strength Level,Allergy,Favorite Food
0,2101,witch,0,adult,0,197.0,0.16,0.1,hazelnut,brain
1,9299,zombie,1,baby,1,184.0,0.20,0.7,cat hair,humans
2,6824,vampire,0,adult,1,192.0,0.21,0.7,sunlight,pesto pasta
3,6773,vampire,0,adult,0,167.0,0.43,0.6,garlic,humans
4,9624,zombie,1,young adult,0,176.0,0.17,0.1,pumpkin,brain
...,...,...,...,...,...,...,...,...,...,...
9995,4633,skeleton,1,old,0,152.0,0.01,0.2,silver,rats
9996,6713,werewolf,0,baby,0,192.0,0.74,0.4,silver,rats
9997,5535,witch,0,adult,1,175.0,0.18,0.2,garlic,pesto pasta
9998,8635,vampire,0,middle age,0,152.0,0.31,0.5,silver,lasagna


In [42]:
# --- CATEGORICAL ENCODING STEP ---
import pandas as pd

# Create a dictionary to store our mappings
label_mappings = {}

# List of categorical columns to encode
categorical_columns = ['Monster', 'Age', 'Allergy', 'Favorite Food']

print("Starting categorical encoding with pandas...")

# Apply encoding to each categorical column
for col in categorical_columns:
    print(f"Encoding column: {col}")
    
    # Use pandas factorize - creates numerical codes
    df[f'{col}_encoded'], unique_categories = pd.factorize(df[col])
    
    # Save the mapping for future reference
    label_mappings[col] = dict(zip(range(len(unique_categories)), unique_categories))
    
    # Show the mapping
    print(f"  {col} mapping:")
    for code, category in enumerate(unique_categories):
        print(f"    {code} → '{category}'")
    print()

print("Encoding completed!")
print(f"New columns added: {[f'{col}_encoded' for col in categorical_columns]}")

Starting categorical encoding with pandas...
Encoding column: Monster
  Monster mapping:
    0 → 'witch'
    1 → 'zombie'
    2 → 'vampire'
    3 → 'skeleton'
    4 → 'werewolf'
    5 → 'ghost'

Encoding column: Age
  Age mapping:
    0 → 'adult'
    1 → 'baby'
    2 → 'young adult'
    3 → 'teen'
    4 → 'middle age'
    5 → 'old'

Encoding column: Allergy
  Allergy mapping:
    0 → 'hazelnut'
    1 → 'cat hair'
    2 → 'sunlight'
    3 → 'garlic'
    4 → 'pumpkin'
    5 → 'silver'

Encoding column: Favorite Food
  Favorite Food mapping:
    0 → 'brain'
    1 → 'humans'
    2 → 'pesto pasta'
    3 → 'rats'
    4 → 'lasagna'

Encoding completed!
New columns added: ['Monster_encoded', 'Age_encoded', 'Allergy_encoded', 'Favorite Food_encoded']
