# Preprocessing the Data
Author: Christina Vo
Dataset: Kaggle - Cat Breeds

This notebook walks through the steps for preprocessing the dataset before it is used to train the cat breed classification model.


1. Import the dataset
2. Clean the data (fixing spelling errors, removing missing data, etc.)
3. Encode and standardize (integer-encode the categorical features, standardize the features, and encode the target labels)
4. Split data into train and test
5. Export clean data for model training

In [1]:
# imports
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# 1. Import the dataset

In [3]:
# Load the raw (uncleaned) cat-breed data from CSV and preview the first rows.
raw_csv_path = "cat_breeds_dirty.csv"
df = pd.read_csv(raw_csv_path)
df.head()

Unnamed: 0,Breed,Age_in_years,Age_in_months,Gender,Neutered_or_spayed,Body_length,Weight,Fur_colour_dominant,Fur_pattern,Eye_colour,Allowed_outdoor,Preferred_food,Owner_play_time_minutes,Sleep_time_hours,Country,Latitude,Longitude
0,Angora,0.25,3.0,female,False,19.0,2.0,white,solid,blue,FALSE,wet,46.0,16.0,France,43.296482,5.36978
1,Angora,0.33,4.0,male,False,19.0,2.5,white,solid,blue,FALSE,wet,48.0,16.0,France,43.61166,3.87771
2,Angora,0.5,,,False,20.0,2.8,what does it mean dominant?,solid,green,I never allow my kitty outside!!!!!,wet,41.0,11.0,France,44.837789,-0.57918
3,Ankora,0.5,,,False,21.0,3.0,white,dirty,blue,FALSE,wet,24.0,8.0,France,43.61166,3.87771
4,Angora,0.5,,,,21.0,3.0,red/cream,tabby,green,FALSE,wet,51.0,10.0,france,48.864716,2.349014


In [4]:
# Show each column's dtype and non-null count for the raw data.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1103 entries, 0 to 1102
Data columns (total 17 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Breed                    991 non-null    object 
 1   Age_in_years             1072 non-null   float64
 2   Age_in_months            1066 non-null   float64
 3   Gender                   1036 non-null   object 
 4   Neutered_or_spayed       1050 non-null   object 
 5   Body_length              1077 non-null   float64
 6   Weight                   1077 non-null   float64
 7   Fur_colour_dominant      1090 non-null   object 
 8   Fur_pattern              1055 non-null   object 
 9   Eye_colour               1064 non-null   object 
 10  Allowed_outdoor          1060 non-null   object 
 11  Preferred_food           1082 non-null   object 
 12  Owner_play_time_minutes  1082 non-null   float64
 13  Sleep_time_hours         1062 non-null   float64
 14  Country                 

In [5]:
# Summary statistics for the numeric columns; check the min/max rows for
# out-of-range values (e.g. negative ages).
df.describe()

Unnamed: 0,Age_in_years,Age_in_months,Body_length,Weight,Owner_play_time_minutes,Sleep_time_hours,Latitude,Longitude
count,1072.0,1066.0,1077.0,1077.0,1082.0,1062.0,1042.0,1042.0
mean,4.460752,53.778612,43.903435,5.740901,23.176525,15.898305,44.550898,-59.517623
std,3.262166,39.355581,16.240466,9.853438,10.815298,2.656775,4.931844,46.259368
min,-7.666667,-92.0,10.0,0.5,0.0,8.0,37.77493,-123.116226
25%,2.33,28.0,35.0,3.9,15.0,14.0,40.71427,-77.03637
50%,4.75,57.0,41.0,5.0,23.0,16.0,43.296482,-74.00597
75%,6.92,84.0,51.0,7.0,31.0,18.0,48.864716,-1.890401
max,11.25,135.0,102.0,320.0,60.0,32.0,53.800755,13.404954


# 2. Data Cleaning

In [6]:
# --- Breed: collapse known misspellings / case variants onto canonical names.
breed_fixes = {
    "Ankora": "Angora",
    "Angorra": "Angora",
    "My coon": "Maine coon",
    "Maine loon": "Maine coon",
    "maine coon": "Maine coon",
    "rack doll": "Ragdoll",
    "ragdoll": "Ragdoll",
    "wrack doll": "Ragdoll",
}
df["Breed"] = df["Breed"].replace(breed_fixes)

# --- Age: derive missing months from years, then keep only non-negative ages.
# Rows whose Age_in_years is missing compare as False and are removed here too.
df["Age_in_months"] = df["Age_in_months"].fillna(df["Age_in_years"] * 12)
valid_age = (df["Age_in_years"] >= 0) & (df["Age_in_months"] >= 0)
# .copy() makes the filtered frame independent of the original, so the
# assignments below do not trigger SettingWithCopyWarning.
df = df.loc[valid_age].copy()

# --- Weight: values above 20 are divided by 100.
# NOTE(review): presumably these rows were entered in a different unit or with
# a misplaced decimal point -- confirm against the data source.
too_heavy = df["Weight"] > 20
df.loc[too_heavy, "Weight"] = df.loc[too_heavy, "Weight"] / 100

# --- Free-text placeholders that carry no information become missing values,
# so that the later dropna() step removes those rows.
placeholder_values = {
    "Fur_colour_dominant": {"what does it mean dominant?": np.nan},
    "Fur_pattern": {"dirty": np.nan},
    "Eye_colour": {"cute": np.nan},
    "Preferred_food": {"a lot of food": np.nan},
}
for column, mapping in placeholder_values.items():
    df[column] = df[column].replace(mapping)

# --- Allowed_outdoor: convert the string flags to real booleans.  The
# free-text answer seen in the raw data clearly means "not allowed", so it is
# mapped to False instead of being left in the column as a string.
df["Allowed_outdoor"] = df["Allowed_outdoor"].replace({
    "FALSE": False,
    "TRUE": True,
    "I never allow my kitty outside!!!!!": False,
})

# --- Country: normalise the spellings of France, blank out non-answers.
df["Country"] = df["Country"].replace({
    "france": "France",
    "Vive la France!": "France",
    "my country": np.nan,
    "where I live": np.nan,
})

In [7]:
# Remove every row that still has at least one missing value.
# .copy() detaches the result from the pre-drop frame; without it, pandas
# (when copy-on-write is not enabled) flags the result as a copy and warns with
# SettingWithCopyWarning when later cells add columns to `df`.
df = df.dropna().copy()

In [8]:
# Re-check dtypes and non-null counts after cleaning and dropping incomplete rows.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 741 entries, 0 to 1098
Data columns (total 17 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Breed                    741 non-null    object 
 1   Age_in_years             741 non-null    float64
 2   Age_in_months            741 non-null    float64
 3   Gender                   741 non-null    object 
 4   Neutered_or_spayed       741 non-null    object 
 5   Body_length              741 non-null    float64
 6   Weight                   741 non-null    float64
 7   Fur_colour_dominant      741 non-null    object 
 8   Fur_pattern              741 non-null    object 
 9   Eye_colour               741 non-null    object 
 10  Allowed_outdoor          741 non-null    object 
 11  Preferred_food           741 non-null    object 
 12  Owner_play_time_minutes  741 non-null    float64
 13  Sleep_time_hours         741 non-null    float64
 14  Country                  741 n

# 3. Encode and Standardize

In [None]:
# Integer (label) encoding of the categorical fur features.
# Note: this is NOT one-hot encoding -- every category gets a single integer
# code, numbered in order of first appearance in the column.

# dominant fur colour -> integer code
fur_colors = df["Fur_colour_dominant"].unique()
fur_color_map = dict(zip(fur_colors, range(len(fur_colors))))
df["Fur_colour_num"] = df["Fur_colour_dominant"].map(fur_color_map)

# fur pattern -> integer code
fur_patterns = df["Fur_pattern"].unique()
fur_pattern_map = dict(zip(fur_patterns, range(len(fur_patterns))))
df["Fur_pattern_num"] = df["Fur_pattern"].map(fur_pattern_map)

In [20]:
# Design matrix: the two integer-coded fur features as a 2-column array.
feature_columns = ["Fur_colour_num", "Fur_pattern_num"]
X = df[feature_columns].to_numpy()

# Standardize every feature column to zero mean and unit variance.
# NOTE(review): the scaler is fit on all rows here, including rows that may
# later be held out for testing.
scaler = StandardScaler()
X_scaled = scaler.fit(X).transform(X)

# Target vector: each breed name becomes an integer class id, numbered in
# order of first appearance.
breed_names = df["Breed"].unique()
breed_to_int = {name: class_id for class_id, name in enumerate(breed_names)}
y = df["Breed"].map(breed_to_int).to_numpy().astype(int)
num_classes = len(breed_to_int)

# 4. Split Data into Train and Test

In [21]:
# Split the *unscaled* features into training and testing sets (70% / 30%).
# The fixed random_state keeps the row partition reproducible.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Fit the scaler on the training rows only and apply it to both parts, so the
# test set's mean/variance never leak into the preprocessing.
scaler = StandardScaler().fit(X_train_raw)
X_train = scaler.transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# 5. Export Preprocessed Data into CSV

In [22]:
# Write the cleaned frame (including the integer-coded helper columns) to CSV,
# leaving out the pandas row index.
clean_csv_path = "cat_breeds_clean.csv"
df.to_csv(clean_csv_path, index=False)