In [113]:
import pandas as pd
import numpy as np

In [119]:
# 1. LOAD DATA - KEEP SEPARATE

df_train = pd.read_csv('./data/train.csv')
df_test = pd.read_csv('./data/test.csv')

print(f"Train shape: {df_train.shape}")
print(f"Test shape: {df_test.shape}")

Train shape: (26729, 10)
Test shape: (11456, 8)


In [115]:
def preprocess_data(df, is_train=True):
    """
    Apply all preprocessing steps to dataset
    """
    df = df.copy()
    
    # Convert datetime
    df['DateTime'] = pd.to_datetime(df['DateTime'], errors='raise')
    
    # Standardize age to days
    def standardize_age(age):
        if pd.isna(age):
            return None
        
        value, period = age.strip().split()
        value = int(value)
        
        if 'week' in period:
            return value * 7
        elif 'month' in period:
            return value * 30
        elif 'year' in period:
            return value * 365
        else:
            return None
    
    df['AgeuponOutcome'] = df['AgeuponOutcome'].apply(standardize_age)
    
    # Is Mix flag
    df['isMix'] = df['Breed'].str.contains('Mix', na=False)
    
    # Clean breed (remove Mix)
    df['Breed'] = df['Breed'].str.replace('Mix', '').str.strip()
    
    # Split breed and color
    df[['Breed1', 'Breed2', 'Breed3']] = df['Breed'].str.split('/', expand=True)
    df[['Color1', 'Color2']] = df['Color'].str.split('/', expand=True)
    
    # Temporal features
    df['hour'] = df['DateTime'].dt.hour
    df['weekday'] = df['DateTime'].dt.weekday
    df['month'] = df['DateTime'].dt.month
    
    # Time of day
    bins = [0, 5, 11, 13, 17, 21, 23]
    labels = ['night', 'morning', 'midday', 'afternoon', 'evening', 'late night']
    df['time_of_day'] = pd.cut(df['hour'], bins=bins, labels=labels, include_lowest=True)
    
    # Has name
    df['has_name'] = df['Name'].notna().map({True: 'yes', False: 'no'})
    
    # Separate sex and status
    df[['status', 'sex']] = df['SexuponOutcome'].str.split(' ', expand=True)
    df[['status', 'sex']] = df[['status', 'sex']].fillna('Unknown')
    
    # Life stage
    df['life_stage'] = np.where(df['AgeuponOutcome'] > 365, 'adult', 'baby')
    
    # Select final columns
    if is_train:
        columns_to_keep = ['AnimalType', 'AgeuponOutcome', 'isMix', 'Breed1', 'Color1', 
                          'hour', 'weekday', 'month', 'time_of_day', 'has_name', 
                          'status', 'sex', 'life_stage', 'OutcomeType']
    else:
        columns_to_keep = ['ID', 'AnimalType', 'AgeuponOutcome', 'isMix', 'Breed1', 'Color1', 
                          'hour', 'weekday', 'month', 'time_of_day', 'has_name', 
                          'status', 'sex', 'life_stage']
    
    df = df[columns_to_keep]
    
    # Drop missing values in age
    if is_train:
        df = df.dropna(subset=['AgeuponOutcome'])
    else:
        # For test, fill missing ages with median from training
        df['AgeuponOutcome'].fillna(df['AgeuponOutcome'].median(), inplace=True)
    
    return df

In [116]:
# 3. PREPROCESS TRAIN DATA
train_processed = preprocess_data(df_train, is_train=True)
print(f"Train processed shape: {train_processed.shape}")
print(f"Missing values: {train_processed.isna().sum().sum()}")

Train processed shape: (26313, 14)
Missing values: 0


In [117]:
# 4. PREPARE FEATURES AND TARGET
# Separate features and target
X = train_processed.drop('OutcomeType', axis=1)
y = train_processed['OutcomeType']

print(f"Features shape: {X.shape}")
print(f"Target distribution:\n{y.value_counts()}")

Features shape: (26313, 13)
Target distribution:
OutcomeType
Adoption           10769
Transfer            9043
Return_to_owner     4780
Euthanasia          1546
Died                 175
Name: count, dtype: int64


In [120]:
# 5. ENCODE CATEGORICAL VARIABLES
from sklearn.preprocessing import LabelEncoder

# Store encoders for later use on test set
encoders = {}
categorical_cols = ['AnimalType', 'isMix', 'Breed1', 'Color1', 'time_of_day', 
                   'has_name', 'status', 'sex', 'life_stage']


# Encode categorical variables
for col in categorical_cols:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col].astype(str))
    encoders[col] = le

# Encode target
target_encoder = LabelEncoder()
y_encoded = target_encoder.fit_transform(y)

print("Encoding completed!")
print(f"Feature dtypes:\n{X.dtypes}")

Encoding completed!
Feature dtypes:
AnimalType          int64
AgeuponOutcome    float64
isMix               int64
Breed1              int64
Color1              int64
hour                int32
weekday             int32
month               int32
time_of_day         int64
has_name            int64
status              int64
sex                 int64
life_stage          int64
dtype: object
