## Imports

In [39]:
# basic imports for libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# model imports
from sklearn.svm import SVC, SVR
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.linear_model import LogisticRegression, Ridge
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.multioutput import MultiOutputRegressor


# evaluation and training imports
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score, average_precision_score, mean_squared_error, r2_score, root_mean_squared_error, mean_absolute_percentage_error
from sklearn.model_selection import train_test_split

# preprocessing imports
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer

# Weeklong Project: Supervised Learning Model
Welcome to the starter file for your weeklong project! This is an opportunity for you to work independently with a supervised learning model. There will not be much guidance in this file, so you should rely on [scikit-learn documentation](https://scikit-learn.org/stable/api/index.html) and the other files we've worked on this week for help. Feel free to add new cells and imports as needed.
## Exploratory Data Analysis
Get to know your data a little better, and start planning out preprocessing steps.

In [40]:
data_root = "https://github.com/rachelkd/geering-up/raw/refs/heads/main/"

In [41]:
# TODO: Uncomment one of these lines to select a dataset
poke_data = pd.read_csv(data_root + "data/pokemon.csv")
# cereal_data = pd.read_csv(data_root + "data/cereal.csv")
# penguin_data = pd.read_csv(data_root + "data/penguins_size.csv")
poke_data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 801 entries, 0 to 800
Data columns (total 41 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   abilities          801 non-null    object 
 1   against_bug        801 non-null    float64
 2   against_dark       801 non-null    float64
 3   against_dragon     801 non-null    float64
 4   against_electric   801 non-null    float64
 5   against_fairy      801 non-null    float64
 6   against_fight      801 non-null    float64
 7   against_fire       801 non-null    float64
 8   against_flying     801 non-null    float64
 9   against_ghost      801 non-null    float64
 10  against_grass      801 non-null    float64
 11  against_ground     801 non-null    float64
 12  against_ice        801 non-null    float64
 13  against_normal     801 non-null    float64
 14  against_poison     801 non-null    float64
 15  against_psychic    801 non-null    float64
 16  against_rock       801 non

## Splitting Data
Identify your features and target, and split your data into training and testing sets.

In [42]:
# Select features
poke_data.drop(columns=["generation", "japanese_name", "capture_rate", "base_egg_steps"])

feature_columns = [col for col in poke_data.columns if not col.startswith('against_')]
features = poke_data[feature_columns]
# Select target columns
target_columns = [col for col in poke_data.columns if col.startswith('against_')]
target = poke_data[target_columns]

print("Features shape:", features.shape)
print("Target shape:", target.shape)

features_train, features_test, target_train, target_test = train_test_split(features, target, test_size=0.2, random_state=42)

print("Features train shape:", features_train.shape)
print("Features test shape:", features_test.shape)
print("Target train shape:", target_train.shape)
print("Target test shape:", target_test.shape)

Features shape: (801, 23)
Target shape: (801, 18)
Features train shape: (640, 23)
Features test shape: (161, 23)
Target train shape: (640, 18)
Target test shape: (161, 18)


## Preprocessing
Preprocess your data, keeping in mind the golden rule.

In [43]:
# Identify categorical and numerical columns
categorical_features = features.select_dtypes(include=['object', 'category']).columns
numerical_features = features.select_dtypes(include=np.number).columns

print("Categorical features:", categorical_features)
print("Numerical features:", numerical_features)

# Create transformers for numerical and categorical features
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Create a column transformer to apply different transformations to different columns
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Fit the preprocessor on the training data
features_train_processed = preprocessor.fit_transform(features_train)

# Transform the test data
features_test_processed = preprocessor.transform(features_test)

print("Processed features train shape:", features_train_processed.shape)
print("Processed features test shape:", features_test_processed.shape)

Categorical features: Index(['abilities', 'capture_rate', 'classfication', 'japanese_name', 'name',
       'type1', 'type2'],
      dtype='object')
Numerical features: Index(['attack', 'base_egg_steps', 'base_happiness', 'base_total', 'defense',
       'experience_growth', 'height_m', 'hp', 'percentage_male',
       'pokedex_number', 'sp_attack', 'sp_defense', 'speed', 'weight_kg',
       'generation', 'is_legendary'],
      dtype='object')
Processed features train shape: (640, 2295)
Processed features test shape: (161, 2295)


## Training and Testing
Choose an evaluation metric to score your models on. Then, train and optimize the hyperparameters for at least 3 different models and see which one works best.

If you've chosen a **classification problem**, your metric could be:
- Accuracy (default)
- Precision
- Recall
- F1 score
- ROC AUC
- etc...

Your models could be:
- `SVC` (linear or RBF)
- `KNeighborsClassifier`
- `DecisionTreeClassifier`
- `LogisticRegression`
- `RandomForestClassifier`
- etc...

If you've chosen a **regression problem**, your metric could be:
- R^2 score (default)
- Mean squared error (MSE) /root of mean squared error (RMSE)
- Mean absolute percentage error (MAPE)
- etc...

Your models could be:
- `SVR` (linear and/or RBF)
- `KNeighborsRegressor`
- `DecisionTreeRegressor`
- `Ridge` (linear model)
- `RandomForestRegressor`
- etc...

In [44]:
# Initialize multi-output regression models
# DecisionTreeRegressor and RandomForestRegressor support multi-output natively
dt_regressor = DecisionTreeRegressor(random_state=42)
rf_regressor = RandomForestRegressor(random_state=42)

# Ridge and KNeighborsRegressor do not natively support multi-output, so we wrap them
ridge_regressor = MultiOutputRegressor(Ridge())
knn_regressor = MultiOutputRegressor(KNeighborsRegressor())


models = {
    "Decision Tree Regressor": dt_regressor,
    "Random Forest Regressor": rf_regressor,
    "Ridge Regressor": ridge_regressor,
    "KNeighbors Regressor": knn_regressor
}

# Set the name of the model you want to train
selected_model_name = "Ridge Regressor" # Changed to Random Forest Regressor

# Define hyperparameters for the selected model
# This is an example for RandomForestRegressor
hyperparameters = {
    'estimator__alpha': 0.1, # Corrected parameter name
}

# # Select model
# selected_model_name = "KNeighbors Regressor"

# # Define hyperparameters
# hyperparameters = {
#     'estimator__n_neighbors': 7,
#     'estimator__weights': 'distance'
# }


# Train the selected model
model = models[selected_model_name]
print(f"Training {selected_model_name}...")
model.set_params(**hyperparameters)
model.fit(features_train_processed, target_train)
print("Training complete.")

Training Ridge Regressor...
Training complete.


## Final Report
Report your best model's metric. How do you feel about this score? Is there anything you could change?

If you have time, prepare a presentation or report discussing:
- Which dataset you picked
- What preprocessing you had to do
- Which columns you dropped, if any
- Which 3 models you chose
- Which evaluation metric(s) you chose
- What hyperparameter tuning was like
- Which model performed the best
- Your opinion on the best model's predictions

In [45]:
# Evaluate the models using R-squared as the metric
# Using model.score() which returns R-squared for regressors
from sklearn.metrics import r2_score # Still need this for the explicit r2_score calculation in case model.score() isn't available or for clarity


# Only evaluate the selected model
if selected_model_name in models:
    model = models[selected_model_name]
    # Use the score method, which returns R-squared for regressors
    r2 = model.score(features_test_processed, target_test)
else:
    print(f"Model '{selected_model_name}' not found in the models dictionary.")


# Print the results
print(f"\nModel Evaluation Results: {r2}")



Model Evaluation Results: 0.8742483297029524


## Inspecting Predictions
Inspect the actual and predicted values for a few test samples to understand the model's performance.

In [46]:
# Make predictions on the test set
predictions = model.predict(features_test_processed)

# Select the first few test samples
n_samples_to_show = 3

# Display the actual target values and the predictions
print("Actual target values for the first", n_samples_to_show, "test samples:")
display(target_test.head(n_samples_to_show))

print("\nPredicted values for the first", n_samples_to_show, "test samples:")
# We use a DataFrame for predictions to have the same structure as the target
predictions_df = pd.DataFrame(predictions, columns=target_test.columns, index=target_test.index)
display(predictions_df.head(n_samples_to_show))

Actual target values for the first 3 test samples:


Unnamed: 0,against_bug,against_dark,against_dragon,against_electric,against_fairy,against_fight,against_fire,against_flying,against_ghost,against_grass,against_ground,against_ice,against_normal,against_poison,against_psychic,against_rock,against_steel,against_water
697,1.0,1.0,1.0,1.0,1.0,4.0,1.0,0.5,1.0,2.0,2.0,0.5,0.5,0.5,1.0,2.0,4.0,2.0
668,0.5,0.5,0.0,1.0,1.0,0.5,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,1.0,1.0,2.0,1.0
63,2.0,2.0,1.0,1.0,1.0,0.5,1.0,1.0,2.0,1.0,1.0,1.0,1.0,1.0,0.5,1.0,1.0,1.0



Predicted values for the first 3 test samples:


Unnamed: 0,against_bug,against_dark,against_dragon,against_electric,against_fairy,against_fight,against_fire,against_flying,against_ghost,against_grass,against_ground,against_ice,against_normal,against_poison,against_psychic,against_rock,against_steel,against_water
697,1.036161,1.00512,1.000611,1.017508,1.012835,3.209468,1.002826,0.523258,1.006972,1.957249,1.940946,0.628245,0.519296,0.550003,0.994134,1.786701,3.180507,1.955635
668,0.638915,0.587413,0.158645,1.03374,1.038406,0.613704,1.041057,1.05217,0.978817,1.03261,1.061498,0.924116,0.987913,1.821106,0.969216,0.928438,1.842413,1.006634
63,1.812588,1.881477,0.99542,1.201309,1.057235,0.358759,0.976994,1.035365,1.925786,0.939259,0.847908,1.172729,0.986532,0.988934,0.563401,1.133518,1.004069,1.026302
