## Imports

In [67]:
# basic imports for libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# model imports
from sklearn.svm import SVC, SVR
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.linear_model import LogisticRegression, Ridge
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.multioutput import MultiOutputRegressor


# evaluation and training imports
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score, average_precision_score, mean_squared_error, r2_score, root_mean_squared_error, mean_absolute_percentage_error
from sklearn.model_selection import train_test_split

# preprocessing imports
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer

# Weeklong Project: Supervised Learning Model
Welcome to the starter file for your weeklong project! This is an opportunity for you to work independently with a supervised learning model. There will not be much guidance in this file, so you should rely on [scikit-learn documentation](https://scikit-learn.org/stable/api/index.html) and the other files we've worked on this week for help. Feel free to add new cells and imports as needed.
## Exploratory Data Analysis
Get to know your data a little better, and start planning out preprocessing steps.

In [68]:
data_root = "https://github.com/rachelkd/geering-up/raw/refs/heads/main/"

In [69]:
# TODO: Uncomment one of these lines to select a dataset
poke_data = pd.read_csv(data_root + "data/pokemon.csv")
# cereal_data = pd.read_csv(data_root + "data/cereal.csv")
# penguin_data = pd.read_csv(data_root + "data/penguins_size.csv")
poke_data


Unnamed: 0,abilities,against_bug,against_dark,against_dragon,against_electric,against_fairy,against_fight,against_fire,against_flying,against_ghost,...,percentage_male,pokedex_number,sp_attack,sp_defense,speed,type1,type2,weight_kg,generation,is_legendary
0,"['Overgrow', 'Chlorophyll']",1.00,1.0,1.0,0.5,0.5,0.5,2.0,2.0,1.0,...,88.1,1,65,65,45,grass,poison,6.9,1,0
1,"['Overgrow', 'Chlorophyll']",1.00,1.0,1.0,0.5,0.5,0.5,2.0,2.0,1.0,...,88.1,2,80,80,60,grass,poison,13.0,1,0
2,"['Overgrow', 'Chlorophyll']",1.00,1.0,1.0,0.5,0.5,0.5,2.0,2.0,1.0,...,88.1,3,122,120,80,grass,poison,100.0,1,0
3,"['Blaze', 'Solar Power']",0.50,1.0,1.0,1.0,0.5,1.0,0.5,1.0,1.0,...,88.1,4,60,50,65,fire,,8.5,1,0
4,"['Blaze', 'Solar Power']",0.50,1.0,1.0,1.0,0.5,1.0,0.5,1.0,1.0,...,88.1,5,80,65,80,fire,,19.0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
796,['Beast Boost'],0.25,1.0,0.5,2.0,0.5,1.0,2.0,0.5,1.0,...,,797,107,101,61,steel,flying,999.9,7,1
797,['Beast Boost'],1.00,1.0,0.5,0.5,0.5,2.0,4.0,1.0,1.0,...,,798,59,31,109,grass,steel,0.1,7,1
798,['Beast Boost'],2.00,0.5,2.0,0.5,4.0,2.0,0.5,1.0,0.5,...,,799,97,53,43,dark,dragon,888.0,7,1
799,['Prism Armor'],2.00,2.0,1.0,1.0,1.0,0.5,1.0,1.0,2.0,...,,800,127,89,79,psychic,,230.0,7,1


## Splitting Data
Identify your features and target, and split your data into training and testing sets.

In [70]:
# Select features
feature_columns = [col for col in poke_data.columns if not col.startswith('against_')]
features = poke_data[feature_columns]
# Select target columns
target_columns = [col for col in poke_data.columns if col.startswith('against_')]
target = poke_data[target_columns]

print("Features shape:", features.shape)
print("Target shape:", target.shape)

features_train, features_test, target_train, target_test = train_test_split(features, target, test_size=0.2, random_state=42)

print("Features train shape:", features_train.shape)
print("Features test shape:", features_test.shape)
print("Target train shape:", target_train.shape)
print("Target test shape:", target_test.shape)

Features shape: (801, 23)
Target shape: (801, 18)
Features train shape: (640, 23)
Features test shape: (161, 23)
Target train shape: (640, 18)
Target test shape: (161, 18)


## Preprocessing
Preprocess your data, keeping in mind the golden rule.

In [71]:
# Identify categorical and numerical columns
categorical_features = features.select_dtypes(include=['object', 'category']).columns
numerical_features = features.select_dtypes(include=np.number).columns

print("Categorical features:", categorical_features)
print("Numerical features:", numerical_features)

# Create transformers for numerical and categorical features
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Create a column transformer to apply different transformations to different columns
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Fit the preprocessor on the training data
features_train_processed = preprocessor.fit_transform(features_train)

# Transform the test data
features_test_processed = preprocessor.transform(features_test)

print("Processed features train shape:", features_train_processed.shape)
print("Processed features test shape:", features_test_processed.shape)

Categorical features: Index(['abilities', 'capture_rate', 'classfication', 'japanese_name', 'name',
       'type1', 'type2'],
      dtype='object')
Numerical features: Index(['attack', 'base_egg_steps', 'base_happiness', 'base_total', 'defense',
       'experience_growth', 'height_m', 'hp', 'percentage_male',
       'pokedex_number', 'sp_attack', 'sp_defense', 'speed', 'weight_kg',
       'generation', 'is_legendary'],
      dtype='object')
Processed features train shape: (640, 2295)
Processed features test shape: (161, 2295)


## Training and Testing
Choose an evaluation metric to score your models on. Then, train and optimize the hyperparameters for at least 3 different models and see which one works best.

If you've chosen a **classification problem**, your metric could be:
- Accuracy (default)
- Precision
- Recall
- F1 score
- ROC AUC
- etc...

Your models could be:
- `SVC` (linear or RBF)
- `KNeighborsClassifier`
- `DecisionTreeClassifier`
- `LogisticRegression`
- `RandomForestClassifier`
- etc...

If you've chosen a **regression problem**, your metric could be:
- R^2 score (default)
- Mean squared error (MSE) /root of mean squared error (RMSE)
- Mean absolute percentage error (MAPE)
- etc...

Your models could be:
- `SVR` (linear and/or RBF)
- `KNeighborsRegressor`
- `DecisionTreeRegressor`
- `Ridge` (linear model)
- `RandomForestRegressor`
- etc...

In [72]:
# Initialize multi-output regression models
# DecisionTreeRegressor and RandomForestRegressor support multi-output natively
dt_regressor = DecisionTreeRegressor(random_state=42)
rf_regressor = RandomForestRegressor(random_state=42)

# Ridge and KNeighborsRegressor do not natively support multi-output, so we wrap them
ridge_regressor = MultiOutputRegressor(Ridge())
knn_regressor = MultiOutputRegressor(KNeighborsRegressor())


models = {
    "Decision Tree Regressor": dt_regressor,
    "Random Forest Regressor": rf_regressor,
    "Ridge Regressor": ridge_regressor,
    "KNeighbors Regressor": knn_regressor
}

# Set the name of the model you want to train
selected_model_name = "Random Forest Regressor" # Changed to Random Forest Regressor

# Define hyperparameters for the selected model
# This is an example for RandomForestRegressor
hyperparameters = {
    'n_estimators': 200, # Number of trees
    'max_depth': 20, # Maximum depth of trees
    'min_samples_split': 5, # Minimum samples required to split
    'min_samples_leaf': 2, # Minimum samples at a leaf node
    'max_features': 'sqrt' # Number of features to consider for splits
}

# # Select model
# selected_model_name = "KNeighbors Regressor"

# # Define hyperparameters
# hyperparameters = {
#     'estimator__n_neighbors': 7,
#     'estimator__weights': 'distance'
# }


# Train the selected model
model = models[selected_model_name]
print(f"Training {selected_model_name}...")
model.set_params(**hyperparameters)
model.fit(features_train_processed, target_train)
print("Training complete.")


Training KNeighbors Regressor...
Training complete.


## Final Report
Report your best model's metric. How do you feel about this score? Is there anything you could change?

If you have time, prepare a presentation or report discussing:
- Which dataset you picked
- What preprocessing you had to do
- Which columns you dropped, if any
- Which 3 models you chose
- Which evaluation metric(s) you chose
- What hyperparameter tuning was like
- Which model performed the best
- Your opinion on the best model's predictions

In [73]:
# Evaluate the models using R-squared as the metric
# Using model.score() which returns R-squared for regressors
from sklearn.metrics import r2_score # Still need this for the explicit r2_score calculation in case model.score() isn't available or for clarity


# Only evaluate the selected model
if selected_model_name in models:
    model = models[selected_model_name]
    # Use the score method, which returns R-squared for regressors
    r2 = model.score(features_test_processed, target_test)
else:
    print(f"Model '{selected_model_name}' not found in the models dictionary.")


# Print the results
print(f"\nModel Evaluation Results: {r2}")



Model Evaluation Results: 0.1941193085124531


## Inspecting Predictions
Inspect the actual and predicted values for a few test samples to understand the model's performance.

In [74]:
# Make predictions on the test set
predictions = model.predict(features_test_processed)

# Select the first few test samples
n_samples_to_show = 3

# Display the actual target values and the predictions
print("Actual target values for the first", n_samples_to_show, "test samples:")
display(target_test.head(n_samples_to_show))

print("\nPredicted values for the first", n_samples_to_show, "test samples:")
# We use a DataFrame for predictions to have the same structure as the target
predictions_df = pd.DataFrame(predictions, columns=target_test.columns, index=target_test.index)
display(predictions_df.head(n_samples_to_show))

Actual target values for the first 3 test samples:


Unnamed: 0,against_bug,against_dark,against_dragon,against_electric,against_fairy,against_fight,against_fire,against_flying,against_ghost,against_grass,against_ground,against_ice,against_normal,against_poison,against_psychic,against_rock,against_steel,against_water
697,1.0,1.0,1.0,1.0,1.0,4.0,1.0,0.5,1.0,2.0,2.0,0.5,0.5,0.5,1.0,2.0,4.0,2.0
668,0.5,0.5,0.0,1.0,1.0,0.5,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,1.0,1.0,2.0,1.0
63,2.0,2.0,1.0,1.0,1.0,0.5,1.0,1.0,2.0,1.0,1.0,1.0,1.0,1.0,0.5,1.0,1.0,1.0



Predicted values for the first 3 test samples:


Unnamed: 0,against_bug,against_dark,against_dragon,against_electric,against_fairy,against_fight,against_fire,against_flying,against_ghost,against_grass,against_ground,against_ice,against_normal,against_poison,against_psychic,against_rock,against_steel,against_water
697,0.893827,0.929416,1.0,1.358192,0.856832,1.0,0.714305,1.284037,1.0,1.14444,1.356068,0.714305,1.0,1.14287,1.141167,1.286336,0.571435,1.141255
668,1.604395,1.225448,0.870597,0.776943,0.932834,0.924556,1.378947,1.446114,1.155817,0.709777,0.911276,1.378947,1.0,1.575517,0.854925,1.134333,1.062237,0.911276
63,1.438366,1.603648,1.0,0.868288,0.867326,0.631965,1.066454,1.067663,1.603648,0.800868,1.329397,1.066454,0.867579,1.033601,0.896807,1.132927,0.868283,1.066468
