In [1]:
# Import necessary libraries
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report


### Data Preprocessing

In [2]:
# Load the Penguins dataset
df = pd.read_csv('data/penguins.csv')

print(df.shape)
# drop NaNs
df = df.dropna(axis=0, how='any')
df = df.drop('Unnamed: 0', axis=1)


(344, 9)


In [16]:
df.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,year
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,male,2007
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,female,2007
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,female,2007
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,female,2007
5,Adelie,Torgersen,39.3,20.6,190.0,3650.0,male,2007


In [3]:
# Split the data into features (X) and target variable (y)
X = df.drop(['species', 'island', 'year'], axis=1)
y = df['species']

# Define categorical features
categorical_features = ['sex']

# Create a column transformer with one-hot encoding
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(), categorical_features)
    ],
    remainder='passthrough'
)

X_processed = preprocessor.fit_transform(X)


In [4]:
X_processed_df = pd.DataFrame(X_processed, columns=preprocessor.get_feature_names_out())

In [10]:
def renaming_fun(x):
    if "remainder__" in x:
        return x.strip('remainder__')
    return x

In [11]:
[renaming_fun(col) for col in X_processed_df.columns]

['cat__sex_female',
 'cat__sex_male',
 'bill_length',
 'bill_depth',
 'flipper_length',
 'body_mass_g']

In [12]:
X_processed_df

Unnamed: 0,cat__sex_female,cat__sex_male,remainder__bill_length_mm,remainder__bill_depth_mm,remainder__flipper_length_mm,remainder__body_mass_g
0,0.0,1.0,39.1,18.7,181.0,3750.0
1,1.0,0.0,39.5,17.4,186.0,3800.0
2,1.0,0.0,40.3,18.0,195.0,3250.0
3,1.0,0.0,36.7,19.3,193.0,3450.0
4,0.0,1.0,39.3,20.6,190.0,3650.0
...,...,...,...,...,...,...
328,0.0,1.0,55.8,19.8,207.0,4000.0
329,1.0,0.0,43.5,18.1,202.0,3400.0
330,0.0,1.0,49.6,18.2,193.0,3775.0
331,0.0,1.0,50.8,19.0,210.0,4100.0


### Model Training

In [15]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_processed, y, test_size=0.2, random_state=42)

# Create a logistic regression model
model = RandomForestClassifier(n_estimators=100, random_state=42)

# Fit the model to the training data
model.fit(X_train, y_train)
joblib.dump(model, 'models/model.joblib')
joblib.dump(preprocessor, 'models/preprocessor.joblib')


['models/preprocessor.joblib']

In [8]:

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

# Display the results
print(f"Accuracy: {accuracy:.2f}")
print("\nClassification Report:\n", classification_rep)


Accuracy: 1.00

Classification Report:
               precision    recall  f1-score   support

      Adelie       1.00      1.00      1.00        31
   Chinstrap       1.00      1.00      1.00        18
      Gentoo       1.00      1.00      1.00        18

    accuracy                           1.00        67
   macro avg       1.00      1.00      1.00        67
weighted avg       1.00      1.00      1.00        67

