In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import statsmodels.formula.api as smf
import sklearn.metrics
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [2]:
penguins = pd.read_csv('DATA/penguins.csv')
penguins = penguins.drop(columns=penguins.columns[0]).dropna()
penguins.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 333 entries, 0 to 343
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   species            333 non-null    object 
 1   island             333 non-null    object 
 2   bill_length_mm     333 non-null    float64
 3   bill_depth_mm      333 non-null    float64
 4   flipper_length_mm  333 non-null    float64
 5   body_mass_g        333 non-null    float64
 6   sex                333 non-null    object 
dtypes: float64(4), object(3)
memory usage: 20.8+ KB


In [3]:
# this is equivalent to 
#formula = 'species ~ C(island) + bill_length_mm + bill_depth_mm + flipper_length_mm + body_mass_g  + C(sex)'

y, X = penguins.species, penguins.drop(columns='species')
X = pd.get_dummies(X, drop_first=True)

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

In [5]:
# run the model and then make the predictions as before

In [6]:
log_model = LogisticRegression(random_state=42, n_jobs=-1).fit(X_train, y_train)

In [7]:
y_pred_test = log_model.predict(X_test)

In [8]:
confusion_matrix(y_test, y_pred_test)

array([[35,  1,  1],
       [ 0, 17,  0],
       [ 0,  0, 30]])

In [9]:
log_model.classes_

array(['Adelie', 'Chinstrap', 'Gentoo'], dtype=object)

In [10]:
log_model.intercept_

array([ 0.02598165, -0.00759172, -0.01838993])

In [11]:
log_model.coef_

array([[-7.90237177e-01,  1.09031213e+00,  8.76863229e-02,
         2.92120274e-06, -5.43003067e-02,  1.29613890e-01,
         1.02151060e-01],
       [ 1.27363295e+00,  5.14923671e-02, -1.60121918e-01,
        -6.33140709e-03,  1.54749678e-01, -9.19881705e-02,
         2.83484267e-02],
       [-4.83395776e-01, -1.14180450e+00,  7.24355954e-02,
         6.32848589e-03, -1.00449371e-01, -3.76257196e-02,
        -1.30499487e-01]])

In [12]:
# let's make a DIFFERENT model.

In [13]:
# this is equivalent to 
#formula = 'species ~ C(island) + bill_length_mm + bill_depth_mm + body_mass_g + C(sex)'

y2, X2 = penguins.species, penguins.drop(columns=['species','flipper_length_mm'] )

X2 = pd.get_dummies(X2, drop_first=True)

X_train2, X_test2, y_train2, y_test2 = train_test_split(X2, y2, stratify=y, random_state=42 )

In [14]:
log_model2 = LogisticRegression(random_state=42, n_jobs=-1).fit(X_train2, y_train2)

In [15]:
y_pred_test2 = log_model2.predict(X_test2)
confusion_matrix(y_test2, y_pred_test2)

array([[36,  0,  1],
       [ 0, 17,  0],
       [ 0,  0, 30]])