In [41]:
import sklearn
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, r2_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import (accuracy_score, 
                            confusion_matrix, 
                            classification_report, 
                            precision_score, 
                            recall_score, 
                            f1_score
                            )


print(sklearn.__version__)


1.5.2


In [23]:
import warnings
warnings.filterwarnings("ignore")

In [24]:
from ucimlrepo import fetch_ucirepo 
  
# fetch dataset 
car_evaluation = fetch_ucirepo(id=19) 
  
# data (as pandas dataframes) 
X = car_evaluation.data.features 
y = car_evaluation.data.targets 
  
# metadata 
print(car_evaluation.metadata) 
print(car_evaluation.variables) 


{'uci_id': 19, 'name': 'Car Evaluation', 'repository_url': 'https://archive.ics.uci.edu/dataset/19/car+evaluation', 'data_url': 'https://archive.ics.uci.edu/static/public/19/data.csv', 'abstract': 'Derived from simple hierarchical decision model, this database may be useful for testing constructive induction and structure discovery methods.', 'area': 'Other', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 1728, 'num_features': 6, 'feature_types': ['Categorical'], 'demographics': [], 'target_col': ['class'], 'index_col': None, 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 1988, 'last_updated': 'Thu Aug 10 2023', 'dataset_doi': '10.24432/C5JP48', 'creators': ['Marko Bohanec'], 'intro_paper': {'ID': 249, 'type': 'NATIVE', 'title': 'Knowledge acquisition and explanation for multi-attribute decision making', 'authors': 'M. Bohanec, V. Rajkovič', 'venue': '8th Intl Workshop on Expert Systems and their Applications, 

In [25]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1728 entries, 0 to 1727
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   buying    1728 non-null   object
 1   maint     1728 non-null   object
 2   doors     1728 non-null   object
 3   persons   1728 non-null   object
 4   lug_boot  1728 non-null   object
 5   safety    1728 non-null   object
dtypes: object(6)
memory usage: 81.1+ KB


In [26]:
y['class'].value_counts()

class
unacc    1210
acc       384
good       69
vgood      65
Name: count, dtype: int64

Making Multicast calssification to  Binary classification

In [27]:
y['class'] = np.where(y['class']=='unacc', 0, 1) #0='unacc', 1='acc'
y['class'].value_counts(normalize=True)*100

class
0    70.023148
1    29.976852
Name: proportion, dtype: float64

The `stratify` parameter ensures that the training and test sets maintain the same class distribution as the original dataset. 

This is particularly useful for imbalanced datasets, as it prevents either set from being over- or under-represented in any class, which helps in building a more balanced model.

The distribution of y_train and y_test does not follows the same distribution as the original dataset. 

Hence, the model performance is not optimal. The `stratify` parameter helps in building a more balanced model.

In [28]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=24, stratify=y['class'])

In [29]:
print(y_train['class'].value_counts(normalize=True)*100)
print(y_test['class'].value_counts(normalize=True)*100)

class
0    70.057899
1    29.942101
Name: proportion, dtype: float64
class
0    69.942197
1    30.057803
Name: proportion, dtype: float64


In [30]:
ohe = OneHotEncoder(drop='first', handle_unknown='ignore', sparse_output=False).set_output(transform='pandas')

In [31]:
X_trn_ohe = ohe.fit_transform(X_train)
X_tst_ohe = ohe.transform(X_test)

In [32]:
lr = LogisticRegression()
lr.fit(X_trn_ohe, y_train)

In [33]:
y_pred = lr.predict(X_tst_ohe)

In [34]:
print("R2 score: ", r2_score(y_test, y_pred))
print("Accuracy score: ", accuracy_score(y_test, y_pred))

R2 score:  0.825863530408985
Accuracy score:  0.9633911368015414


| Solver           | Penalty                    | Multinomial Multiclass Support |
|------------------|----------------------------|--------------------------------|
| lbfgs          | `l2`, None                 | Yes                            |
| liblinear      | `l1`, `l2`                 | No                             |
| newtoncg`      | `l2`, None                 | Yes                            |
| newton-cholesky| `l2`, None                 | No                             |
| sag            | `l2`, None                 | Yes                            |
| saga           | `elasticnet`, `l1`, `l2`, None | Yes                       |


In [35]:
#Using solver=liblinear and penalty='l2'
lr = LogisticRegression(solver='liblinear', penalty='l2')
lr.fit(X_trn_ohe, y_train)
y_pred = lr.predict(X_tst_ohe)

print("R2 score: ", r2_score(y_test, y_pred))
print("Accuracy score: ", accuracy_score(y_test, y_pred))

R2 score:  0.8075333757151939
Accuracy score:  0.9595375722543352


In [36]:
#Using solver=lbfgs and penalty='l2'
lr = LogisticRegression(solver='lbfgs', penalty='l2')
lr.fit(X_trn_ohe, y_train)
y_pred = lr.predict(X_tst_ohe)

print("R2 score: ", r2_score(y_test, y_pred))
print("Accuracy score: ", accuracy_score(y_test, y_pred))

R2 score:  0.825863530408985
Accuracy score:  0.9633911368015414


---

In [37]:
ohe = OneHotEncoder(
    drop="first",
    handle_unknown="ignore",
    sparse_output=False
).set_output(transform="pandas")

lr = LogisticRegression()

pipe = Pipeline([
    ("ohe", ohe),
    ("lr", lr)
])

params = {
    "lr__solver": [
        "lbfgs",
        "liblinear",
        "newton-cg",
        "sag",
        "saga"
        "newton-cholesky"
    ]
}

gcv = GridSearchCV(
    estimator=pipe,
    param_grid=params,
    cv=5,
)

gcv.fit(X, y)

print(gcv.best_params_)
print(gcv.best_score_)
 

{'lr__solver': 'newton-cg'}
0.829761246544358


---


- Logistic Regression
    - Multinomial Logistic Regression
    - One-vs-All Logistic Regression
    - Binary Logistic Regression


Multinomial Logistic Regression

In [44]:
# Assuming X contains categorical variables
# First encode categorical variables
label_encoder = LabelEncoder()

# If X is a DataFrame, encode each categorical column
if isinstance(X, pd.DataFrame):
    for column in X.columns:
        if X[column].dtype == 'object':
            X[column] = label_encoder.fit_transform(X[column])
# If X is a Series or single column
elif isinstance(X, pd.Series):
    X = label_encoder.fit_transform(X)

# Encode target variable if it's categorical
if isinstance(y, (pd.Series, pd.DataFrame)):
    y = label_encoder.fit_transform(y)

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Fit the model
model = LogisticRegression(multi_class='multinomial', solver='lbfgs')
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Compute metrics
accuracy = accuracy_score(y_test, y_pred) 
# Confusion Matrix
cm = confusion_matrix(y_test, y_pred)

# Print results
print(f"Accuracy: {accuracy:.4f}") 
print("\nConfusion Matrix:")
print(cm)


Accuracy: 0.6647

Confusion Matrix:
[[201  39]
 [ 77  29]]


One-vs-All Logistic Regression

In [45]:
# Assuming X contains categorical variables
# First encode categorical variables
label_encoder = LabelEncoder()

# If X is a DataFrame, encode each categorical column
if isinstance(X, pd.DataFrame):
    for column in X.columns:
        if X[column].dtype == 'object':
            X[column] = label_encoder.fit_transform(X[column])
# If X is a Series or single column
elif isinstance(X, pd.Series):
    X = label_encoder.fit_transform(X)

# Encode target variable if it's categorical
if isinstance(y, (pd.Series, pd.DataFrame)):
    y = label_encoder.fit_transform(y)

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Fit the model
model = LogisticRegression(multi_class='ovr', solver='lbfgs')
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Compute metrics
accuracy = accuracy_score(y_test, y_pred) 
# Confusion Matrix
cm = confusion_matrix(y_test, y_pred)

# Print results
print(f"Accuracy: {accuracy:.4f}") 
print("\nConfusion Matrix:")
print(cm)


Accuracy: 0.6647

Confusion Matrix:
[[201  39]
 [ 77  29]]


Multiclass with GridSearchCV

In [46]:
ohe = OneHotEncoder(
    drop="first",
    handle_unknown="ignore",
    sparse_output=False
).set_output(transform="pandas")

lr = LogisticRegression()

pipe = Pipeline([
    ("ohe", ohe),
    ("lr", lr)
])

params = {
    "lr__solver": [
        "lbfgs",
        "liblinear",
        "newton-cg",
        "sag",
        "saga"
        "newton-cholesky"
    ],
    "lr__multi_class": [
        "ovr",
        "multinomial"
    ]
}

gcv = GridSearchCV(
    estimator=pipe,
    param_grid=params,
    cv=5,
)

gcv.fit(X, y)

print(gcv.best_params_)
print(gcv.best_score_)
 

{'lr__multi_class': 'multinomial', 'lr__solver': 'lbfgs'}
0.8303409566892854
