<h1 style="background-color: rgba(0, 255, 0, 0.5)">LOGISTIC REGRESSION (binary)</h1>

*Train and evaluate a logistic regression model on a binary classification problem*

In [1]:
# import packages

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import log_loss


In [2]:
# read dataset

# The column names are passed explicitly through `names`: "id" and "diagnosis",
# followed by ten measurements, each present in a "_mean", "_se" and "_worst" variant.
WDBC_URL = "https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data"

measurement_names = [
    "radius", "texture", "perimeter", "area", "smoothness",
    "compactness", "concavity", "concave_points", "symmetry", "fractal_dimension",
]

# all "_mean" columns come first, then all "_se" columns, then all "_worst" columns
column_names = ["id", "diagnosis"] + [
    f"{measurement}_{suffix}"
    for suffix in ("mean", "se", "worst")
    for measurement in measurement_names
]

breast_cancer = pd.read_csv(WDBC_URL, names=column_names)
breast_cancer.head()


Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave_points_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave_points_worst,symmetry_worst,fractal_dimension_worst
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [3]:
# dataset info: print a summary of the DataFrame (number of entries, column names,
# non-null counts and dtypes) to check for missing values and non-numeric columns

breast_cancer.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 32 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   id                       569 non-null    int64  
 1   diagnosis                569 non-null    object 
 2   radius_mean              569 non-null    float64
 3   texture_mean             569 non-null    float64
 4   perimeter_mean           569 non-null    float64
 5   area_mean                569 non-null    float64
 6   smoothness_mean          569 non-null    float64
 7   compactness_mean         569 non-null    float64
 8   concavity_mean           569 non-null    float64
 9   concave_points_mean      569 non-null    float64
 10  symmetry_mean            569 non-null    float64
 11  fractal_dimension_mean   569 non-null    float64
 12  radius_se                569 non-null    float64
 13  texture_se               569 non-null    float64
 14  perimeter_se             5

In [4]:
# build the feature matrix X and the target vector y used to train the model

# every column except "id" and "diagnosis" is used as a feature
non_feature_columns = ["id", "diagnosis"]

X = breast_cancer.drop(columns=non_feature_columns).values
y = breast_cancer["diagnosis"].values


In [5]:
# split the data into train and test sets

# 30% of the samples are held out for testing; the fixed random_state makes the split repeatable
split_options = {"test_size": 0.3, "random_state": 0}

X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)


In [6]:
# encode the target variable to make it numerical (0 or 1, because the problem is binary)

# learn the label -> integer mapping from the training labels only,
# then apply that same mapping to both splits
le = LabelEncoder().fit(y_train)
y_train = le.transform(y_train)
y_test = le.transform(y_test)
y_train


array([0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1,
       0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1,
       0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0,
       1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0,
       1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0,
       0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1,
       0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1,
       0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1,
       1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0,

In [7]:
# standardize the matrix of features X

# the scaler is fitted on the training split only; the fitted scaler is then
# applied unchanged to both the training and the test split
ss = StandardScaler().fit(X_train)
X_train = ss.transform(X_train)
X_test = ss.transform(X_test)


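We call `fit_transform()` on the training set but only `transform()` on the test set: the scaling parameters must be learned from the training data alone and then reused unchanged on the test data, so that no information from the test set leaks into the model. Read this article for a more detailed explanation:
https://towardsdatascience.com/what-and-why-behind-fit-transform-vs-transform-in-scikit-learn-78f915cf96fe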

In [8]:
# train and evaluate one model per hyperparameter value

CC = [0.1, 1, 10]  # candidate values for the C hyperparameter of LogisticRegression

for C in CC:

    # fit an L2-penalized logistic regression with the current value of C
    lr = LogisticRegression(penalty="l2", C=C).fit(X_train, y_train)

    # predicted labels and predicted class probabilities on the train set
    y_pred_tr = lr.predict(X_train)
    y_pred_tr_prob = lr.predict_proba(X_train)

    # predicted labels and predicted class probabilities on the test set
    y_pred_test = lr.predict(X_test)
    y_pred_test_prob = lr.predict_proba(X_test)

    # accuracy is computed from the predicted labels
    acc_tr = accuracy_score(y_train, y_pred_tr)
    acc = accuracy_score(y_test, y_pred_test)

    # log loss is computed from the predicted probabilities
    lloss_tr = log_loss(y_train, y_pred_tr_prob)
    lloss = log_loss(y_test, y_pred_test_prob)

    # print results (the value reported as "Alpha" is 1/C)
    print("Alpha = " + str(1/C))
    print("Accuracy test = %.2f; Accuracy train %.2f" % (acc, acc_tr))
    print("Log loss test = %.2f; Log loss train %.2f" % (lloss, lloss_tr))
    print("")


Alpha = 10.0
Accuracy test = 0.97; Accuracy train 0.98
Log loss test = 0.10; Log loss train 0.09

Alpha = 1.0
Accuracy test = 0.98; Accuracy train 0.99
Log loss test = 0.09; Log loss train 0.05

Alpha = 0.1
Accuracy test = 0.95; Accuracy train 0.99
Log loss test = 0.21; Log loss train 0.03

