In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, roc_auc_score, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split, KFold

  from pandas.core.computation.check import NUMEXPR_INSTALLED


## Logistic Regression

#### Training

In [2]:
df = pd.read_csv('Iris.csv')

In [3]:
df

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa
...,...,...,...,...,...,...
145,146,6.7,3.0,5.2,2.3,Iris-virginica
146,147,6.3,2.5,5.0,1.9,Iris-virginica
147,148,6.5,3.0,5.2,2.0,Iris-virginica
148,149,6.2,3.4,5.4,2.3,Iris-virginica


In [4]:
model = LogisticRegression()
X = df.drop(['Id','Species'],axis=1)
y = df['Species']

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [6]:
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

In [7]:
model.fit(X_train,y_train)

LogisticRegression()

#### Model hyper-paremeters

In [8]:
# trees_depth = []
# for x in model.estimators_:
#     trees_depth.append(x.get_depth())
# np.max(trees_depth)

#### Validation

In [9]:
y_val_pred = model.predict(X_val)

In [10]:
# confusion matrix
confusion_matrix(y_val, y_val_pred)

array([[ 8,  0,  0],
       [ 0, 10,  1],
       [ 0,  0,  5]], dtype=int64)

In [11]:
# model's accuracy
accuracy_score(y_val, y_val_pred)

0.9583333333333334

In [12]:
# f1 score for unbalanced classes
f1_score(y_val, y_val_pred, average='weighted')

0.9592352092352092

#### Cross-validation

In [13]:
kf = KFold(n_splits=5, shuffle=True)
next(kf.split(X))

(array([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
         15,  19,  21,  22,  23,  24,  25,  26,  27,  28,  29,  30,  32,
         33,  35,  36,  38,  40,  42,  44,  45,  46,  47,  48,  49,  50,
         52,  53,  57,  58,  59,  60,  61,  62,  63,  64,  65,  66,  68,
         69,  70,  71,  72,  74,  75,  76,  77,  78,  79,  80,  81,  82,
         83,  84,  85,  86,  87,  88,  89,  91,  92,  93,  94,  96,  97,
         98,  99, 100, 103, 104, 105, 106, 107, 109, 110, 111, 112, 114,
        115, 116, 117, 118, 120, 121, 124, 125, 126, 127, 128, 129, 130,
        131, 133, 134, 135, 136, 137, 138, 139, 141, 142, 143, 144, 145,
        146, 148, 149]),
 array([ 13,  14,  16,  17,  18,  20,  31,  34,  37,  39,  41,  43,  51,
         54,  55,  56,  67,  73,  90,  95, 101, 102, 108, 113, 119, 122,
        123, 132, 140, 147]))

In [17]:
batch_true = []
batch_pred = []
for train_idx, test_idx in kf.split(X):
    X_train_batch = X.iloc[train_idx]
    y_train_batch = y.iloc[train_idx]
    X_test_batch = X.iloc[test_idx]
    y_test_batch = y.iloc[test_idx]
    model_batch = LogisticRegression()
    model_batch.fit(X_train_batch, y_train_batch)
    batch_true.append(y_test_batch)
    batch_pred.append(pd.Series(model_batch.predict(X_test_batch), index = test_idx))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [18]:
y_pred = pd.concat(batch_pred).sort_index()
y_pred

0         Iris-setosa
1         Iris-setosa
2         Iris-setosa
3         Iris-setosa
4         Iris-setosa
            ...      
145    Iris-virginica
146    Iris-virginica
147    Iris-virginica
148    Iris-virginica
149    Iris-virginica
Length: 150, dtype: object

In [19]:
y_true = pd.concat(batch_true).sort_index()
y_true

0         Iris-setosa
1         Iris-setosa
2         Iris-setosa
3         Iris-setosa
4         Iris-setosa
            ...      
145    Iris-virginica
146    Iris-virginica
147    Iris-virginica
148    Iris-virginica
149    Iris-virginica
Name: Species, Length: 150, dtype: object

In [20]:
confusion_matrix(y_true,y_pred)

array([[50,  0,  0],
       [ 0, 47,  3],
       [ 0,  3, 47]], dtype=int64)

#### Test

In [21]:
y_test_pred = model.predict(X_test)

In [22]:
confusion_matrix(y_test, y_test_pred)

array([[10,  0,  0],
       [ 0,  9,  0],
       [ 0,  0, 11]], dtype=int64)

In [23]:
accuracy_score(y_test, y_test_pred)

1.0

In [24]:
f1_score(y_test, y_test_pred, average='weighted')

1.0