In [23]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, recall_score,precision_score,confusion_matrix,confusion_matrix


In [2]:
# Load the Iris dataset and assemble the features plus the class index
# into a single DataFrame (np.column_stack upcasts the index to float).
from sklearn.datasets import load_iris

iris = load_iris()
data = pd.DataFrame(
    data=np.column_stack([iris['data'], iris['target']]),
    columns=[*iris['feature_names'], 'target'],
)


In [3]:
# Summary statistics (count, mean, std, min, quartiles, max) for every
# numeric column of `data`, including the numeric `target` column.
print(data.describe())

       sepal length (cm)  sepal width (cm)  petal length (cm)   
count         150.000000        150.000000         150.000000  \
mean            5.843333          3.057333           3.758000   
std             0.828066          0.435866           1.765298   
min             4.300000          2.000000           1.000000   
25%             5.100000          2.800000           1.600000   
50%             5.800000          3.000000           4.350000   
75%             6.400000          3.300000           5.100000   
max             7.900000          4.400000           6.900000   

       petal width (cm)      target  
count        150.000000  150.000000  
mean           1.199333    1.000000  
std            0.762238    0.819232  
min            0.100000    0.000000  
25%            0.300000    0.000000  
50%            1.300000    1.000000  
75%            1.800000    2.000000  
max            2.500000    2.000000  


In [4]:
# Show the assembled frame; pandas elides the middle rows in the printout.
print(data)

     sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)   
0                  5.1               3.5                1.4               0.2  \
1                  4.9               3.0                1.4               0.2   
2                  4.7               3.2                1.3               0.2   
3                  4.6               3.1                1.5               0.2   
4                  5.0               3.6                1.4               0.2   
..                 ...               ...                ...               ...   
145                6.7               3.0                5.2               2.3   
146                6.3               2.5                5.0               1.9   
147                6.5               3.0                5.2               2.0   
148                6.2               3.4                5.4               2.3   
149                5.9               3.0                5.1               1.8   

     target  
0       0.0  

In [5]:

# Create a binary classification problem: 'virginica' vs. 'non-virginica'.
# The labels are derived from the untouched `iris` bunch rather than from
# data['target'] itself, so this cell is idempotent: re-running it after the
# column already holds strings no longer turns every row into 'non-virginica'.
is_virginica = iris['target_names'][iris['target']] == 'virginica'
data['target'] = np.where(is_virginica, 'virginica', 'non-virginica')


In [7]:
# Show the frame again now that `target` holds the string labels
# 'virginica' / 'non-virginica' instead of the numeric class index.
print(data)

     sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)   
0                  5.1               3.5                1.4               0.2  \
1                  4.9               3.0                1.4               0.2   
2                  4.7               3.2                1.3               0.2   
3                  4.6               3.1                1.5               0.2   
4                  5.0               3.6                1.4               0.2   
..                 ...               ...                ...               ...   
145                6.7               3.0                5.2               2.3   
146                6.3               2.5                5.0               1.9   
147                6.5               3.0                5.2               2.0   
148                6.2               3.4                5.4               2.3   
149                5.9               3.0                5.1               1.8   

            target  
0    n

In [28]:
# Features (X) are the raw measurement matrix; the label (y) is a boolean
# array that is True exactly where the sample's species is virginica.
X = iris['data']
y = iris['target_names'][iris['target']] == 'virginica'


In [29]:

# Hold out 20% of the samples for testing; the fixed random_state keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [30]:
# Inspect the held-out boolean labels (True marks a virginica sample).
y_test

array([False, False,  True, False, False, False, False,  True, False,
       False,  True, False, False, False, False, False,  True, False,
       False,  True, False,  True, False,  True,  True,  True,  True,
        True, False, False])

In [31]:
# Fit a logistic-regression classifier on the training split.
log_reg = LogisticRegression(random_state=42)
log_reg.fit(X=X_train, y=y_train)

In [32]:
# Score the held-out samples: hard class predictions, plus the predicted
# probability taken from column 1 of predict_proba (the True / virginica
# class, since the classes are ordered False, True).
y_pred = log_reg.predict(X=X_test)
y_pred_proba = log_reg.predict_proba(X=X_test)[:, 1]

y_pred

array([False, False,  True, False, False, False, False,  True, False,
       False,  True, False, False, False, False, False,  True, False,
       False,  True, False,  True, False,  True,  True,  True,  True,
        True, False, False])

To evaluate the logistic regression model, we use a confusion matrix.



In [33]:
# Tabulate true labels (rows) against predicted labels (columns).
cnf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cnf_matrix)

[[19  0]
 [ 0 11]]


There are no false positives or false negatives, only true positives (11)
and true negatives (19).

Cross validation



In [34]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validated accuracy, computed on the training split only.
scores = cross_val_score(
    estimator=log_reg,
    X=X_train,
    y=y_train,
    scoring='accuracy',
    cv=10,
)
print(scores)
scores.mean()

[1.         1.         0.91666667 1.         0.91666667 0.83333333
 1.         1.         1.         0.91666667]


0.9583333333333333

Precision and recall



In [37]:
# Precision and recall of the positive (True / virginica) class on the test set.
precision = precision_score(y_true=y_test, y_pred=y_pred)
recall = recall_score(y_true=y_test, y_pred=y_pred)

print(f"The precision score is {precision} and recall score is {recall}")

The precision score is 1.0 and recall score is 1.0


From the above, we find that both precision and recall are equal to 1.
A precision of 1 means that every instance the model identified as "Virginica" was a true positive.

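A recall of 1 shows that every actual "Virginica" iris flower in the test set was correctly identified by our model; there are no false negatives. Because the test set contains only 30 samples and the cross-validated accuracy above is about 0.958, these perfect scores should not be read as a guarantee of perfect performance on new data.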

