# ***Logistic Regression***

***Importing the necessary libraries***

In [24]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

***Now, we are importing the Iris dataset into our system***

In [25]:
from sklearn.datasets import load_iris
data = load_iris()
data

{'data': array([[5.1, 3.5, 1.4, 0.2],
        [4.9, 3. , 1.4, 0.2],
        [4.7, 3.2, 1.3, 0.2],
        [4.6, 3.1, 1.5, 0.2],
        [5. , 3.6, 1.4, 0.2],
        [5.4, 3.9, 1.7, 0.4],
        [4.6, 3.4, 1.4, 0.3],
        [5. , 3.4, 1.5, 0.2],
        [4.4, 2.9, 1.4, 0.2],
        [4.9, 3.1, 1.5, 0.1],
        [5.4, 3.7, 1.5, 0.2],
        [4.8, 3.4, 1.6, 0.2],
        [4.8, 3. , 1.4, 0.1],
        [4.3, 3. , 1.1, 0.1],
        [5.8, 4. , 1.2, 0.2],
        [5.7, 4.4, 1.5, 0.4],
        [5.4, 3.9, 1.3, 0.4],
        [5.1, 3.5, 1.4, 0.3],
        [5.7, 3.8, 1.7, 0.3],
        [5.1, 3.8, 1.5, 0.3],
        [5.4, 3.4, 1.7, 0.2],
        [5.1, 3.7, 1.5, 0.4],
        [4.6, 3.6, 1. , 0.2],
        [5.1, 3.3, 1.7, 0.5],
        [4.8, 3.4, 1.9, 0.2],
        [5. , 3. , 1.6, 0.2],
        [5. , 3.4, 1.6, 0.4],
        [5.2, 3.5, 1.5, 0.2],
        [5.2, 3.4, 1.4, 0.2],
        [4.7, 3.2, 1.6, 0.2],
        [4.8, 3.1, 1.6, 0.2],
        [5.4, 3.4, 1.5, 0.4],
        [5.2, 4.1, 1.5, 0.1],
  

In [26]:
df = pd.DataFrame(data.data, columns =data.feature_names)
df.head(2)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2


In [27]:
df["Target"] = pd.DataFrame(data.target)

***Here, we are going to take only two targets because logistic regression is a binary classification model but in our dataset there are three unique classes i.e. 0,1,2. So, we cannot use logistic regression in our case but in order to do so, we are going to drop one class i.e. 0 and will be using other two classes for the training and testing of our model***

In [29]:
df = df[(df["Target"] == 1) | (df["Target"] == 2)]

***We can see below that we are not using the class 0 in our data***

In [30]:
df

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),Target
50,7.0,3.2,4.7,1.4,1
51,6.4,3.2,4.5,1.5,1
52,6.9,3.1,4.9,1.5,1
53,5.5,2.3,4.0,1.3,1
54,6.5,2.8,4.6,1.5,1
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,2
146,6.3,2.5,5.0,1.9,2
147,6.5,3.0,5.2,2.0,2
148,6.2,3.4,5.4,2.3,2


***We are now going to convert our dataset into dependent and independent variables***

In [37]:
X = df.iloc[:, :-1]
y = df.iloc[:,-1]

***Now, we are going to convert the dependent and independent variables into training and testing data. So that we can train the model on the training data and test the created model on the test data***

In [39]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=40)

In [40]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()

***We now want to hypertune the model to train on the training data using different parameters and find the best parameters with the best score***

In [41]:
from sklearn.model_selection import GridSearchCV
parameters = {'penalty':['l1','l2','elasticnet'],'C':[1,2,3,4,5,6,10,20,30,40,50],'max_iter':[100,200,300]}

In [44]:
regressor = GridSearchCV(classifier, param_grid=parameters, scoring="accuracy", cv=5)
regressor.fit(X_train, y_train)

330 fits failed out of a total of 495.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
165 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Anaconda\Lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Anaconda\Lib\site-packages\sklearn\linear_model\_logistic.py", line 1091, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Anaconda\Lib\site-packages\sklearn\linear_model\_logistic.py", line 61, in _check_solver
    raise ValueError(
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.



***With the help of GridSearchCV we found the best parameters for this classifier***

In [45]:
print(regressor.best_params_)

{'C': 1, 'max_iter': 100, 'penalty': 'l2'}


In [46]:
print(regressor.best_score_)

0.9714285714285715


***We can now predict the dependent variable***

In [48]:
y_hat = regressor.predict(X_test)
y_hat

array([2, 2, 2, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 2, 1, 2, 2, 1, 2, 2, 2,
       2, 2, 1, 2, 2, 1, 2, 2])

***Score of the model can be obtained using the accuracy score from performance metrics*** 

In [49]:
from sklearn.metrics import accuracy_score, classification_report
score = accuracy_score(y_hat, y_test)
print(score)

0.9333333333333333


In [51]:
print(classification_report(y_hat, y_test))

              precision    recall  f1-score   support

           1       0.86      1.00      0.92        12
           2       1.00      0.89      0.94        18

    accuracy                           0.93        30
   macro avg       0.93      0.94      0.93        30
weighted avg       0.94      0.93      0.93        30

