## Data collection and pre-processing

In [19]:
import pandas as pd
iris_data = pd.read_csv("data/Iris.csv")

In [20]:
iris_data.head()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


In [21]:
iris_data.shape

(150, 6)

In [22]:
# getting some info about the data
iris_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             150 non-null    int64  
 1   SepalLengthCm  150 non-null    float64
 2   SepalWidthCm   150 non-null    float64
 3   PetalLengthCm  150 non-null    float64
 4   PetalWidthCm   150 non-null    float64
 5   Species        150 non-null    object 
dtypes: float64(4), int64(1), object(1)
memory usage: 7.2+ KB


In [23]:
# check the number of missing values in each column
iris_data.isnull().sum()

Id               0
SepalLengthCm    0
SepalWidthCm     0
PetalLengthCm    0
PetalWidthCm     0
Species          0
dtype: int64

In [24]:
# droping the id column
iris_data=iris_data.drop(columns="Id",axis=1)

In [25]:
iris_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   SepalLengthCm  150 non-null    float64
 1   SepalWidthCm   150 non-null    float64
 2   PetalLengthCm  150 non-null    float64
 3   PetalWidthCm   150 non-null    float64
 4   Species        150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB


In [26]:
# converting data to have numerical value for Species
species_mapping = {"Iris-setosa": 0, "Iris-versicolor": 1, "Iris-virginica": 2}
iris_data["Species"] = iris_data["Species"].replace(species_mapping).astype(int)

  iris_data["Species"] = iris_data["Species"].replace(species_mapping).astype(int)


In [27]:
iris_data.head()

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [28]:
iris_data.tail()

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
145,6.7,3.0,5.2,2.3,2
146,6.3,2.5,5.0,1.9,2
147,6.5,3.0,5.2,2.0,2
148,6.2,3.4,5.4,2.3,2
149,5.9,3.0,5.1,1.8,2


## separating features and target

In [29]:
X = iris_data.drop(columns=["Species"],axis=1)
Y = iris_data["Species"]

In [30]:
print(X)

     SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm
0              5.1           3.5            1.4           0.2
1              4.9           3.0            1.4           0.2
2              4.7           3.2            1.3           0.2
3              4.6           3.1            1.5           0.2
4              5.0           3.6            1.4           0.2
..             ...           ...            ...           ...
145            6.7           3.0            5.2           2.3
146            6.3           2.5            5.0           1.9
147            6.5           3.0            5.2           2.0
148            6.2           3.4            5.4           2.3
149            5.9           3.0            5.1           1.8

[150 rows x 4 columns]


In [31]:
print(Y)

0      0
1      0
2      0
3      0
4      0
      ..
145    2
146    2
147    2
148    2
149    2
Name: Species, Length: 150, dtype: int64


In [32]:
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=2)

In [33]:
# print(X.shape,X_train.shape,X_test.shape)
print(Y_train.value_counts())
print(Y_test.value_counts())

Species
1    40
2    40
0    40
Name: count, dtype: int64
Species
0    10
2    10
1    10
Name: count, dtype: int64


## Train our logistic regration model

In [34]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver='lbfgs')

In [35]:
model.fit(X_train,Y_train)

## Model Evaluation , Accuracy score

### Accuracy score on tranning data

In [36]:
X_train_prediction  = model.predict(X_train)
from sklearn.metrics import accuracy_score
tranning_data_accuracy = accuracy_score(X_train_prediction,Y_train)
print("Accuracy on tranning data ", tranning_data_accuracy * 100)

Accuracy on tranning data  96.66666666666667


### Accuracy on test data

In [37]:
X_test_prediction  = model.predict(X_test)
from sklearn.metrics import accuracy_score
test_data_accuracy = accuracy_score(X_test_prediction,Y_test)
print("Accuracy on tranning data ", test_data_accuracy * 100)

Accuracy on tranning data  100.0
