In [1]:
#import libraries
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [2]:
#import data
# The file has no header row, so pandas labels the five columns 0-4 by position.
path = 'iris.data'
data = pd.read_csv(filepath_or_buffer=path, header=None)
data.head(n=5)

Unnamed: 0,0,1,2,3,4
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [3]:
#inspect the frame: prints row count, per-column non-null counts and dtypes (a text summary, not a plot)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   0       150 non-null    float64
 1   1       150 non-null    float64
 2   2       150 non-null    float64
 3   3       150 non-null    float64
 4   4       150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB


In [4]:
#convert target column to class number using pandas
# Species name -> numeric class id; applied in place on column 4, one species at a time.
species_to_class = {
    'Iris-setosa': 1,
    'Iris-versicolor': 2,
    'Iris-virginica': 3,
}
for species_name, class_id in species_to_class.items():
    is_species = data[4] == species_name
    data.loc[is_species, 4] = class_id
data

Unnamed: 0,0,1,2,3,4
0,5.1,3.5,1.4,0.2,1
1,4.9,3.0,1.4,0.2,1
2,4.7,3.2,1.3,0.2,1
3,4.6,3.1,1.5,0.2,1
4,5.0,3.6,1.4,0.2,1
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,3
146,6.3,2.5,5.0,1.9,3
147,6.5,3.0,5.2,2.0,3
148,6.2,3.4,5.4,2.3,3


In [5]:
#select data for label 1 if label = 1 -> 1 else 0
# Class 1 already carries the positive value 1, so only classes 2 and 3 need rewriting to 0.
data1 = data.copy()
negative_rows = data1[4].isin([2, 3])
data1.loc[negative_rows, 4] = 0
print(data1[4].value_counts())

0    100
1     50
Name: 4, dtype: int64


In [6]:
#select data for label 2 if label = 2 -> 1 else 0
data2 = data.copy()
# Order matters: zero out classes 1 and 3 first, then relabel class 2 as 1.
# Doing it the other way round would turn the fresh 1s into 0s as well.
negative_rows = data2[4].isin([1, 3])
data2.loc[negative_rows, 4] = 0
positive_rows = data2[4].eq(2)
data2.loc[positive_rows, 4] = 1
print(data2[4].value_counts())

0    100
1     50
Name: 4, dtype: int64


In [7]:
#select data for label 3 if label = 3 -> 1 else 0
data3 = data.copy()
# Zero out classes 1 and 2 before relabelling class 3 as 1, so the new 1s are not
# mistaken for original class-1 rows.
negative_rows = data3[4].isin([1, 2])
data3.loc[negative_rows, 4] = 0
positive_rows = data3[4].eq(3)
data3.loc[positive_rows, 4] = 1
print(data3[4].value_counts())

0    100
1     50
Name: 4, dtype: int64


In [8]:
#normalize data
def norm_data(data):
    """Standardize the feature columns of ``data`` and extract its integer targets.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose columns labelled 0 through 3 hold the features and whose
        column labelled 4 holds class labels that can be cast to ``int``.

    Returns
    -------
    tuple
        ``(norm_features, targets)``: the result of
        ``StandardScaler().fit_transform`` on the feature columns, and column 4
        cast to ``int``.

    Notes
    -----
    The scaler is fitted on every row of ``data`` that is passed in, so the
    scaling statistics include any rows that are later held out for testing.
    """
    # .loc slices by label, so the end label 3 is included (four feature columns).
    feature_columns = data.loc[:, 0:3]
    scaler = StandardScaler()
    scaled_features = scaler.fit_transform(feature_columns)
    class_labels = data[4].astype(int)
    return scaled_features, class_labels

# Standardized features and integer targets for each one-vs-rest frame (C1, C2, C3).
(norm_features1, targets1), (norm_features2, targets2), (norm_features3, targets3) = [
    norm_data(frame) for frame in (data1, data2, data3)
]

In [9]:
#separate data into train/test sets
# All three one-vs-rest problems use the same 75/25 split and the same seed.
split_options = {'test_size': 0.25, 'random_state': 42}
X_train1, X_test1, y_train1, y_test1 = train_test_split(norm_features1, targets1, **split_options)
X_train2, X_test2, y_train2, y_test2 = train_test_split(norm_features2, targets2, **split_options)
X_train3, X_test3, y_train3, y_test3 = train_test_split(norm_features3, targets3, **split_options)


In [10]:
#apply LDA
# One binary (one-vs-rest) discriminant model per species.
lda1 = LinearDiscriminantAnalysis()
lda2 = LinearDiscriminantAnalysis()
lda3 = LinearDiscriminantAnalysis()

# Store the LDA projections under their own names instead of overwriting X_train*.
# Overwriting made this cell non-idempotent: a second run would refit each model on
# the already-projected data, and predict() on the 4-feature test matrices would
# then fail on a feature-count mismatch. X_train* now always holds the original
# standardized training features.
X_train1_lda = lda1.fit_transform(X_train1, y_train1)
X_train2_lda = lda2.fit_transform(X_train2, y_train2)
X_train3_lda = lda3.fit_transform(X_train3, y_train3)

In [11]:
#test LDA
# Run each fitted model on its own held-out feature matrix.
fitted_models = (lda1, lda2, lda3)
held_out_sets = (X_test1, X_test2, X_test3)
y_pred1, y_pred2, y_pred3 = [
    model.predict(features)
    for model, features in zip(fitted_models, held_out_sets)
]

In [12]:
# Display names for the classification reports, listed as [label 0, label 1]:
# 0 is the "rest" class and 1 is the species the model was trained to detect.
t_names1 = ['not setosa', 'setosa']
t_names2 = ['not versicolor', 'versicolor']
# Spelling fixed ('verginica' -> 'virginica') to match the dataset's 'Iris-virginica' label.
t_names3 = ['not virginica', 'virginica']

# Held-out metrics for the setosa-vs-rest model.
report_setosa = classification_report(y_true=y_test1, y_pred=y_pred1, target_names=t_names1)
print(report_setosa)

              precision    recall  f1-score   support

  not setosa       1.00      1.00      1.00        23
      setosa       1.00      1.00      1.00        15

    accuracy                           1.00        38
   macro avg       1.00      1.00      1.00        38
weighted avg       1.00      1.00      1.00        38



In [13]:
# Held-out metrics for the versicolor-vs-rest model.
report_versicolor = classification_report(y_true=y_test2, y_pred=y_pred2, target_names=t_names2)
print(report_versicolor)

                precision    recall  f1-score   support

not versicolor       0.83      0.89      0.86        27
    versicolor       0.67      0.55      0.60        11

      accuracy                           0.79        38
     macro avg       0.75      0.72      0.73        38
  weighted avg       0.78      0.79      0.78        38



In [14]:
# Held-out metrics for the third one-vs-rest model (class 3 against the rest).
report_class3 = classification_report(y_true=y_test3, y_pred=y_pred3, target_names=t_names3)
print(report_class3)

               precision    recall  f1-score   support

not verginica       1.00      0.92      0.96        26
    verginica       0.86      1.00      0.92        12

     accuracy                           0.95        38
    macro avg       0.93      0.96      0.94        38
 weighted avg       0.95      0.95      0.95        38

