In [22]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.compose import make_column_selector, make_column_transformer
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
 

In [11]:
# Location of the Satellite data set (a semicolon-separated CSV file).
# The default is the original author's local copy; set the SATELLITE_CSV
# environment variable to point at the file on another machine instead of
# editing this cell.
DATA_PATH = os.environ.get(
    'SATELLITE_CSV',
    r'C:\Users\DAI.STUDENTSDC\Desktop\Machine Learning\Data Sets\Cases\Satellite Imaging\Satellite.csv',
)
df = pd.read_csv(DATA_PATH, sep=';')

# Separate the predictor columns from the target column 'classes'.
X = df.drop(['classes'], axis=1)
y = df['classes']

In [12]:
# Number of samples per target class, most frequent class first.
y.value_counts()

classes
red soil               1533
very damp grey soil    1508
grey soil              1358
vegetation stubble      707
cotton crop             703
damp grey soil          626
Name: count, dtype: int64

In [14]:
# Summary statistics (count, mean, std, min, quartiles, max) for each feature column.
X.describe()

Unnamed: 0,x.1,x.2,x.3,x.4,x.5,x.6,x.7,x.8,x.9,x.10,...,x.27,x.28,x.29,x.30,x.31,x.32,x.33,x.34,x.35,x.36
count,6435.0,6435.0,6435.0,6435.0,6435.0,6435.0,6435.0,6435.0,6435.0,6435.0,...,6435.0,6435.0,6435.0,6435.0,6435.0,6435.0,6435.0,6435.0,6435.0,6435.0
mean,69.4,83.594872,99.290598,82.592696,69.150272,83.243512,99.110645,82.497125,68.912354,82.893085,...,99.214763,82.660606,68.944056,83.14561,99.111888,82.618026,68.727584,82.858897,98.92603,82.505361
std,13.605871,22.882234,16.645944,18.897674,13.561197,22.886495,16.664088,18.940923,13.470599,22.862255,...,16.612514,18.991281,13.492684,22.847199,16.704305,19.043661,13.401603,22.816959,16.695488,19.054274
min,39.0,27.0,53.0,33.0,39.0,27.0,50.0,29.0,40.0,27.0,...,50.0,29.0,39.0,27.0,50.0,29.0,39.0,27.0,50.0,29.0
25%,60.0,71.0,85.0,69.0,60.0,71.0,85.0,69.0,60.0,71.0,...,85.0,69.0,60.0,71.0,85.0,69.0,60.0,71.0,85.0,68.0
50%,68.0,87.0,101.0,81.0,68.0,85.0,101.0,81.0,67.0,85.0,...,100.0,81.0,68.0,85.0,100.0,81.0,67.0,84.0,100.0,81.0
75%,80.0,103.0,113.0,92.0,80.0,103.0,113.0,92.0,79.0,102.0,...,113.0,92.0,79.0,103.0,113.0,92.0,79.0,103.0,113.0,92.0
max,104.0,137.0,140.0,154.0,104.0,137.0,145.0,157.0,104.0,130.0,...,140.0,154.0,104.0,130.0,145.0,157.0,104.0,130.0,145.0,157.0


In [15]:
# Column names, non-null counts and dtypes for the full frame (features plus target).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6435 entries, 0 to 6434
Data columns (total 37 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   x.1      6435 non-null   int64 
 1   x.2      6435 non-null   int64 
 2   x.3      6435 non-null   int64 
 3   x.4      6435 non-null   int64 
 4   x.5      6435 non-null   int64 
 5   x.6      6435 non-null   int64 
 6   x.7      6435 non-null   int64 
 7   x.8      6435 non-null   int64 
 8   x.9      6435 non-null   int64 
 9   x.10     6435 non-null   int64 
 10  x.11     6435 non-null   int64 
 11  x.12     6435 non-null   int64 
 12  x.13     6435 non-null   int64 
 13  x.14     6435 non-null   int64 
 14  x.15     6435 non-null   int64 
 15  x.16     6435 non-null   int64 
 16  x.17     6435 non-null   int64 
 17  x.18     6435 non-null   int64 
 18  x.19     6435 non-null   int64 
 19  x.20     6435 non-null   int64 
 20  x.21     6435 non-null   int64 
 21  x.22     6435 non-null   int64 
 22  

In [16]:
# Hold out 30% of the rows as a test set; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=24,
)

In [18]:
# Baseline 1: Linear Discriminant Analysis used directly as the classifier,
# wrapped in a one-step pipeline.
lda = LinearDiscriminantAnalysis()
pipe = Pipeline(steps=[('DA', lda)])

# fit() returns the fitted pipeline, so training and prediction are chained.
y_pred = pipe.fit(X_train, y_train).predict(X_test)

print("LDA Accuracy: ", accuracy_score(y_true=y_test, y_pred=y_pred))

LDA Accuracy:  0.8353184878301398


In [19]:
# Baseline 2: Quadratic Discriminant Analysis used directly as the classifier,
# wrapped in a one-step pipeline.
qda = QuadraticDiscriminantAnalysis()
pipe = Pipeline(steps=[('DA', qda)])

# fit() returns the fitted pipeline, so training and prediction are chained.
y_pred = pipe.fit(X_train, y_train).predict(X_test)

print("QDA Accuracy: ", accuracy_score(y_true=y_test, y_pred=y_pred))


QDA Accuracy:  0.8503366131538063


---
LDA (Transformation) with KNN (Classification)

In [28]:
# LDA is the transformation step; its output feeds a k-nearest-neighbours classifier.
lda = LinearDiscriminantAnalysis()
knnc = KNeighborsClassifier()

pipe = Pipeline([
    ('DA', lda),
    ('KNNC', knnc)
])

# Candidate neighbourhood sizes k = 1..8 for the 'KNNC' step.
params = {
    'KNNC__n_neighbors': np.arange(1,9)
}

# Shuffled, stratified 5-fold cross-validation with a fixed seed for reproducibility.
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=24)


# Grid search over k on the training data, scored by accuracy.
gcv = GridSearchCV(pipe, param_grid=params, cv=kfold, scoring='accuracy')
gcv.fit(X_train, y_train)

# Evaluate the best pipeline found by the search on the held-out test set.
y_pred = gcv.best_estimator_.predict(X_test)

# Output best parameters and scores
print("Best Parameters:", gcv.best_params_)
print("Best Score:", gcv.best_score_)

# The label names the model actually evaluated here (LDA + KNN), not the
# plain LDA baseline from the earlier cell.
print("LDA + KNN Accuracy: ",accuracy_score(y_test,y_pred))

Best Parameters: {'KNNC__n_neighbors': 7}
Best Score: 0.8761092613145888
LDA Accuracy:  0.875712066286898
