In [1]:
CSV_FILE = "./Resources/All.csv"
classes_dict = {0:'Diorite', 1:'Gabbro', 2:'Granite', 3:'Granodiorite'}

#### 1. LOAD DATA

In [2]:
import numpy as np
data = np.loadtxt(CSV_FILE, delimiter=",")

#data = data[data[:,-1] != 0]
#data = data[data[:,-1] != 1]
#data = data[data[:,-1] != 2]
#data = data[data[:,-1] != 3]

x = data[:, :-1]
y = data[:, -1]
print('x shape:', x.shape, 'y shape:', y.shape)

classes, indices, counts = np.unique(y, return_counts=True, return_index=True)

print('Classes:', classes)
print('Counts:', counts)
print('Indices:', indices)
print('Classes dictionary:', classes_dict)

x shape: (562, 16) y shape: (562,)
Classes: [0. 1. 2. 3.]
Counts: [151 137 159 115]
Indices: [274 425 115   0]
Classes dictionary: {0: 'Diorite', 1: 'Gabbro', 2: 'Granite', 3: 'Granodiorite'}


#### 2. SPLIT DATA

In [3]:
# Val = 20%, test = 16%, train = 64%
from sklearn.model_selection import train_test_split as splitter
xtt, xval, ytt, yval = splitter(x,y,train_size=0.8,random_state=42)

xtrain, xtest, ytrain, ytest = splitter(xtt,ytt,train_size=0.8,random_state=42)

In [4]:
def get_uniques(arr, name):
    print('<<', name, '>>')
    classes, indices, counts = np.unique(arr, return_counts=True, return_index=True)
    print('Classes:', classes)
    print('Counts:', counts)
    print('Indices:', indices)

In [5]:
get_uniques(ytrain, 'Train')
get_uniques(ytest, 'Test')
get_uniques(yval, 'Val')

<< Train >>
Classes: [0. 1. 2. 3.]
Counts: [ 95  92 101  71]
Indices: [2 1 0 8]
<< Test >>
Classes: [0. 1. 2. 3.]
Counts: [29 20 26 15]
Indices: [ 2  1  0 13]
<< Val >>
Classes: [0. 1. 2. 3.]
Counts: [27 25 32 29]
Indices: [2 0 5 3]


#### 3. NORMALIZE DATA

In [6]:
from sklearn.preprocessing import MinMaxScaler
normalizer = MinMaxScaler()
xtrainNorm = normalizer.fit_transform(xtrain)
xtestNorm = normalizer.fit_transform(xtest)

#### 4. TRAINING MODEL

In [7]:
#   Logistic Regression
from sklearn.linear_model import LogisticRegression
logReg = LogisticRegression(solver='lbfgs')
logReg.fit(xtrainNorm, ytrain)

LogisticRegression()

In [8]:
#   KNN
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(xtrainNorm, ytrain)

KNeighborsClassifier(n_neighbors=3)

#### 5. TESTING MODEL

In [9]:
from sklearn import metrics

In [10]:
# Logistic Regression
print("<<MODELS REPORT LOG. REGRESSION>>")
print("Logistic Regression score:",logReg.score(xtestNorm, ytest))

predictions = logReg.predict(xtestNorm)
print("Logistic Regression Model accuracy:", metrics.accuracy_score(ytest, predictions))
print(metrics.classification_report(ytest,predictions))

<<MODELS REPORT LOG. REGRESSION>>
Logistic Regression score: 0.6
Logistic Regression Model accuracy: 0.6
              precision    recall  f1-score   support

         0.0       0.64      0.48      0.55        29
         1.0       0.58      0.75      0.65        20
         2.0       0.66      0.81      0.72        26
         3.0       0.40      0.27      0.32        15

    accuracy                           0.60        90
   macro avg       0.57      0.58      0.56        90
weighted avg       0.59      0.60      0.58        90



In [11]:
# KNN
print("<<MODEL REPORT KNN>>")
print("KNN score:",knn.score(xtestNorm, ytest))

predictions = knn.predict(xtestNorm)
print("KNN Model accuracy:", metrics.accuracy_score(ytest, predictions))
print(metrics.classification_report(ytest,predictions))

<<MODEL REPORT KNN>>
KNN score: 0.7
KNN Model accuracy: 0.7
              precision    recall  f1-score   support

         0.0       0.68      0.79      0.73        29
         1.0       0.70      0.70      0.70        20
         2.0       0.76      0.73      0.75        26
         3.0       0.64      0.47      0.54        15

    accuracy                           0.70        90
   macro avg       0.69      0.67      0.68        90
weighted avg       0.70      0.70      0.70        90



In [12]:
import pandas as pd

def get_evaluation_dataframe(model, xdata, ydata, classes_dict):
    real = []; predicted = []; accuracy = []
    
    classes, indices, counts = np.unique(ydata, return_counts=True, return_index=True)
    
    well_predicted = {key:0 for key in classes}
    #well_predicted.fromkeys(classes, 0)
    
    probabilities = model.predict_proba(x)
    
    for (features, lbl, probs) in zip(xdata, ydata, probabilities):
        r = int(lbl)
        real.append(classes_dict[r])
        p = int(model.predict([features]))
        predicted.append(classes_dict[p])
        accuracy.append(max(probs))
        if(r == p):
            well_predicted[r] += 1
    
    list_of_tuples = list(zip(real, predicted, accuracy))
    
    i = 0
    for cl in classes:
        idx = int(cl)
        print(classes_dict[idx], well_predicted[idx], counts[i])
        i+=1
    
    df = pd.DataFrame(list_of_tuples, columns = ['Real class', 'Predicted class', 'Accuracy'])
    
    return df

In [13]:
get_evaluation_dataframe(logReg, xval, yval, classes_dict)

Diorite 0 27
Gabbro 15 25
Granite 0 32
Granodiorite 27 29


Unnamed: 0,Real class,Predicted class,Accuracy
0,Gabbro,Gabbro,1.0
1,Gabbro,Gabbro,1.0
2,Diorite,Granodiorite,1.0
3,Granodiorite,Granodiorite,1.0
4,Diorite,Granodiorite,1.0
...,...,...,...
108,Granite,Granodiorite,1.0
109,Diorite,Granodiorite,1.0
110,Granodiorite,Granodiorite,1.0
111,Gabbro,Granodiorite,1.0


In [14]:
get_evaluation_dataframe(knn, xval, yval, classes_dict)

Diorite 0 27
Gabbro 7 25
Granite 32 32
Granodiorite 0 29


Unnamed: 0,Real class,Predicted class,Accuracy
0,Gabbro,Gabbro,1.000000
1,Gabbro,Gabbro,1.000000
2,Diorite,Granite,1.000000
3,Granodiorite,Granite,1.000000
4,Diorite,Granite,1.000000
...,...,...,...
108,Granite,Granite,1.000000
109,Diorite,Granite,1.000000
110,Granodiorite,Granite,1.000000
111,Gabbro,Granite,0.666667
