In [1]:
trainFile = "./Resources/rocks_color.csv"

DIR = "/Users/sarah/Google Drive/Sem VII/RP Investigaci√≥n/Rock Classifier 4 copy"
IMAGE_SIZE = (512, 512)

CLUSTERS = 4

In [2]:
# 1. LOAD DATA
import numpy as np
data = np.loadtxt(trainFile, delimiter=",")

#data = data[data[:,-1] != 0]
#data = data[data[:,-1] != 1]
#data = data[data[:,-1] != 2]
#data = data[data[:,-1] != 3]

classes = np.unique(data[:,-1])
print('Rock types:', classes)

x = data[:, :-1]
y = data[:, -1]
print(x.shape, y.shape)

Rock types: [0. 1. 2. 3.]
(81, 16) (81,)


In [3]:
# 2. SPLIT DATA
from sklearn.model_selection import train_test_split as splitter
xtrain, xtest,ytrain,ytest = splitter(x,y,train_size=0.7,random_state=42)

In [4]:
# 3. NORMALIZE DATA
from sklearn.preprocessing import MinMaxScaler
normalizer = MinMaxScaler()
xtrainNorm = normalizer.fit_transform(xtrain)
xtestNorm = normalizer.fit_transform(xtest)

In [5]:
# 4. TRAINING MODEL
#   Logistic Regression
from sklearn.linear_model import LogisticRegression
logReg = LogisticRegression(solver='lbfgs')
logReg.fit(xtrainNorm, ytrain)

LogisticRegression()

In [6]:
#   KNN
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(xtrainNorm, ytrain)

KNeighborsClassifier(n_neighbors=3)

In [7]:
# 5. TESTING MODEL
from sklearn import metrics

# Logistic Regression
print("\n<<MODELS REPORT LLOG. REGRESSION>>")
print("Logistic Regression:",logReg.score(xtestNorm, ytest))

predictions = logReg.predict(xtestNorm)
print("Logistic Regression Model accuracy:", metrics.accuracy_score(ytest, predictions))
print(metrics.classification_report(ytest,predictions))


<<MODELS REPORT LLOG. REGRESSION>>
Logistic Regression: 0.4
Logistic Regression Model accuracy: 0.4
              precision    recall  f1-score   support

         0.0       0.38      0.30      0.33        10
         1.0       0.38      1.00      0.55         3
         2.0       0.00      0.00      0.00         8
         3.0       0.44      1.00      0.62         4

    accuracy                           0.40        25
   macro avg       0.30      0.57      0.37        25
weighted avg       0.27      0.40      0.30        25



  _warn_prf(average, modifier, msg_start, len(result))


In [8]:
# KNN
print("\n<<MODEL REPORT KNN>>")
print("KNN:",knn.score(xtestNorm, ytest))

predictions = knn.predict(xtestNorm)
print("KNN Model accuracy:", metrics.accuracy_score(ytest, predictions))
print(metrics.classification_report(ytest,predictions))


<<MODEL REPORT KNN>>
KNN: 0.8
KNN Model accuracy: 0.8
              precision    recall  f1-score   support

         0.0       0.82      0.90      0.86        10
         1.0       0.67      0.67      0.67         3
         2.0       1.00      0.62      0.77         8
         3.0       0.67      1.00      0.80         4

    accuracy                           0.80        25
   macro avg       0.79      0.80      0.77        25
weighted avg       0.83      0.80      0.80        25



# Make predictions

In [9]:
# Libraries
import tensorflow as tf
import cv2, os
from PIL import Image, ImageOps
import numpy as np
from itertools import repeat

def load_images_from_folder(folder, size):
    images = []
    for filename in os.listdir(folder):
        if(not filename.lower().endswith(('.png', '.jpg', '.jpeg', '.tiff', '.bmp', '.gif'))):
            print(filename, 'file removed')
            continue
        img = Image.open(os.path.join(folder,filename))
        fit_and_resized_image = ImageOps.fit(img, size, Image.ANTIALIAS)
        img = np.array(fit_and_resized_image)
        img = img[...,:3]
        if img is not None:
            images.append(img)
    return images

def load_images_from_directory(path, size):
    paths = os.listdir(path)
    x = []; y = []; lbl_dict = {}; lbl_num = 0; counts = []
    for folder in paths:
        if(os.path.isfile(folder)):#  or folder == 'granodiorite'
            print(folder, 'file removed')
            continue
        images = load_images_from_folder(os.path.join(path, folder), size)
        lbl_nums = list(repeat(lbl_num, len(images)))
        x.extend(images)
        y.extend(lbl_nums)
        counts.append(len(lbl_nums))
        lbl_dict[lbl_num] = folder
        lbl_num += 1
    return (x, y, lbl_dict, counts)

In [10]:
x_pred, y_pred, labels, counts = load_images_from_directory(DIR, IMAGE_SIZE)

print(list(labels.values()), counts)
print('Samples shape:', np.shape(x_pred))

.DS_Store file removed
.DS_Store file removed
.DS_Store file removed
.DS_Store file removed
.DS_Store file removed
['granodiorite', 'granite', 'diorite', 'gabbro'] [3, 3, 3, 3]
Samples shape: (12, 512, 512, 3)


In [11]:
def get_dominant_colors(cluster, centroids):
    # Get the number of different clusters, create histogram, and normalize
    lbls = np.arange(0, len(np.unique(cluster.labels_)) + 1)
    (hist, _) = np.histogram(cluster.labels_, bins = lbls)
    hist = hist.astype("float"); hist /= hist.sum()

    # Color and frecuency list by the iteration through each cluster
    features = []
    colors = sorted([(percent, color) for (percent, color) in zip(hist, centroids)])
    
    for (percent, color) in colors:
        features.extend(color)
        features.append(percent)

    return features

In [12]:
from sklearn.cluster import KMeans

extracted_colors = []
for img in x_pred:
    reshape = img.reshape((img.shape[0] * img.shape[1], 3))
    cluster = KMeans(n_clusters=CLUSTERS).fit(reshape)
    features = get_dominant_colors(cluster, cluster.cluster_centers_)
    extracted_colors.append(features)

print(extracted_colors[0])
print(np.shape(extracted_colors))

[106.01729909061288, 102.64074765247786, 94.47999052104623, 0.1292266845703125, 143.7068311195446, 140.20458162453326, 134.65337883332313, 0.24884033203125, 201.65234386064296, 199.23710522588868, 197.20875230137375, 0.2700042724609375, 173.9617211420905, 170.9813690804635, 167.55651487119565, 0.3519287109375]
(12, 16)


In [13]:
from PIL import Image

name = 61
#for img in x_pred:
 #   Image.fromarray(img).save(str(name)+".png")
  #  name+=1

In [14]:
classes = {0: 'granodiorite', 1: 'granite', 2: 'diorite', 3: 'gabbro'}
print(labels)

{0: 'granodiorite', 1: 'granite', 2: 'diorite', 3: 'gabbro'}


In [15]:
# 3. NORMALIZE DATA
from sklearn.preprocessing import MinMaxScaler
tt = MinMaxScaler()
color_norm = tt.fit_transform(extracted_colors)

In [16]:
print(labels[y_pred[0]])
print(classes[int(knn.predict([extracted_colors[0]])[0])])

granodiorite
granodiorite


In [17]:
print(knn.score(extracted_colors, y_pred))

0.25


In [18]:
output_data = logReg.predict_proba(extracted_colors)

real = []
predicted = []
precision = []

for i in range(0, len(output_data)):
    real.append(labels[y_pred[i]])
    predicted.append(classes[int(knn.predict([extracted_colors[i]]))])
    precision.append(max(output_data[i]))
#    print(classes[y_pred[i]], classes[np.argmax(output_data[i])], max(output_data[i]))

In [19]:
import pandas as pd

# get the list of tuples from two lists.  
# and merge them by using zip().  
list_of_tuples = list(zip(real, predicted, precision))  
  
# Converting lists of tuples into  
# pandas Dataframe.  
df = pd.DataFrame(list_of_tuples, columns = ['Real class', 'Predicted class', 'Precision']) 

df

Unnamed: 0,Real class,Predicted class,Precision
0,granodiorite,granodiorite,1.0
1,granodiorite,granodiorite,1.0
2,granodiorite,granodiorite,1.0
3,granite,granodiorite,0.989216
4,granite,granodiorite,1.0
5,granite,granodiorite,1.0
6,diorite,granodiorite,1.0
7,diorite,granodiorite,1.0
8,diorite,granodiorite,1.0
9,gabbro,granodiorite,1.0


In [20]:
granite = 0
diorite = 0
gabbro = 0

print('<< Well predicted >>')

for i in range(len(real)):
    if(real[i] == predicted[i]):
        if(real[i] == 'granite'):
            granite+=1
        elif(real[i] == 'diorite'):
            diorite+=1
        elif(real[i] == 'gabbro'):
            gabbro+=1
print('granite', granite, '/ 20')
print('diorite', diorite, '/ 20')
print('gabbro', gabbro, '/ 20')

<< Well predicted >>
granite 0 / 20
diorite 0 / 20
gabbro 0 / 20
