Import data

In [None]:
import numpy as np
import pandas as pd
# Relative path to the minerals CSV file.
path = "../input/comprehensive-database-of-minerals/Minerals_Database.csv"
# Read the whole table into a DataFrame; later cells refer to it as `minerals_df`.
minerals_df = pd.read_csv(filepath_or_buffer=path)

Preprocess data:
1. Extract features and target.
2. Scale features.
3. One-hot encode the target classes.
4. Get class weights and convert to sample weights.

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from keras.utils import to_categorical
from sklearn.utils.class_weight import compute_class_weight

class MineralsDatasets:
    """Container for one train/test split and the training sample weights.

    The constructor stores its arguments as-is (no copying, no validation):

    * ``X_train`` / ``X_test``  <- ``X_train_scaled`` / ``X_test_scaled``
    * ``y_train`` / ``y_test``  <- ``y_train_categorical`` / ``y_test_categorical``
    * ``sample_weights``        <- ``sample_weights``
    """

    def __init__(self, X_train_scaled, X_test_scaled, y_train_categorical, y_test_categorical, sample_weights):
        # Feature sets.
        self.X_train, self.X_test = X_train_scaled, X_test_scaled
        # Target sets.
        self.y_train, self.y_test = y_train_categorical, y_test_categorical
        # Weights passed alongside the training set.
        self.sample_weights = sample_weights
        
def getFeaturesAndTargetData(minerals_df, columns_to_ignore, target_column):
    """Split a DataFrame into a feature frame and a target column.

    Args:
        minerals_df: DataFrame holding both the feature columns and the target.
        columns_to_ignore: Iterable of column labels to leave out of the
            features (list, tuple, Index, ...). It may or may not already
            contain ``target_column``; the argument itself is not modified.
        target_column: Label of the column to use as the target.

    Returns:
        A ``(X, y)`` tuple: ``X`` is ``minerals_df`` without the ignored
        columns and without the target column; ``y`` is the target column.

    Raises:
        KeyError: If any requested column is missing from ``minerals_df``.
    """
    # Copy into a fresh list so non-list iterables work and the caller's
    # object is never mutated.
    excluded = list(columns_to_ignore)
    # Only add the target when it is not already listed, so the same label is
    # not passed to drop() twice.
    if target_column not in excluded:
        excluded.append(target_column)
    X = minerals_df.drop(columns=excluded)
    y = minerals_df[target_column]
    return X, y

def scaleFeatures(train_data, test_data):
    """Scale both feature sets with a StandardScaler fitted on the training set.

    The scaler is fitted on ``train_data`` only, so no statistics from
    ``test_data`` influence the transformation.

    Returns:
        A ``(train_scaled, test_scaled)`` tuple of transformed copies.
    """
    fitted_scaler = StandardScaler().fit(train_data)
    return (
        fitted_scaler.transform(train_data, copy=True),
        fitted_scaler.transform(test_data, copy=True),
    )

def categorizeTargets(train_targets, test_targets):
    """One-hot encode the train and test labels using a shared class count.

    Args:
        train_targets: Integer-valued class labels of the training set.
        test_targets: Integer-valued class labels of the test set.

    Returns:
        A ``(y_train_categorical, y_test_categorical)`` tuple of one-hot
        matrices that have the same number of columns.
    """
    # to_categorical infers the class count as max(label) + 1 when it is not
    # given. Inferring it separately per split yields matrices of different
    # widths whenever one split lacks the highest class, so derive a single
    # count from both splits and pass it explicitly.
    num_classes = int(max(np.max(train_targets), np.max(test_targets))) + 1
    y_train_categorical = to_categorical(train_targets, num_classes=num_classes)
    y_test_categorical = to_categorical(test_targets, num_classes=num_classes)
    return y_train_categorical, y_test_categorical

def getWeights(y_categorical):
    """Compute one 'balanced' weight per sample from its class label.

    Args:
        y_categorical: Either a 1-D array of class labels (what the caller in
            this file passes) or a 2-D one-hot matrix; a one-hot matrix is
            reduced to labels with ``argmax`` first.

    Returns:
        1-D numpy array in which element ``i`` is the balanced class weight
        of the class that sample ``i`` belongs to.
    """
    labels = np.asarray(y_categorical)
    if labels.ndim > 1:
        # A single column is already a label vector; otherwise treat the rows
        # as one-hot vectors and recover the label index.
        labels = labels.ravel() if labels.shape[-1] == 1 else labels.argmax(axis=-1)

    # `classes` is sorted and `class_index[i]` is the position of labels[i]
    # within it; compute_class_weight returns weights in the order of `classes`.
    classes, class_index = np.unique(labels, return_inverse=True)
    class_weights = compute_class_weight(class_weight='balanced', classes=classes, y=labels)

    # The previous lookup used np.argmax on each scalar label, which is always
    # 0, so every sample received the weight of the first class. Index by the
    # label's position instead.
    sample_weights = class_weights[class_index.ravel()]
    return sample_weights
    
def getSplittedAndNormalizedData(minerals_df):
    """Turn the raw minerals table into a ready-to-train MineralsDatasets.

    Steps: drop the non-chemical columns and separate the target, split into
    train/test with ``train_test_split`` (called without extra options), scale
    the features, encode the targets, and derive sample weights from the
    un-encoded training labels.
    """
    TARGET = "Crystal Structure"
    # Columns that are not used as model inputs.
    NON_CHEMICAL_FEATURES = [
        'Unnamed: 0', 'Name', 'Crystal Structure', 'Mohs Hardness',
        'Diaphaneity', 'Specific Gravity', 'Optical', 'Refractive Index',
        'Dispersion', 'count', 'Molar Mass', 'Molar Volume',
        'Calculated Density',
    ]

    features, labels = getFeaturesAndTargetData(minerals_df, NON_CHEMICAL_FEATURES, TARGET)
    features_train, features_test, labels_train, labels_test = train_test_split(features, labels)

    scaled_train, scaled_test = scaleFeatures(features_train, features_test)
    encoded_train, encoded_test = categorizeTargets(labels_train, labels_test)
    train_weights = getWeights(labels_train.to_numpy())

    return MineralsDatasets(scaled_train, scaled_test, encoded_train, encoded_test, train_weights)

Define neural network builder

In [None]:
from keras import Sequential
from keras.layers import Input, Dense, Dropout


class HiddenLayerSpecification:
    """Description of one hidden layer: its unit count and activation."""

    def __init__(self, size, activationFunction):
        # Stored unchanged; buildModel passes them to a Dense layer.
        self.size, self.activationFunction = size, activationFunction

class ModelSpecification:
    """Everything buildModel needs to construct and compile a network.

    All constructor arguments are stored unchanged under attributes of the
    same name.
    """

    def __init__(self, inputLength, numOfClasses, hiddenLayersSpecifications, optimizer, lossFunction, outputActivation, metrics, dropout):
        # Network dimensions.
        self.inputLength, self.numOfClasses = inputLength, numOfClasses
        # Hidden layers, in the order they are stacked.
        self.hiddenLayersSpecifications = hiddenLayersSpecifications
        # Compilation settings.
        self.optimizer, self.lossFunction = optimizer, lossFunction
        # Activation of the final layer.
        self.outputActivation = outputActivation
        self.metrics = metrics
        # Rate given to the Dropout layer after each hidden layer.
        self.dropout = dropout

def buildModel(spec):
    """Build and compile a Sequential classifier from a ModelSpecification.

    Args:
        spec: Object exposing ``inputLength``, ``hiddenLayersSpecifications``
            (each item with ``size`` and ``activationFunction``), ``dropout``,
            ``numOfClasses``, ``outputActivation``, ``optimizer``,
            ``lossFunction`` and ``metrics``.

    Returns:
        The compiled model: an input layer, then a Dense + Dropout pair per
        hidden-layer specification, then a Dense output layer.
    """
    model = Sequential()
    # The shape must be a tuple. The earlier `Input(spec.inputLength,)` had the
    # comma outside the parentheses and therefore passed a bare int.
    model.add(Input(shape=(spec.inputLength,)))
    for hiddenLayerSpec in spec.hiddenLayersSpecifications:
        model.add(Dense(hiddenLayerSpec.size, activation=hiddenLayerSpec.activationFunction))
        # The same dropout rate follows every hidden layer.
        model.add(Dropout(spec.dropout))
    model.add(Dense(spec.numOfClasses, activation=spec.outputActivation))
    model.compile(optimizer=spec.optimizer,
                  loss=spec.lossFunction,
                  metrics=spec.metrics)
    return model

Fit and evaluate model.

In [None]:
import tensorflow as tf
from keras.metrics import Precision, Recall, TopKCategoricalAccuracy
from collections import Counter

class Result:
    """Outcome of one run: the datasets used, the model, and its scores."""

    def __init__(self, data, model, score):
        # All three are stored unchanged.
        self.data, self.model, self.score = data, model, score

def getSpecification(data=None):
    """Build the ModelSpecification used for the minerals classifier.

    Args:
        data: Optional datasets object whose ``X_train.shape[1]`` gives the
            input length and whose ``y_test.shape[1]`` gives the class count.
            When omitted, a fresh split of the module-level ``minerals_df`` is
            prepared with ``getSplittedAndNormalizedData``.

    Returns:
        A ModelSpecification with four ReLU hidden layers, a softmax output,
        the Adam optimizer, categorical cross-entropy loss, a dropout rate of
        0.7, and accuracy/recall/precision metrics.
    """
    # This function used to read a global named `data` that the visible code
    # never defines, which raises NameError. Resolve the datasets here instead.
    if data is None:
        data = getSplittedAndNormalizedData(minerals_df)

    inputLength = data.X_train.shape[1]
    hiddenLayersSpecifications = [HiddenLayerSpecification(inputLength, "relu"),
                                  HiddenLayerSpecification(1000, "relu"),
                                  # NOTE(review): this layer carried a bare "#rem" marker;
                                  # its meaning is unclear, so the layer is kept as-is.
                                  HiddenLayerSpecification(100, "relu"),
                                  HiddenLayerSpecification(100, "relu")]
    return ModelSpecification(inputLength = inputLength,
                               numOfClasses = data.y_test.shape[1],
                               hiddenLayersSpecifications = hiddenLayersSpecifications,
                               optimizer = "adam",
                               lossFunction = "categorical_crossentropy",
                               outputActivation = "softmax",
                               metrics = ["accuracy", # how often predictions equal labels
                                          Recall(), # true_positives / (true_positives + false_negatives)
                                          Precision()], # true_positives / (true_positives + false_positives)
                               dropout=0.7)

def fitOnNewModelAndData(minerals_df, modelSpecification, epochs):
    """Train a newly built model on a new split and score it on the test set.

    Each call prepares its own datasets from ``minerals_df`` and builds its
    own model from ``modelSpecification``. Training is silent (``verbose=0``)
    and uses the datasets' sample weights.

    Returns:
        A Result holding the datasets, the trained model, and the dict
        returned by ``model.evaluate(..., return_dict=True)``.
    """
    datasets = getSplittedAndNormalizedData(minerals_df)
    network = buildModel(modelSpecification)
    network.fit(
        datasets.X_train,
        datasets.y_train,
        sample_weight=datasets.sample_weights,
        epochs=epochs,
        verbose=0,
    )
    test_scores = network.evaluate(datasets.X_test, datasets.y_test, return_dict=True)
    return Result(datasets, network, test_scores)

In [None]:
def get_avg_metric(results, metricName):
    """Return the mean of ``score[metricName]`` across all given results."""
    per_run_values = [result.score[metricName] for result in results]
    return np.mean(per_run_values)

# One specification is built up front and shared by every run below.
ms = getSpecification()
results = [
    fitOnNewModelAndData(minerals_df, ms, 10)  # 10 epochs per run
    for _run in range(10)  # 10 runs, each on its own split and model
]
# Average the test accuracy over all runs.
avg_accuracy = get_avg_metric(results, "accuracy")
print('Avg accuracy: {}.'.format(avg_accuracy))