In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import ConfusionMatrixDisplay
try:
    # plot_confusion_matrix was removed in scikit-learn 1.2; guard the import so
    # the notebook still starts on current releases.
    from sklearn.metrics import plot_confusion_matrix
except ImportError:
    plot_confusion_matrix = None

import matplotlib.pyplot as plt


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for folder, _subfolders, files in os.walk('/kaggle/input'):
    # Print the full path of every file found below the input directory.
    full_paths = (os.path.join(folder, name) for name in files)
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the minerals database from the Kaggle input directory and preview the first rows.
minerals_csv_path = '../input/comprehensive-database-of-minerals/Minerals_Database.csv'
df = pd.read_csv(minerals_csv_path)
df.head()

In [None]:
# Keep only the rows whose 'Refractive Index' is non-zero.
# .copy() turns the selection into an independent frame, so the column that is
# assigned to `df` further down does not raise pandas' SettingWithCopyWarning.
df = df[df['Refractive Index'] != 0].copy()

In [None]:
# Number of rows left after the filter above.
df.shape[0]

In [None]:
# Collect the columns whose only distinct value is 0 (they are removed from the features later).
def _is_all_zero(column):
    """Return a truthy value when `column` holds exactly one distinct value and that value equals 0."""
    distinct_values = column.unique()
    return len(distinct_values) == 1 and distinct_values[0] == 0


features_with_all_zeroes = [name for name in df.columns if _is_all_zero(df[name])]
print(features_with_all_zeroes)

In [None]:
# Build the array of column names used as training features.
# The first two columns (Unnamed: 0 and Name) are skipped by position.
candidate_features = np.asarray(df.columns)[2:]

# Leave out the all-zero columns found earlier, plus the target column.
excluded_features = set(features_with_all_zeroes) | {'Refractive Index'}
keep_mask = np.array([name not in excluded_features for name in candidate_features], dtype=bool)

# selected_features now holds only the meaningful, non-target columns
selected_features = candidate_features[keep_mask]

In [None]:
# Find features that are strongly correlated with each other and drop the redundant ones.

# Correlate numeric columns only: pandas >= 2.0 no longer ignores text columns
# in DataFrame.corr() and raises on them instead.
df_corr = df.select_dtypes(include='number').corr()

# Columns that do not take part in the comparison:
#  - anything containing 'Unnamed' or 'Density' (skipped by substring, as before);
#  - the target columns, so a feature is never discarded just because it
#    correlates with what the model is supposed to predict.
target_columns = {'Refractive Index', 'refractive_index_category'}
candidate_columns = [
    column for column in df_corr.columns
    if 'Unnamed' not in column
    and 'Density' not in column
    and column not in target_columns
]

CORRELATION_THRESHOLD = 0.85
correlated_features = set()

# Visit every unordered pair once. Scanning the full symmetric matrix flags both
# members of each correlated pair, which removes the shared information entirely;
# here only the later column of a correlated pair is discarded.
for position, col_feature in enumerate(candidate_columns):
    for row_feature in candidate_columns[:position]:
        if df_corr.loc[row_feature, col_feature] > CORRELATION_THRESHOLD:
            correlated_features.add(col_feature)
            break
print(correlated_features)

# remove them from the selected features (kept as an object array of column names)
selected_features = np.asarray(
    [feature for feature in selected_features if feature not in correlated_features],
    dtype=object,
)


In [None]:
# target preparation:
# create a new category column by dividing the refractive index into several classes: this column will be the target of the classifier

def compute_refr_index_category(ri):
    """Map a refractive index to the number of its 0.5-wide bin.

    Bins are half-open: [0, 0.5) -> 0, [0.5, 1) -> 1, ..., [3, 3.5) -> 6.
    A value outside [0, 3.5) (NaN included) belongs to no bin and yields None.
    """
    # Written as a negation so that NaN, for which every comparison is False,
    # is rejected together with negative values.
    if not ri >= 0:
        return None
    upper_bounds = (0.5, 1, 1.5, 2, 2.5, 3, 3.5)
    for category, upper_bound in enumerate(upper_bounds):
        if ri < upper_bound:
            return category
    return None
    
# Derive the classifier target: the category of each row's refractive index.
refractive_index_categories = df.apply(
    lambda mineral: compute_refr_index_category(mineral['Refractive Index']),
    axis=1,
)
df['refractive_index_category'] = refractive_index_categories

In [None]:
# Inspect how the rows are distributed across refractive_index_category.
# Classes 1, 5 and 6 have very few samples, so their rows will be removed.
# What remains is still highly unbalanced (categories 2 and 4 have far fewer samples than 3).

# Only categories 2, 3 and 4 will be classified.
rows_by_category = df.groupby('refractive_index_category')
rows_by_category.count()

In [None]:
# Drop the sparsely populated classes: keep the categories strictly between 1 and 5.
category = df['refractive_index_category']
is_kept_class = category.gt(1) & category.lt(5)
df = df[is_kept_class]

In [None]:
# Train a random forest on the selected features and report its accuracy.

X = df[selected_features]
Y = df['refractive_index_category']

# Stratified 70/30 split so every class keeps its proportion in both subsets.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=0, stratify=Y)

# Standardise the features. The scaler is fitted on the training split only, and
# its output is now kept: the transformed arrays used to be discarded, so the
# model was trained and scored on unscaled data. Results are wrapped back into
# DataFrames to preserve the column names and row index.
scaler = StandardScaler()
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns, index=X_test.index)

classifier = RandomForestClassifier(n_estimators=20, min_samples_split=9, max_depth=5, class_weight='balanced', random_state=0)
classifier.fit(X_train, y_train)

# (accuracy on the training split, accuracy on the test split)
classifier.score(X_train, y_train), classifier.score(X_test, y_test)

In [None]:
# Row-normalised confusion matrix on the training split.
# ConfusionMatrixDisplay.from_estimator replaces plot_confusion_matrix, which was
# deprecated in scikit-learn 1.0 and removed in 1.2.
train_display = ConfusionMatrixDisplay.from_estimator(classifier, X_train, y_train, cmap=plt.cm.Blues, normalize='true')
train_display.ax_.set_title('Confusion matrix (train)');

In [None]:
# Row-normalised confusion matrix on the test split.

# see how the classifier has some trouble distinguishing between classes 2 and 3

# ConfusionMatrixDisplay.from_estimator replaces plot_confusion_matrix, which was
# deprecated in scikit-learn 1.0 and removed in 1.2.
test_display = ConfusionMatrixDisplay.from_estimator(classifier, X_test, y_test, cmap=plt.cm.Blues, normalize='true')
test_display.ax_.set_title('Confusion matrix (test)');

# Conclusions

This notebook is only a starting point for evaluating feasibility.

Performance may have been limited by the small size of the dataset (only 588 rows have a valid "Refractive Index" value).

Adding more data to the "Refractive Index" column could lead to more interesting results.