Imports

In [None]:
import numpy as np
import pandas as pd
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import make_pipeline

Read data

In [None]:
# Relative path to the minerals database CSV.
path = "../input/comprehensive-database-of-minerals/Minerals_Database.csv"
# Load the whole table into a DataFrame.
minerals_df = pd.read_csv(filepath_or_buffer=path)

In [None]:
# Show pandas' descriptive-statistics summary of the loaded table.
minerals_df.describe()

In [None]:
# List the column labels to see which fields the dataset provides.
minerals_df.columns

## Version 1: Chemical formula features only

In [None]:
# Columns excluded from the chemical-formula feature set: identifiers, the
# target label, and the remaining non-formula property columns.
nonChemicalFormulaHeaders = [
    "Unnamed: 0",
    "Name",
    "Crystal Structure",
    "Mohs Hardness",
    "Diaphaneity",
    "Specific Gravity",
    "Optical",
    "Refractive Index",
    "Dispersion",
]

# Target: the crystal-structure label. Features: every column left after
# removing the excluded headers.
y_ch = minerals_df["Crystal Structure"]
X_ch = minerals_df.drop(nonChemicalFormulaHeaders, axis=1)

# Hold out 20% of the rows as a test set; the fixed seed makes the split
# reproducible.
X_ch_train, X_ch_test, y_ch_train, y_ch_test = train_test_split(
    X_ch,
    y_ch,
    test_size=0.2,
    random_state=42,
)

Scale data

In [None]:
# Fit the standardizer on the training rows only, so the test split does not
# contribute to the scaling statistics.
scaler_ch = StandardScaler().fit(X_ch_train)
# copy=True: scale a copy of the training features instead of scaling in place.
X_ch_train_scaled = scaler_ch.transform(X_ch_train, copy=True)

Find best SVM parameters

In [None]:
# Hyperparameter grid for an RBF-kernel SVC: C and gamma are swept over
# powers of ten, with class weights balanced for every candidate.
parameters = {
    "kernel": ["rbf"],
    "C": [0.01, 0.1, 1, 10, 100],
    "gamma": [0.001, 0.01, 0.1, 1, 10, 100],
    "class_weight": ["balanced"],
}
svc_ch = SVC()
# Cross-validation and scoring are left at GridSearchCV's defaults.
grid_ch = GridSearchCV(estimator=svc_ch, param_grid=parameters)
# NOTE(review): the features passed here were standardized using the whole
# training split, so each cross-validation fold's held-out part influenced
# the scaling. Consider wrapping scaler + SVC in a pipeline (make_pipeline is
# imported but unused in the visible code) -- confirm before changing.
grid_ch.fit(X_ch_train_scaled, y_ch_train)

In [None]:
# Print the best estimator found by the search, then its cross-validated
# score, one per line.
print(grid_ch.best_estimator_, grid_ch.best_score_, sep="\n")

Evaluate on the held-out test set

In [None]:
# Reuse the scaler fitted on the training data; it is not refit on the test rows.
X_ch_test_scaled = scaler_ch.transform(X_ch_test)
# Predict crystal structures with the best estimator from the grid search.
result_ch = grid_ch.best_estimator_.predict(X_ch_test_scaled)
# Accuracy = number of matching predictions divided by the number of test rows.
is_correct_ch = result_ch == y_ch_test
print("accuracy: ", sum(is_correct_ch) / len(y_ch_test))

## Version 2: All features

In [None]:
# Columns that are never used as model inputs: identifiers and the target label.
nonFeatureHeaders = [
    "Unnamed: 0",
    "Name",
    "Crystal Structure",
]

# Target: the crystal-structure label. Features: every remaining column.
y_all = minerals_df["Crystal Structure"]
X_all = minerals_df.drop(nonFeatureHeaders, axis=1)

# Hold out 20% of the rows as a test set; the fixed seed makes the split
# reproducible.
X_all_train, X_all_test, y_all_train, y_all_test = train_test_split(
    X_all,
    y_all,
    test_size=0.2,
    random_state=42,
)

Scale data

In [None]:
# Fit the standardizer on the training rows only, so the test split does not
# contribute to the scaling statistics.
scaler_all = StandardScaler().fit(X_all_train)
# copy=True: scale a copy of the training features instead of scaling in place.
X_all_train_scaled = scaler_all.transform(X_all_train, copy=True)

Find best SVM parameters

In [None]:
# Hyperparameter grid for an RBF-kernel SVC: C and gamma are swept over
# powers of ten, with class weights balanced for every candidate.
parameters = {
    "kernel": ["rbf"],
    "C": [0.01, 0.1, 1, 10, 100],
    "gamma": [0.001, 0.01, 0.1, 1, 10, 100],
    "class_weight": ["balanced"],
}
svc_all = SVC()
# Cross-validation and scoring are left at GridSearchCV's defaults.
grid_all = GridSearchCV(estimator=svc_all, param_grid=parameters)
# NOTE(review): the features passed here were standardized using the whole
# training split, so each cross-validation fold's held-out part influenced
# the scaling. Consider wrapping scaler + SVC in a pipeline (make_pipeline is
# imported but unused in the visible code) -- confirm before changing.
grid_all.fit(X_all_train_scaled, y_all_train)

In [None]:
# Print the best estimator found by the search, then its cross-validated
# score, one per line.
print(grid_all.best_estimator_, grid_all.best_score_, sep="\n")

Evaluate on the held-out test set

In [None]:
# Reuse the scaler fitted on the training data; it is not refit on the test rows.
X_all_test_scaled = scaler_all.transform(X_all_test)
# Predict crystal structures with the best estimator from the grid search.
result_all = grid_all.best_estimator_.predict(X_all_test_scaled)
# Accuracy = number of matching predictions divided by the number of test rows.
is_correct_all = result_all == y_all_test
print("accuracy: ", sum(is_correct_all) / len(y_all_test))