In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
# Show what is available under ../input so the data file name can be checked.
input_dir_entries = os.listdir("../input")
print(input_dir_entries)

# Any results you write to the current directory are saved as output.

**Data Exploration**

In [None]:
# Read the glass CSV from the input directory into a DataFrame.
glass_csv_path = "../input/glass.csv"
data = pd.read_csv(glass_csv_path)

In [None]:
# Preview the first five rows (the default row count of head()).
data.head(n=5)

The dataset consists of ten columns. Nine of them are numerical features; the last one is the type of glass. In this notebook, we are going to build a classifier for the type of glass.

In [None]:
# Summary statistics for every column, displayed as the cell output.
summary_stats = data.describe()
summary_stats

The numerical features differ widely in their scale and spread, which necessitates the use of a scaler later on.

In [None]:
# Number of samples per value of the "Type" column.
type_counts = data["Type"].value_counts()
type_counts

There are six types of glass, labeled with numbers 1-7 (excluding 4).

In [None]:
# (rows, columns) of the loaded frame.
n_rows, n_cols = data.shape
(n_rows, n_cols)

In [None]:
# One scatter plot of "Type" against each feature column.
# The same relplot call used to be repeated once per column; a single loop
# keeps the plots consistent and makes the list of columns easy to change.
# The column names are written out here so that this cell does not rely on
# any variable defined further down the notebook.
for feature in ["RI", "Na", "Mg", "Al", "Si", "K", "Ca", "Ba", "Fe"]:
    sns.relplot(x=feature, y="Type", data=data)

In [None]:
# Scatter-plot matrix built from the whole `data` frame. No column subset is
# passed, so the "Type" column is handed to the grid together with the
# features.
g = sns.PairGrid(data)
# Draw a plain matplotlib scatter plot in every cell of the grid.
g.map(plt.scatter)

Judging by eye from the plots above, there does not seem to be any obvious correlation between the numerical features; a correlation matrix (`data.corr()`) would be needed to confirm this.

**Preparation of the Training and the Test Data**

In [None]:
# Split the frame into the feature matrix (everything except "Type")
# and the target vector (the "Type" column).
X = data.drop(columns=["Type"])
y = data["Type"]

In [None]:
# Names of the feature columns that are routed through the numerical
# transformer of the preprocessing step.
numerical_features = [
    "RI",
    "Na", "Mg", "Al", "Si",
    "K", "Ca", "Ba", "Fe",
]

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split
# repeatable between runs.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

**Model Construction and Evaluation**

In [None]:
# Preprocessing applied to the numerical columns: a single standard-scaling step.
scaling_steps = [("std_scaler", StandardScaler())]
numerical_transformer = Pipeline(steps=scaling_steps)

In [None]:
# Route the columns listed in `numerical_features` through the numerical
# transformer.
column_routes = [("num", numerical_transformer, numerical_features)]
preprocessor = ColumnTransformer(transformers=column_routes)

In [None]:
def evaluate_classifier(label, classifier):
    """Fit `classifier` behind the shared preprocessor and print test results.

    A pipeline consisting of the notebook-level `preprocessor` followed by
    `classifier` is fitted on `X_train` / `y_train`. The accuracy on
    `X_test` / `y_test` and the corresponding confusion matrix are printed.

    Parameters
    ----------
    label : str
        Text printed in front of the accuracy value.
    classifier : estimator
        Classifier instance used as the final step of the pipeline.

    Returns
    -------
    None
        The results are only printed.
    """
    pipeline = Pipeline(steps=[
        ("preprocessor", preprocessor),
        ("classifier", classifier)
    ])
    pipeline.fit(X_train, y_train)
    # Predict once and reuse the predictions for both metrics. Calling
    # pipeline.score() as well would run a second prediction pass over X_test.
    y_pred = pipeline.predict(X_test)
    # Accuracy = fraction of test samples whose predicted label matches.
    accuracy = np.mean(y_pred == np.asarray(y_test))
    print(label, ":", accuracy)
    print(confusion_matrix(y_test, y_pred))

Let us evaluate a bunch of classifiers, with their default settings.

In [None]:
# Baseline comparison: each model is evaluated with its default settings;
# only the seed is pinned for the estimators that are given one.
default_models = [
    ("Decision Tree", DecisionTreeClassifier(random_state=42)),
    ("K Neighbors", KNeighborsClassifier()),
    ("Support Vector Machine", SVC(random_state=42)),
    ("Random Forest", RandomForestClassifier(random_state=42)),
    ("Multi-Layer Perceptron", MLPClassifier(random_state=42)),
]
for model_label, model in default_models:
    evaluate_classifier(model_label, model)

The random forest and multi-layer perceptron classifiers seem to be the most promising. Let us try a bunch of values for the n_estimators parameter for the random forest classifier.

In [None]:
# Sweep the number of trees. The seed stays fixed, so n_estimators is the
# only thing that changes between runs. A loop replaces six copies of the
# same call; the printed labels are unchanged.
for n_trees in (5, 10, 20, 50, 100, 200):
    evaluate_classifier("Random Forest, n={}".format(n_trees),
                        RandomForestClassifier(random_state=42,
                                               n_estimators=n_trees))

The random forest classifier with n_estimators=50 gives the best score on the test split, 0.86.

Let us now try out a bunch of multi-layer perceptron classifiers with different parameters.

In [None]:
# Sweep the width of a single hidden layer. The seed stays fixed, so
# hidden_layer_sizes is the only thing that changes between runs. A loop
# replaces four copies of the same call; the printed labels are unchanged.
for layer_width in (20, 50, 100, 200):
    evaluate_classifier(
        "MLP Classifier, hidden_layer_size={}".format(layer_width),
        MLPClassifier(random_state=42, hidden_layer_sizes=(layer_width,)))

The default size for the hidden layer, 100, works out well.

After some fiddling, I found the following parameters for an MLP classifier that give a score of 0.86: three hidden layers with 80 neurons each, and alpha=1. The number of neurons seems too high for this job, so the random forest model appears to be the more suitable choice.

In [None]:
# Hand-tuned network: three hidden layers of 80 units each, with alpha=1.
tuned_mlp = MLPClassifier(
    random_state=42,
    hidden_layer_sizes=(80, 80, 80),
    alpha=1,
)
evaluate_classifier("MLP Classifier", tuned_mlp)