### Goal:

#### In this Glass classification dataset, I want to build a Random Forest model to classify the glasses

### Overview of the dataset

In [None]:
### load required packages
import pandas as pd 
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
%matplotlib inline 
import seaborn as sns

In [None]:
### load the dataset and get an overview
df = pd.read_csv('../input/glass.csv')
# DataFrame.info() writes its report itself and returns None, so it is not wrapped in print()
df.info()
print()

# the distribution of categories and visualization
# Counted once (most frequent first) and reused for both the printout and the bar chart.
type_counts = df['Type'].value_counts().sort_values(ascending=False)
print(type_counts) # six categories
print()

type_name_li = type_counts.index
type_value_li = type_counts.values

plt.style.use('ggplot')
fig, axes = plt.subplots(figsize=[15, 15])
bar_positions = np.arange(len(type_name_li))
# align='center' is stated explicitly so each tick label sits under the middle of its bar
axes.bar(bar_positions, type_value_li, color='#2E8b57', width=0.8, align='center')
axes.tick_params(labelsize=15)
axes.set_xticks(bar_positions)
axes.set_xticklabels(type_name_li, fontsize=15)
axes.set_xlabel('Type', fontsize=20)
axes.set_ylabel('Count', fontsize=20)
axes.set_ylim(0, np.max(type_value_li)+5)
axes.set_title('Frequency of each Type', fontsize=25, loc='center')
plt.subplots_adjust(top=0.8)

# the distribution of features
# Every column except the last one (the notebook treats the trailing column as the target).
# .iloc replaces the removed DataFrame.ix indexer.
print(df.iloc[:, :-1].describe())

### EDA on features 

In [None]:
### Per-feature distributions, one panel per Glass Type
col_name_li = df.columns[:-1]
# The set of glass types does not change between features, so it is listed once up front.
glass_types = list(np.unique(df['Type']))

for feature in col_name_li:
    # One figure per feature, laid out as a 2 x 3 grid of panels.
    fig = plt.figure(figsize=[30, 20])
    plt.suptitle(feature + ' by Glass Type', fontsize=25, y=0.94)
    feature_values = df[feature]
    for panel_no, glass_type in enumerate(glass_types, start=1):
        axes = fig.add_subplot(2, 3, panel_no)
        axes.tick_params(labelsize=15)
        # Only the rows belonging to the current glass type are drawn in this panel.
        type_mask = df['Type'] == glass_type
        sns.distplot(feature_values[type_mask].values, color='#2E8B57')
        axes.set_xlabel(glass_type, fontsize=15)
        axes.set_ylabel('Density', fontsize=15)

In [None]:
### Pairplot on features by Glass Type
# sns.pairplot creates and manages its own figure, so no plt.figure() call is made here:
# a separately created figure would stay empty and only add a blank output.
# The trailing semicolon hides the returned grid object's repr in the cell output.
sns.pairplot(df, vars=list(df.columns[:-1]), hue='Type', palette='Paired', diag_kind='kde');

In [None]:
### correlations between each feature
# Pairwise correlations of every column except the last one (treated as the target).
# .iloc replaces the removed DataFrame.ix indexer.
cor_mat = df.iloc[:, :-1].corr()

plt.figure(figsize=[10, 10])
# vmin/vmax pin the diverging colormap to the full [-1, 1] range so that its neutral
# midpoint corresponds to zero correlation instead of to the middle of the observed values.
sns.heatmap(cor_mat, square=True, annot=True, cmap="RdBu", vmin=-1, vmax=1)
plt.suptitle('Correlations between each Feature', fontsize=20, y=0.94, horizontalalignment='center')

In [None]:
### get the correlated feature pairs

# |r| is rounded to one decimal place before the comparison, so values slightly
# below 0.5 (those that round up to 0.5) are also counted as "high".
CORR_THRESHOLD = 0.5

feature_names = list(cor_mat.index)

# The correlation matrix is symmetric, so walking only the upper triangle visits every
# unordered feature pair exactly once. Each pair is stored as an alphabetically sorted
# tuple and the final list is sorted so the output is the same on every run.
pair_li = sorted(
    tuple(sorted([first, second]))
    for pos, first in enumerate(feature_names)
    for second in feature_names[pos + 1:]
    if np.round(np.abs(cor_mat.loc[first, second]), 1) >= CORR_THRESHOLD
)

print('\n')
print('Features pairs that have high correlations:')
print(pair_li)

# For every pair, build the group made of that pair plus every other pair sharing at
# least one feature with it. Groups are sorted tuples, so identical groups reached from
# different starting pairs collapse into one entry of the set.
intersect_li = sorted({
    tuple(sorted(
        [pair] + [other for other in pair_li
                  if other != pair and set(pair) & set(other)]
    ))
    for pair in pair_li
})

print('\n')
print('Features pairs with high correlations that have overlapping values')
for i, pair in enumerate(intersect_li, start=1):
    print(str(i)+':', pair)

Features pairs with high correlations (equal to or higher than 0.5), taking the values close to 0.5 into account: 
(Al, Mg)
(Si, RI)
(Ca, RI)
(Ba, Mg)
(Ba, Al)

In [None]:
### to test if features has high correlations with the target values
# using the ANOVA

from scipy.stats import f_oneway

def cal_anova(val, label_val, label_li):
    """Run a one-way ANOVA of `val` across the groups named in `label_li`.

    val       -- array of measurements; indexed with a boolean mask.
    label_val -- array of group labels, compared element-wise against each label.
    label_li  -- iterable of the labels to compare; one sample group per label.

    Returns whatever scipy.stats.f_oneway returns for those groups
    (the F statistic and its p-value).
    """
    groups = [val[label_val == label] for label in label_li]
    return f_oneway(*groups)

# One ANOVA per feature column against the glass type. The table is built in a single
# step from (F, p) rows instead of being filled through the removed DataFrame.ix
# indexer, which also gives both columns a numeric dtype for sorting.
type_labels = np.unique(df['Type'].values)
anova_table = pd.DataFrame(
    [tuple(cal_anova(df[col].values, df['Type'].values, type_labels))
     for col in df.columns[:-1]],
    columns=['F_value', 'P_value'],
    index=df.columns[:-1],
)
print(anova_table.sort_values(['F_value'], ascending=False))

In [None]:
### Review the overlapping correlated groups to pick the features worth keeping
from itertools import chain

# First listing: every group of overlapping pairs exactly as stored.
for group_no, group in enumerate(intersect_li, start=1):
    print(group_no, group)

# Second listing: the distinct feature names that occur anywhere in each group.
for group_no, group in enumerate(intersect_li, start=1):
    members = list(chain.from_iterable(group))
    print(group_no, np.unique(members))

# Decision: remove "RI" and keep "Ca" and "Si"
# Decision: remove "Al" and "Ba", and keep "Mg"

Here are the Features not used for modeling: 'RI', 'Al', 'Ba'

In [None]:
### extract the features used for modeling
feature_li = df.columns[:-1]
feature_not_li = ['RI', 'Al', 'Ba']

# np.setdiff1d returns the remaining feature names as a sorted array
feature_li = np.setdiff1d(feature_li, feature_not_li)
print(feature_li)

# .loc replaces the removed DataFrame.ix indexer (label-based column selection)
X = df.loc[:, feature_li]
y = df['Type']
print(X.head())
print(y.head())

### Modeling 

Since there are too few samples per class to set aside a separate test set, I evaluate the model using cross-validation only.

In [None]:
### set the stratified splitter

# sklearn.cross_validation no longer exists; sklearn.model_selection is its replacement.
from sklearn.model_selection import StratifiedKFold

# The model_selection splitter is configured first and yields index pairs from .split().
# The (train_indices, test_indices) pairs are materialised into a list so the same
# folds can be iterated more than once and counted with len().
skf_splitter = list(
    StratifiedKFold(n_splits=5, shuffle=True, random_state=100).split(X, y)
)
for ind in skf_splitter:
    print(ind)
print(len(skf_splitter))

In [None]:
### building the model and evaluate
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_fscore_support

# Fixed, sorted list of classes. Passing it as `labels` keeps every per-class metric
# array and every confusion matrix the same shape, even if a fold happens to contain
# (and predict) no sample of some class.
class_labels = sorted(np.unique(y))

accuracy_mean = 0
precision_mean = np.zeros(len(class_labels))
recall_mean = np.zeros(len(class_labels))
f1_mean = np.zeros(len(class_labels))

for train_idx, test_idx in skf_splitter:
    X_train, y_train = X.values[train_idx], y.values[train_idx]
    X_test, y_test = X.values[test_idx], y.values[test_idx]

    # A fresh forest per fold; the fixed seed makes the reported numbers reproducible.
    model = RandomForestClassifier(n_estimators=100, random_state=100)
    model.fit(X_train, y_train)
    prediction = model.predict(X_test)

    # Each metric is computed once per fold and reused for printing and averaging.
    accuracy = model.score(X_test, y_test)
    prfs = precision_recall_fscore_support(y_test, prediction, labels=class_labels)
    conf_mat = confusion_matrix(y_test, prediction, labels=class_labels)

    print('Accuracy:\n', accuracy)
    accuracy_mean += accuracy
    print('Preision, Recall and F1:\n', prfs)
    precision_mean += prfs[0]
    recall_mean += prfs[1]
    f1_mean += prfs[2]
    print('Confusion Matrix:\n', pd.DataFrame(conf_mat, columns=class_labels,
                                             index=class_labels))
    print('======================================================================\n\n')

# Average over the number of folds actually iterated rather than a hard-coded count.
fold = len(skf_splitter)
print('Average Accuracy:', accuracy_mean/fold)
print('Average Precision:', precision_mean/fold)
print('Average Recall:', recall_mean/fold)
print('Average F1 score:', f1_mean/fold)

The model is not good enough. I'd like to use hyperparameter tuning to produce a better model.

In [None]:
### Grid search to choose the best model
# sklearn.grid_search no longer exists; GridSearchCV lives in sklearn.model_selection.
from sklearn.model_selection import GridSearchCV

# 'auto' is left out of max_features: for a classifier it was an alias of 'sqrt'
# (so it only duplicated work) and recent scikit-learn releases reject it.
param_grid = {'n_estimators': [100, 500, 1000], 'criterion': ['gini', 'entropy'], 'max_features': ['sqrt', 'log2'],
             'max_depth': [5, 10, 15, 20]}
# Parallelism is left to the grid search (n_jobs=-1 below) rather than requested on both
# levels; the fixed seed makes the selected parameters and score reproducible.
model = RandomForestClassifier(random_state=100)
grid = GridSearchCV(model, param_grid, cv=5, n_jobs=-1, scoring='accuracy')
grid.fit(X, y)
print('done')

In [None]:
# Best mean cross-validated score found by the grid search (the search scores with 'accuracy')
print(grid.best_score_)

In [None]:
# Parameter combination from param_grid that achieved that best score
print(grid.best_params_) 

In [None]:
### test on the best model on different subsets

from sklearn.base import clone
# sklearn.cross_validation no longer exists; sklearn.model_selection is its replacement.
from sklearn.model_selection import StratifiedKFold

fold = 5

# set 1234 for different subsets than the ones used by the earlier splitter
skf_splitter = list(
    StratifiedKFold(n_splits=fold, shuffle=True, random_state=1234).split(X, y)
)

# Fixed, sorted list of classes so every per-class metric array and confusion matrix
# has the same shape regardless of which classes show up in a fold.
class_labels = sorted(np.unique(y))

accuracy_mean = 0
precision_mean = np.zeros(len(class_labels))
recall_mean = np.zeros(len(class_labels))
f1_mean = np.zeros(len(class_labels))

for train_idx, test_idx in skf_splitter:
    X_train, y_train = X.values[train_idx], y.values[train_idx]
    X_test, y_test = X.values[test_idx], y.values[test_idx]

    # grid.best_estimator_ has already been fitted on ALL rows by grid.fit(X, y), so
    # predicting with it on any subset of X would score the model on its own training
    # data. An unfitted copy with the same hyperparameters is trained on the training
    # fold only, and the held-out fold is used for scoring.
    # NOTE: the hyperparameters were still chosen using the full data set, so these
    # numbers remain somewhat optimistic compared with a truly unseen test set.
    model = clone(grid.best_estimator_)
    model.fit(X_train, y_train)
    prediction = model.predict(X_test)

    # Each metric is computed once per fold and reused for printing and averaging.
    accuracy = model.score(X_test, y_test)
    prfs = precision_recall_fscore_support(y_test, prediction, labels=class_labels)
    conf_mat = confusion_matrix(y_test, prediction, labels=class_labels)

    print('Accuracy:\n', accuracy)
    accuracy_mean += accuracy
    print('Preision, Recall and F1:\n', prfs)
    precision_mean += prfs[0]
    recall_mean += prfs[1]
    f1_mean += prfs[2]
    print('Confusion Matrix:\n', pd.DataFrame(conf_mat, columns=class_labels,
                                             index=class_labels))
    print('======================================================================\n\n')


print('Average Accuracy:', accuracy_mean/fold)
print('Average Precision:', precision_mean/fold)
print('Average Recall:', recall_mean/fold)
print('Average F1 score:', f1_mean/fold)

#### Now the model looks better. With more data available for further testing, we could gain more insight.

### If you have any questions, feedback or suggestions, feel free to comment below, everything you want to say is quite welcome, thanks!