<a href="https://colab.research.google.com/github/sukcsie/DataScience-and-MachineLearning/blob/main/GlassClassification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Problem Statement**: Glass classification based on household and non-household glasses

**Approaches**: Decision Tree, Random Forest, SVM, Logistic Regression, KNN

**Data Set** can be downloaded from [kaggle](https://www.kaggle.com/uciml/glass) or from [UCI ML Repo](https://archive.ics.uci.edu/ml/datasets/glass+identification)

In [20]:
# importing libraries
import pandas as pd
import numpy as np
import imblearn
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
from google.colab import files
files.upload()

# reading data into glass dataframe
glass = pd.read_csv("glass.csv")

Saving glass.csv to glass.csv


In [3]:
# peeking through the dataframe
glass.head()


Unnamed: 0,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe,Type
0,1.52101,13.64,4.49,1.1,71.78,0.06,8.75,0.0,0.0,1
1,1.51761,13.89,3.6,1.36,72.73,0.48,7.83,0.0,0.0,1
2,1.51618,13.53,3.55,1.54,72.99,0.39,7.78,0.0,0.0,1
3,1.51766,13.21,3.69,1.29,72.61,0.57,8.22,0.0,0.0,1
4,1.51742,13.27,3.62,1.24,73.08,0.55,8.07,0.0,0.0,1


In [4]:
glass.isna().sum() # checking for null values

RI      0
Na      0
Mg      0
Al      0
Si      0
K       0
Ca      0
Ba      0
Fe      0
Type    0
dtype: int64

In [5]:
# shape of the dataframe
glass.shape

(214, 10)

In [6]:
# descriptive statistics of the data
glass.describe()

Unnamed: 0,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe,Type
count,214.0,214.0,214.0,214.0,214.0,214.0,214.0,214.0,214.0,214.0
mean,1.518365,13.40785,2.684533,1.444907,72.650935,0.497056,8.956963,0.175047,0.057009,2.780374
std,0.003037,0.816604,1.442408,0.49927,0.774546,0.652192,1.423153,0.497219,0.097439,2.103739
min,1.51115,10.73,0.0,0.29,69.81,0.0,5.43,0.0,0.0,1.0
25%,1.516523,12.9075,2.115,1.19,72.28,0.1225,8.24,0.0,0.0,1.0
50%,1.51768,13.3,3.48,1.36,72.79,0.555,8.6,0.0,0.0,2.0
75%,1.519157,13.825,3.6,1.63,73.0875,0.61,9.1725,0.0,0.1,3.0
max,1.53393,17.38,4.49,3.5,75.41,6.21,16.19,3.15,0.51,7.0


In [7]:
# looking at types existing in the dataset
glass['Type'].value_counts()

2    76
1    70
7    29
3    17
5    13
6     9
Name: Type, dtype: int64

In [8]:
# glass of Type == 4 does not exist in the dataset
# creating another column and re-mapping the types to either '1' or '0' depending
# on whether its a household or non-household glasses

glass['household'] = glass.Type.map({1:0, 2:0, 3:0, 5:1, 6:1, 7:1})

In [9]:
# updated dataframe
glass.head()

Unnamed: 0,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe,Type,household
0,1.52101,13.64,4.49,1.1,71.78,0.06,8.75,0.0,0.0,1,0
1,1.51761,13.89,3.6,1.36,72.73,0.48,7.83,0.0,0.0,1,0
2,1.51618,13.53,3.55,1.54,72.99,0.39,7.78,0.0,0.0,1,0
3,1.51766,13.21,3.69,1.29,72.61,0.57,8.22,0.0,0.0,1,0
4,1.51742,13.27,3.62,1.24,73.08,0.55,8.07,0.0,0.0,1,0


In [10]:
# getting the column names
glass.columns

Index(['RI', 'Na', 'Mg', 'Al', 'Si', 'K', 'Ca', 'Ba', 'Fe', 'Type',
       'household'],
      dtype='object')

In [48]:
# getting the features
X = glass.loc[0:, ['RI', 'Na', 'Mg', 'Al', 'Si', 'K', 'Ca', 'Ba','Fe']]# getting the class labels

In [49]:
X

Unnamed: 0,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe
0,1.52101,13.64,4.49,1.10,71.78,0.06,8.75,0.00,0.0
1,1.51761,13.89,3.60,1.36,72.73,0.48,7.83,0.00,0.0
2,1.51618,13.53,3.55,1.54,72.99,0.39,7.78,0.00,0.0
3,1.51766,13.21,3.69,1.29,72.61,0.57,8.22,0.00,0.0
4,1.51742,13.27,3.62,1.24,73.08,0.55,8.07,0.00,0.0
...,...,...,...,...,...,...,...,...,...
209,1.51623,14.14,0.00,2.88,72.61,0.08,9.18,1.06,0.0
210,1.51685,14.92,0.00,1.99,73.06,0.00,8.40,1.59,0.0
211,1.52065,14.36,0.00,2.02,73.42,0.00,8.44,1.64,0.0
212,1.51651,14.38,0.00,1.94,73.61,0.00,8.48,1.57,0.0


In [50]:
print(X.shape)

(214, 9)


In [14]:
# getting the labels
y = glass['household']
y

0      0
1      0
2      0
3      0
4      0
      ..
209    1
210    1
211    1
212    1
213    1
Name: household, Length: 214, dtype: int64

In [15]:
glass['household'].value_counts()

0    163
1     51
Name: household, dtype: int64

In [16]:
# you can also use scikit-learn counter to count the classes
# summarize class distribution

from collections import Counter
counter = Counter(glass['household'])
print(counter)

Counter({0: 163, 1: 51})


In [60]:
'''
# scatter plot of examples by class label
for label, _ in counter.items():
  #print(label)
  row_ix = np.where(y == label)[0]
  #print(row_ix)
  #print(X.iloc[row_ix][0])
  plt.scatter(X[row_ix], y[row_ix], label=str(label))
  '''

'\n# scatter plot of examples by class label\nfor label, _ in counter.items():\n  #print(label)\n  row_ix = np.where(y == label)[0]\n  #print(row_ix)\n  #print(X.iloc[row_ix][0])\n  plt.scatter(X[row_ix], y[row_ix], label=str(label))\n  '

In [63]:
# the dataset is unbalanced, so need to take care of the classes first
# transform the dataset

from imblearn.over_sampling import SMOTE

oversample = SMOTE()
X, y = oversample.fit_resample(X, y)
# summarize the new class distribution
counter = Counter(y)
print(counter)

Counter({0: 163, 1: 163})


In [64]:
# train-test split 
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1,random_state=15)

In [65]:
# scaling the values
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train, y_train)

In [66]:
# transform the test
X_test_scaled = scaler.transform(X_test)

**Prediction using Decision Tree**


In [67]:
from sklearn import tree
dtc = tree.DecisionTreeClassifier()
dtc = dtc.fit(X_train_scaled, y_train)

In [68]:
y_pred_dt = dtc.predict(X_test_scaled)

In [69]:
# computing accuracy
from sklearn.metrics import accuracy_score
dt_result = accuracy_score(y_test, y_pred_dt)
print(dt_result)

0.9696969696969697


**Prediction using Random FOrest**

In [70]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(max_depth=4, random_state=0)
rfc = rfc.fit(X_train_scaled, y_train)
y_pred_rf = rfc.predict(X_test_scaled)

In [71]:
rf_result = accuracy_score(y_test, y_pred_rf)
print(rf_result)

0.9696969696969697


**Prediction using SVM**

In [72]:
from sklearn import svm
svmc = svm.SVC()
svmc = svmc.fit(X_train_scaled, y_train)


In [73]:
y_pred_svm = svmc.predict(X_test_scaled)


In [74]:
svm_result = accuracy_score(y_test, y_pred_svm)
print(svm_result)

0.9696969696969697


**Prediction using Logistic Regression**

In [75]:
from sklearn.linear_model import LogisticRegression
lrc = LogisticRegression().fit(X_train_scaled, y_train)
y_pred_lrc = lrc.predict(X_test_scaled)

In [76]:
lr_result = accuracy_score(y_pred_lrc, y_test)
print(lr_result)

0.9090909090909091


**Prediction using KNN**

In [77]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=3)
knn = knn.fit(X_train_scaled, y_train)


In [78]:
y_pred_knn = knn.predict(X_test_scaled)

In [79]:
knn_result = accuracy_score(y_pred_knn, y_test)
print(knn_result)

0.9696969696969697


**Comparison of different classifiers**

In [80]:
result_dict = {
    "Decision Tree" : dt_result,
    "Random Forest" : rf_result,
    "SVM" : svm_result,
    "Logistic Regression" : lr_result,
    "K-Nearest Neighbor" : knn_result
}


In [81]:
result_df = pd.DataFrame.from_dict(result_dict, orient='index', columns=['Accuracy'])
result_df.head()

Unnamed: 0,Accuracy
Decision Tree,0.969697
Random Forest,0.969697
SVM,0.969697
Logistic Regression,0.909091
K-Nearest Neighbor,0.969697
