# Glass Identification Project

In [44]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, roc_auc_score, classification_report, mean_squared_error
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import cross_val_score
from collections import Counter
sns.set_style('darkgrid')
import warnings
warnings.filterwarnings('ignore')

In [45]:
data=pd.read_csv('https://raw.githubusercontent.com/dsrscientist/dataset3/main/glass.csv')
data

Unnamed: 0,1,1.52101,13.64,4.49,1.10,71.78,0.06,8.75,0.00,0.00.1,1.1
0,2,1.51761,13.89,3.60,1.36,72.73,0.48,7.83,0.00,0.00,1
1,3,1.51618,13.53,3.55,1.54,72.99,0.39,7.78,0.00,0.00,1
2,4,1.51766,13.21,3.69,1.29,72.61,0.57,8.22,0.00,0.00,1
3,5,1.51742,13.27,3.62,1.24,73.08,0.55,8.07,0.00,0.00,1
4,6,1.51596,12.79,3.61,1.62,72.97,0.64,8.07,0.00,0.26,1
...,...,...,...,...,...,...,...,...,...,...,...
208,210,1.51623,14.14,0.00,2.88,72.61,0.08,9.18,1.06,0.00,7
209,211,1.51685,14.92,0.00,1.99,73.06,0.00,8.40,1.59,0.00,7
210,212,1.52065,14.36,0.00,2.02,73.42,0.00,8.44,1.64,0.00,7
211,213,1.51651,14.38,0.00,1.94,73.61,0.00,8.48,1.57,0.00,7


In [46]:
data.columns=['ID','RI', 'Na', 'Mg', 
                   'Al', 'Si', 'K', 'Ca', 'Ba',
                       'Fe', 'Type_of_glass']

In [47]:
data

Unnamed: 0,ID,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe,Type_of_glass
0,2,1.51761,13.89,3.60,1.36,72.73,0.48,7.83,0.00,0.00,1
1,3,1.51618,13.53,3.55,1.54,72.99,0.39,7.78,0.00,0.00,1
2,4,1.51766,13.21,3.69,1.29,72.61,0.57,8.22,0.00,0.00,1
3,5,1.51742,13.27,3.62,1.24,73.08,0.55,8.07,0.00,0.00,1
4,6,1.51596,12.79,3.61,1.62,72.97,0.64,8.07,0.00,0.26,1
...,...,...,...,...,...,...,...,...,...,...,...
208,210,1.51623,14.14,0.00,2.88,72.61,0.08,9.18,1.06,0.00,7
209,211,1.51685,14.92,0.00,1.99,73.06,0.00,8.40,1.59,0.00,7
210,212,1.52065,14.36,0.00,2.02,73.42,0.00,8.44,1.64,0.00,7
211,213,1.51651,14.38,0.00,1.94,73.61,0.00,8.48,1.57,0.00,7


In [48]:
data=data.drop(['ID'], axis=1)
data

Unnamed: 0,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe,Type_of_glass
0,1.51761,13.89,3.60,1.36,72.73,0.48,7.83,0.00,0.00,1
1,1.51618,13.53,3.55,1.54,72.99,0.39,7.78,0.00,0.00,1
2,1.51766,13.21,3.69,1.29,72.61,0.57,8.22,0.00,0.00,1
3,1.51742,13.27,3.62,1.24,73.08,0.55,8.07,0.00,0.00,1
4,1.51596,12.79,3.61,1.62,72.97,0.64,8.07,0.00,0.26,1
...,...,...,...,...,...,...,...,...,...,...
208,1.51623,14.14,0.00,2.88,72.61,0.08,9.18,1.06,0.00,7
209,1.51685,14.92,0.00,1.99,73.06,0.00,8.40,1.59,0.00,7
210,1.52065,14.36,0.00,2.02,73.42,0.00,8.44,1.64,0.00,7
211,1.51651,14.38,0.00,1.94,73.61,0.00,8.48,1.57,0.00,7


In [49]:
data.describe()

Unnamed: 0,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe,Type_of_glass
count,213.0,213.0,213.0,213.0,213.0,213.0,213.0,213.0,213.0,213.0
mean,1.518353,13.406761,2.676056,1.446526,72.655023,0.499108,8.957934,0.175869,0.057277,2.788732
std,0.003039,0.818371,1.440453,0.499882,0.774052,0.653035,1.426435,0.498245,0.097589,2.10513
min,1.51115,10.73,0.0,0.29,69.81,0.0,5.43,0.0,0.0,1.0
25%,1.51652,12.9,2.09,1.19,72.28,0.13,8.24,0.0,0.0,1.0
50%,1.51768,13.3,3.48,1.36,72.79,0.56,8.6,0.0,0.0,2.0
75%,1.51915,13.83,3.6,1.63,73.09,0.61,9.18,0.0,0.1,3.0
max,1.53393,17.38,3.98,3.5,75.41,6.21,16.19,3.15,0.51,7.0


In [50]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 213 entries, 0 to 212
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   RI             213 non-null    float64
 1   Na             213 non-null    float64
 2   Mg             213 non-null    float64
 3   Al             213 non-null    float64
 4   Si             213 non-null    float64
 5   K              213 non-null    float64
 6   Ca             213 non-null    float64
 7   Ba             213 non-null    float64
 8   Fe             213 non-null    float64
 9   Type_of_glass  213 non-null    int64  
dtypes: float64(9), int64(1)
memory usage: 16.8 KB


In [51]:
data=data.dropna()

In [52]:
data.isnull().sum()

RI               0
Na               0
Mg               0
Al               0
Si               0
K                0
Ca               0
Ba               0
Fe               0
Type_of_glass    0
dtype: int64

In [53]:
data

Unnamed: 0,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe,Type_of_glass
0,1.51761,13.89,3.60,1.36,72.73,0.48,7.83,0.00,0.00,1
1,1.51618,13.53,3.55,1.54,72.99,0.39,7.78,0.00,0.00,1
2,1.51766,13.21,3.69,1.29,72.61,0.57,8.22,0.00,0.00,1
3,1.51742,13.27,3.62,1.24,73.08,0.55,8.07,0.00,0.00,1
4,1.51596,12.79,3.61,1.62,72.97,0.64,8.07,0.00,0.26,1
...,...,...,...,...,...,...,...,...,...,...
208,1.51623,14.14,0.00,2.88,72.61,0.08,9.18,1.06,0.00,7
209,1.51685,14.92,0.00,1.99,73.06,0.00,8.40,1.59,0.00,7
210,1.52065,14.36,0.00,2.02,73.42,0.00,8.44,1.64,0.00,7
211,1.51651,14.38,0.00,1.94,73.61,0.00,8.48,1.57,0.00,7


In [54]:
y=data['Type_of_glass']
y.head()

0    1
1    1
2    1
3    1
4    1
Name: Type_of_glass, dtype: int64

In [55]:
data=data.drop('Type_of_glass', axis=1)
data.head()

Unnamed: 0,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe
0,1.51761,13.89,3.6,1.36,72.73,0.48,7.83,0.0,0.0
1,1.51618,13.53,3.55,1.54,72.99,0.39,7.78,0.0,0.0
2,1.51766,13.21,3.69,1.29,72.61,0.57,8.22,0.0,0.0
3,1.51742,13.27,3.62,1.24,73.08,0.55,8.07,0.0,0.0
4,1.51596,12.79,3.61,1.62,72.97,0.64,8.07,0.0,0.26


In [57]:
from sklearn import preprocessing
x=data.values
min_max_scaler=preprocessing.MinMaxScaler()
x_scaled=min_max_scaler.fit_transform(x)
data=pd.DataFrame(x_scaled)
data

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,0.283582,0.475188,0.904523,0.333333,0.521429,0.077295,0.223048,0.000000,0.000000
1,0.220808,0.421053,0.891960,0.389408,0.567857,0.062802,0.218401,0.000000,0.000000
2,0.285777,0.372932,0.927136,0.311526,0.500000,0.091787,0.259294,0.000000,0.000000
3,0.275241,0.381955,0.909548,0.295950,0.583929,0.088567,0.245353,0.000000,0.000000
4,0.211150,0.309774,0.907035,0.414330,0.564286,0.103060,0.245353,0.000000,0.509804
...,...,...,...,...,...,...,...,...,...
208,0.223003,0.512782,0.000000,0.806854,0.500000,0.012882,0.348513,0.336508,0.000000
209,0.250219,0.630075,0.000000,0.529595,0.580357,0.000000,0.276022,0.504762,0.000000
210,0.417032,0.545865,0.000000,0.538941,0.644643,0.000000,0.279740,0.520635,0.000000
211,0.235294,0.548872,0.000000,0.514019,0.678571,0.000000,0.283457,0.498413,0.000000


In [61]:
# splitting the data for test and training
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test=train_test_split(data,y,test_size=0.30, random_state=42)
print((X_train.shape))

(149, 9)


In [62]:
!pip install pydotplus

Collecting pydotplus
  Downloading pydotplus-2.0.2.tar.gz (278 kB)
Building wheels for collected packages: pydotplus
  Building wheel for pydotplus (setup.py): started
  Building wheel for pydotplus (setup.py): finished with status 'done'
  Created wheel for pydotplus: filename=pydotplus-2.0.2-py3-none-any.whl size=24566 sha256=00a4d97d1fd7702cb799e90d46e0270e32a00a135fb93d23d0a57ce880c63bf2
  Stored in directory: c:\users\dell\appdata\local\pip\cache\wheels\fe\cd\78\a7e873cc049759194f8271f780640cf96b35e5a48bef0e2f36
Successfully built pydotplus
Installing collected packages: pydotplus
Successfully installed pydotplus-2.0.2


In [63]:
clf=DecisionTreeClassifier(criterion='entropy')
clf.fit(X_train,y_train)
y_pred=clf.predict(X_test)
accuracy_score(y_test, y_pred)

0.703125

In [64]:
from sklearn.metrics import mean_squared_error
import math
clf=DecisionTreeClassifier()
clf.fit(X_train, y_train)
ypred=clf.predict(X_test)
asc=accuracy_score(y_test, y_pred)
mse=mean_squared_error(y_test, y_pred)
rmse=math.sqrt(mse)
print(asc,mse,rmse)

0.703125 3.0625 1.75


In [65]:
data.fillna(data.mean(), inplace=True)
data

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,0.283582,0.475188,0.904523,0.333333,0.521429,0.077295,0.223048,0.000000,0.000000
1,0.220808,0.421053,0.891960,0.389408,0.567857,0.062802,0.218401,0.000000,0.000000
2,0.285777,0.372932,0.927136,0.311526,0.500000,0.091787,0.259294,0.000000,0.000000
3,0.275241,0.381955,0.909548,0.295950,0.583929,0.088567,0.245353,0.000000,0.000000
4,0.211150,0.309774,0.907035,0.414330,0.564286,0.103060,0.245353,0.000000,0.509804
...,...,...,...,...,...,...,...,...,...
208,0.223003,0.512782,0.000000,0.806854,0.500000,0.012882,0.348513,0.336508,0.000000
209,0.250219,0.630075,0.000000,0.529595,0.580357,0.000000,0.276022,0.504762,0.000000
210,0.417032,0.545865,0.000000,0.538941,0.644643,0.000000,0.279740,0.520635,0.000000
211,0.235294,0.548872,0.000000,0.514019,0.678571,0.000000,0.283457,0.498413,0.000000


In [71]:
from sklearn.datasets import load_digits
from sklearn.linear_model import Perceptron
clf=Perceptron(tol=1e-3, random_state=0)
clf.fit(X_train, y_train)
clf.score(X_test, y_test)

0.546875

In [72]:
from sklearn.metrics import classification_report, confusion_matrix
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[13  4  0  0  0  1]
 [ 4 17  0  2  0  1]
 [ 1  1  2  0  0  0]
 [ 0  2  0  2  0  0]
 [ 0  0  0  0  3  0]
 [ 1  2  0  0  0  8]]
              precision    recall  f1-score   support

           1       0.68      0.72      0.70        18
           2       0.65      0.71      0.68        24
           3       1.00      0.50      0.67         4
           5       0.50      0.50      0.50         4
           6       1.00      1.00      1.00         3
           7       0.80      0.73      0.76        11

    accuracy                           0.70        64
   macro avg       0.77      0.69      0.72        64
weighted avg       0.72      0.70      0.70        64



In [74]:
#using logistic regression
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
clf=LogisticRegression(random_state=0, solver='lbfgs', multi_class='multinomial').fit(X_train, y_train)

In [75]:
clf.score(X_test, y_test)

0.5625

In [76]:
# using random forest
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error
from math import sqrt
clf=RandomForestClassifier(n_estimators=150, max_depth=3, random_state=0)
clf.fit(X_train, y_train)
ypred=clf.predict(X_test)
accuracy_score(y_test, y_pred)
asc=accuracy_score(y_test, y_pred)
mse=mean_squared_error(y_test, y_pred)
rmse=sqrt(mse)
print(asc,mse,rmse)


0.703125 3.0625 1.75


In [90]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import mean_squared_error
from math import sqrt
clf=MultinomialNB()
clf.fit(X_train,y_train)
pred=clf.predict(X_test)
accuracy_score(y_test, pred)
asc=accuracy_score(y_test, y_pred)
mse=mean_squared_error(y_test, y_pred)
rmse=sqrt(mse)
print(asc,mse,rmse)

0.703125 3.0625 1.75


In [89]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
knn=KNeighborsClassifier(n_neighbors=10)
knn.fit(X_train, y_train)
pred=knn.predict(X_test)
accuracy_score(y_test, pred)
print(classification_report(y_test, pred))
print(confusion_matrix(y_pred,pred))


              precision    recall  f1-score   support

           1       0.58      0.83      0.68        18
           2       0.72      0.75      0.73        24
           3       0.00      0.00      0.00         4
           5       0.00      0.00      0.00         4
           6       0.00      0.00      0.00         3
           7       0.73      0.73      0.73        11

    accuracy                           0.64        64
   macro avg       0.34      0.39      0.36        64
weighted avg       0.56      0.64      0.59        64

[[15  4  0  0  0  0]
 [ 5 18  0  1  0  2]
 [ 1  1  0  0  0  0]
 [ 2  0  0  1  0  1]
 [ 2  1  0  0  0  0]
 [ 1  1  0  0  0  8]]
