In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


In [None]:
import os

# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))


In [None]:
# Read the glass dataset into a DataFrame and preview its first five rows.
data = pd.read_csv(filepath_or_buffer='../input/glass/glass.csv')
data.head(n=5)

In [None]:
# Call info() so the cell prints the column dtypes and non-null counts;
# without the parentheses only the bound-method repr would be shown.
data.info()

The data is taken from the UCI Machine Learning Repository.

Attribute Information:

RI: refractive index

Na: Sodium (unit measurement: weight percent in corresponding oxide, as are attributes 4-10)

Mg: Magnesium

Al: Aluminum

Si: Silicon

K: Potassium

Ca: Calcium

Ba: Barium

Fe: Iron

Objective: To correctly classify/identify the glass type from the attributes given in the feature columns.

It is a multiclass classification problem.

The 7 glass types are as follows:
    
    1 building_windows_float_processed
    2 building_windows_non_float_processed
    3 vehicle_windows_float_processed
    4 vehicle_windows_non_float_processed (none in this database)
    5 containers
    6 tableware
    7 headlamps
    


# Data Analysis

1) Checking Missing or Null values


In [None]:
# Number of missing (NaN/None) entries in each column.
data.isnull().sum()

2) Checking data distribution

In [None]:
# Summary statistics per column, with the quartiles spelled out explicitly.
data.describe(percentiles=[0.25, 0.5, 0.75])

In [None]:
# A cell only echoes its last expression, so the shape is printed explicitly
# to make sure it is actually shown alongside the column index.
print("Shape:", data.shape)
#data size is small
data.columns

In [None]:

%matplotlib inline

features=['RI', 'Na', 'Mg', 'Al', 'Si', 'K', 'Ca', 'Ba', 'Fe']

for col in features:
    data.boxplot(column=col, by= 'Type',figsize=(6,6))
    plt.title(col)
plt.show()

# Preparing the model (Basic)

In [None]:
# Split the frame into a feature matrix and a target vector (NumPy arrays).
label = 'Type'
X = data[features].to_numpy()
y = data[label].to_numpy()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; stratifying on y keeps the class
# proportions the same in both partitions, and the seed makes the split repeatable.
xtrain, xtest, ytrain, ytest = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
    stratify=y,
)

In [None]:
from sklearn.linear_model import LogisticRegression

# Regularization rate; scikit-learn expects its inverse as the C parameter.
reg = 0.01

# The 'saga' solver shuffles the data internally, so a fixed random_state is
# needed for the fitted model (and every metric derived from it) to be
# reproducible. `multi_class` is left at its default ('auto'): passing it
# explicitly is deprecated in recent scikit-learn releases and changes nothing.
lr = LogisticRegression(
    C=1 / reg,
    solver='saga',
    max_iter=800,
    random_state=0,
).fit(xtrain, ytrain)
ypred = lr.predict(xtest)

In [None]:
# Show the first 15 predictions next to the corresponding true labels.
for caption, label_values in (('Predicted labels: ', ypred), ('Actual labels   : ', ytest)):
    print(caption, label_values[:15])

In [None]:
from sklearn.metrics import classification_report

# Per-class precision, recall and F1 for the logistic regression predictions.
print(classification_report(y_true=ytest, y_pred=ypred))

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score

# Overall scores; precision and recall are macro-averaged, i.e. every class
# contributes equally regardless of how many samples it has.
overall_scores = {
    "Overall Accuracy:": accuracy_score(ytest, ypred),
    "Overall Precision:": precision_score(ytest, ypred, average='macro'),
    "Overall Recall:": recall_score(ytest, ypred, average='macro'),
}
for heading, score in overall_scores.items():
    print(heading, score)

In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of the logistic regression model:
# rows are the actual classes, columns the predicted ones.
mcm = confusion_matrix(y_true=ytest, y_pred=ypred)
print(mcm)


In [None]:
# Distinct glass type codes present in the dataset.
data.loc[:, 'Type'].unique()

Switching to other models: several classifiers are compared to see which one performs best,
and the features are preprocessed by scaling their values.

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier

# Candidate models. The tree-based ones are randomized, so they get a fixed
# random_state to make the comparison reproducible on every run.
svm = SVC(probability=True)
rf = RandomForestClassifier(n_estimators=100, random_state=0)
knn = KNeighborsClassifier(n_neighbors=5)
# max_features='auto' is no longer accepted by current scikit-learn releases;
# for these two classifiers it was an alias of 'sqrt', which is used instead.
dt = DecisionTreeClassifier(max_features='sqrt', random_state=0)
gbc = GradientBoostingClassifier(max_features='sqrt', random_state=0)

classifiers = [svm, rf, knn, dt, gbc]

# Scale every feature column. ColumnTransformer drops columns that are not
# listed (remainder='drop' by default), so the index list must start at 0;
# otherwise the first feature would be silently discarded.
feat_cols = list(range(xtrain.shape[1]))
feature_transformer = Pipeline(steps=[('scaler', StandardScaler())])

preprocessor = ColumnTransformer(transformers=[('preprocess', feature_transformer, feat_cols)])

# Fit each classifier behind the same preprocessing step and report its
# accuracy and macro-averaged precision/recall on the held-out test set.
for clf in classifiers:
    pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('clf', clf)])
    model = pipeline.fit(xtrain, ytrain)
    ympred = model.predict(xtest)
    print(clf)
    print("Overall Accuracy:", accuracy_score(ytest, ympred))
    print("Overall Precision:", precision_score(ytest, ympred, average='macro'))
    print("Overall Recall:", recall_score(ytest, ympred, average='macro'))
    print("\n")


The highest accuracy and precision were achieved by the RandomForestClassifier.
Let's analyze the RandomForest model's results.

The confusion matrix shows the intersection of predicted and actual label values for each class - in simple terms, the diagonal intersections from top-left to bottom-right indicate the number of correct predictions.

When dealing with multiple classes, it's generally more intuitive to visualize this as a heat map.


In [None]:
# Refit the random forest on its own so its predictions can be inspected.
# A fixed random_state keeps the predictions (and the confusion matrix built
# from them) identical from one run to the next.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=0)),
])
model = pipeline.fit(xtrain, ytrain)
rf_pred = model.predict(xtest)


In [None]:
from sklearn.metrics import confusion_matrix

# Display name for each glass type code.
type_names = {
    1: 'build_windows',
    2: 'buildingwindowsnonfloatprocessed',
    3: 'vehiclewindowsfloatprocessed',
    4: 'vehiclewindows',
    5: 'containers',
    6: 'tableware',
    7: 'headlamps',
}

# The matrix only has a row/column for classes that actually occur in the
# true or predicted labels (the data description says type 4 has no samples).
# Pinning the class order here lets the label list be built in the same order.
class_ids = np.unique(np.concatenate([ytest, rf_pred]))

# Print the confusion matrix
mcm = confusion_matrix(ytest, rf_pred, labels=class_ids)
print(mcm)

# One label per matrix row/column, aligned with class_ids; unknown codes fall
# back to their numeric value instead of raising.
types = [type_names.get(int(class_id), str(class_id)) for class_id in class_ids]

In [None]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

plt.imshow(mcm, interpolation="nearest", cmap=plt.cm.Blues)
plt.colorbar()
tick_marks = np.arange(len(types))
plt.xticks(tick_marks, types, rotation=45)
plt.yticks(tick_marks, types)
plt.xlabel("Predicted Type")
plt.ylabel("Actual Type")
plt.show()

The darker squares in the confusion matrix plot indicate high numbers of cases, and you can hopefully see a diagonal line of darker squares indicating cases where the predicted and actual labels are the same.