# ***Glass Classification by Machine learning***

In [None]:
# Show the banner image that opens the notebook.
from PIL import Image, ImageTk  # NOTE(review): ImageTk is not used in this cell
banner_path = "../input/glassss/Untitledw21.png"
image = Image.open(banner_path)
image

# **Part 1 -: Importing Libraries and Data**

In [None]:
import pandas as pd # Importing pandas for data manipulation and analysis
import numpy as np # Importing python linear algebra library to do work with arrays
from sklearn.model_selection import train_test_split # to split the data into train and test sets
import seaborn as sns # to import a python library to create alluring and communicative plots and graphs
import matplotlib.pyplot as plt # to import the ploting library of python language
from sklearn.preprocessing import StandardScaler #Importing the Standard Sclaer from sklearn.preprocessing
from sklearn.model_selection import GridSearchCV # Import grid search to choose the best parameters of model
import warnings # to import warnings
warnings.filterwarnings('ignore') # to import warnings as 'ignore'
from sklearn.metrics import accuracy_score # Import accuracy score function of python that helps to evaluate models

In [None]:
# Read the glass dataset from its CSV file into a DataFrame, then echo it as plain text.
glass_csv_path = "../input/glass/glass.csv"
df = pd.read_csv(glass_csv_path)
print(df)

# **Part 2 -: Data Preprocessing**

##  *i)Checking for missing data*

In [None]:
# Count the null values in every column and show the counts as a one-column table.
# (This cell only reports the counts; it does not remove anything.)
df.isna().sum().to_frame()


> ### Analysis -: Data contains no missing values, which is a good thing !

 ## **ii)Checking Inconsistency in the data values**

In [None]:
# Print a summary of the DataFrame (columns, non-null counts and dtypes) to check type consistency.
df.info()

> ### Analysis -: As we can see from the above output, the data is consistent: all values of a given feature share the same data type.
> ### For example, all the values in the "Type" column have int64 as their data type.

 ## **iii)Checking duplicate values**

In [None]:
# Find the rows that occur more than once in the dataset.
# duplicated() flags every repeat of an earlier row; the first occurrence is not flagged.
is_repeat = df.duplicated()
duplicate_rows = df[is_repeat]
print(duplicate_rows)

In [None]:
# Drop duplicate rows and keep the result.
# drop_duplicates() returns a new DataFrame rather than modifying df in place, so the
# result must be assigned back; otherwise every later cell still sees the duplicates.
# The index is rebuilt so row labels stay contiguous after the removal.
df = df.drop_duplicates().reset_index(drop=True)
df  # display the de-duplicated frame

> ### Analysis -: It is important to remove duplicate rows in order to avoid bias in the dataset. As our dataset contains some duplicate rows, I have removed them so that repeated samples do not skew the data.

# **Part 3 -: Exploratory Data Analysis**

 ## **i)Univariate Analysis of Numerical Features**

In [None]:
# visualizing numeric variables using seaborn: one panel per feature on a 3x3 grid
sns.set(font_scale=1.5)
sns.set_style(style='darkgrid')
f, axes = plt.subplots(3, 3, figsize=(25, 25))

# (feature, histogram fill colour), listed in row-major panel order
panel_specs = [
    ("RI", "#0000ff"), ("Na", "#00cc00"), ("Mg", "#e68a00"),
    ("Al", "#992600"), ("Si", "#e600ac"), ("K", "skyblue"),
    ("Ca", "orange"), ("Ba", "olive"), ("Fe", "red"),
]

for panel, (feature, fill_colour) in zip(axes.flat, panel_specs):
    if feature == "Ba":
        # Ba is drawn as a plain pandas histogram (no density curve), titled explicitly
        df[feature].plot.hist(color=[fill_colour], edgecolor="k", linewidth=2, ax=panel, title='Ba')
    else:
        # histogram in the feature's colour with a black density curve on top
        bar_style = dict(edgecolor="k", linewidth=2, color=fill_colour)
        sns.distplot(df[feature], hist_kws=bar_style, color="black", ax=panel)

> ### Analysis -: i) RI -: The graph shows a multimodal, right-skewed distribution. Most samples have a refractive index between 1.515 and 1.520.
> ### ii) Na, Al and Fe have unimodal distributions, with their peaks lying in the ranges 12 - 14, 1 - 2 and -0.1 to 0.1 respectively.
> ### iii) However, Ca, K, Si and Mg have bimodal distributions, whereas the distribution of Ba is right skewed.

 ## **ii)Univariate Analysis of Categorical Features**

In [None]:
# visualizing categorical features: number of samples per glass type, most frequent first
type_counts = df['Type'].value_counts()
bar_colours = ['olive', 'skyblue', 'red', 'orange', 'pink', 'blue']
type_counts.plot.bar(color=bar_colours, title='Type of glass', edgecolor="k", linewidth=2)

> ### Analysis -: According to the above output, glass type 2 occurs most frequently in the dataset.

 ## **iii)Multivariate analysis of all the features**

In [None]:
# create correlation matrix: pairwise correlation of every column (features and Type),
# rounded to one decimal so the cell annotations stay readable
corrMatrix = df.corr().round(1)
plt.figure(figsize=(16,11))
# to plot the matrix as heat map
sns.heatmap(corrMatrix, annot=True, cmap='seismic', linewidths=2, linecolor='black')
# The title previously named a different project ("Heart Failure Prediction");
# it now describes the glass dataset that is actually plotted.
plt.title("Heatmap Correlation of Glass Classification Features", fontsize=23)
plt.show()

> ### Analysis -: According to the correlation matrix, we can find out some interesting facts:
> ### i) Calcium and refractive index are positively correlated, which means that as one increases, the other tends to increase roughly linearly.
> ### ii) However, Type and Mg are negatively correlated, which means that higher Mg values tend to go with lower Type codes (a negative linear association, not strict inverse proportionality).

# **Part 4 -: Splitting and Scaling of Data**

In [None]:
# Separate the feature columns from the target column as plain arrays.
feature_frame = df.drop('Type', axis=1)
X = feature_frame.values   # every column except the target
y = df['Type'].values      # glass type label

# Hold out 15% of the rows for testing; the fixed seed makes the shuffle repeatable.
split_parts = train_test_split(X, y, train_size=0.85, random_state=42, shuffle=True)
X_train, X_test, y_train, y_test = split_parts

# Look at the training features.
pd.DataFrame(X_train)

In [None]:
# Standardise the features. The scaler's statistics are learned from the training
# split only, and the same fitted scaler is then applied to the test split.
scaler = StandardScaler()

# fit on the training features and transform them in one step
X_train_scaled = scaler.fit_transform(X_train)

# transform the test features with the statistics learned above
X_test_scaled = scaler.transform(X_test)

# **Part 5 -: Models**

In [None]:
# Two independent, initially empty lists that collect one train accuracy and one
# test accuracy per model, in the order the model cells are run.
train_accuracies, test_accuracies = [], []

 ## **i)Support Vector Classification (SVC) Model**

In [None]:
# Part 1 -: SELECT THE BEST HYPERPARAMETERS WITH HELP OF GRID SEARCH
from sklearn.svm import SVC

# Search space: every combination of C and gamma, RBF kernel only.
Hyper_parameters = dict(
    C=[0.1, 1, 10, 100, 1000],
    gamma=[1, 0.1, 0.01, 0.001, 0.0001],
    kernel=['rbf'],
)

# 15-fold cross-validated search, using every available core.
GridSearch_svc = GridSearchCV(estimator=SVC(), param_grid=Hyper_parameters, cv=15, n_jobs=-1)
GridSearch_svc.fit(X_train_scaled, y_train)

print(f"Best hyperparameters for model:{GridSearch_svc.best_params_}")
print(f"Best estimator for model:{GridSearch_svc.best_estimator_}")

In [None]:
#Part 2 -: Build a model with the help of best parameters
# Read the winning hyperparameters from the fitted grid search instead of retyping
# them by hand, so this cell cannot drift out of sync when the data or the grid changes.
svc = SVC(**GridSearch_svc.best_params_, probability=True)
svc.fit(X_train_scaled, y_train)

# to predict the target values
pred_svc_test = svc.predict(X_test_scaled)
pred_svc_train = svc.predict(X_train_scaled)

# accuracy as a percentage, computed once and reused for both the record and the report
train_accuracy_svc = accuracy_score(y_train, pred_svc_train) * 100
test_accuracy_svc = accuracy_score(y_test, pred_svc_test) * 100

# NOTE: these appends accumulate; re-running this cell without re-running the cell
# that creates the lists adds a second SVC entry.
train_accuracies.append(train_accuracy_svc)
test_accuracies.append(test_accuracy_svc)

# to find the accuracy of the model on training and testing data
print("Accuracy on  Train data : {}".format(train_accuracy_svc))
print("Accuracy on  TEST data : {}".format(test_accuracy_svc))

 ## **ii)Random Forest Classifier Model**

In [None]:
# Part 1 -: Hypertuning of parameters
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Search space in the dict-of-lists form that GridSearchCV expects.
param_grid = {
    'max_depth': [3, 4, 5],
    'max_leaf_nodes': [10, 15, 20],
    'min_samples_leaf': [10, 15, 20, 25],
}

# Grid search over three parameters of a random forest classifier (11 trees, fixed seed),
# scored with 10-fold cross-validation; train scores are kept alongside the test scores.
base_forest = RandomForestClassifier(n_estimators=11, random_state=573)
grid_search = GridSearchCV(base_forest, param_grid, cv=10, return_train_score=True)
grid_search.fit(X_train_scaled, y_train)

print(f"Best parameters:{grid_search.best_params_}")
print(f"Best estimator:{grid_search.best_estimator_}")

In [None]:
#Part 2 -: Build a model with the help of best parameters
# Read the winning hyperparameters from the fitted grid search instead of retyping
# them by hand; n_estimators and random_state match the estimator that was searched.
rf = RandomForestClassifier(**grid_search.best_params_,
                            n_estimators=11, random_state=573)
rf.fit(X_train_scaled, y_train)

# to predict the target values
pred_rf_test = rf.predict(X_test_scaled)
pred_rf_train = rf.predict(X_train_scaled)

# accuracy as a percentage, computed once and reused for both the record and the report
train_accuracy_rf = accuracy_score(y_train, pred_rf_train) * 100
test_accuracy_rf = accuracy_score(y_test, pred_rf_test) * 100

# NOTE: these appends accumulate; re-running this cell without re-running the cell
# that creates the lists adds a second Random Forest entry.
train_accuracies.append(train_accuracy_rf)
test_accuracies.append(test_accuracy_rf)

# to find the accuracy of the model on training and testing data
print("Accuracy on  Train data : {}".format(train_accuracy_rf))
print("Accuracy on  TEST data : {}".format(test_accuracy_rf))

 ## **iii)XG Boost Classifier Model**

In [None]:
# Part 1 -: Hypertuning of parameters
#XGBoost
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier

xg = XGBClassifier(random_state=573)

#List Hyperparameters that we want to tune.
parameter_grid_xg = {'learning_rate': [0.05, 0.10, 0.15, 0.20],
                     'max_depth': [3, 4, 5],
                     'gamma': [0.0, 0.1, 0.2, 0.3]}

# Recent XGBoost releases require class labels to be the consecutive integers
# 0..n_classes-1 and raise a ValueError otherwise. Re-encode the training labels
# just for the search; the selected hyperparameters do not depend on how the
# classes are numbered.
y_train_encoded = LabelEncoder().fit_transform(y_train)

gridsearch_xg = GridSearchCV(xg, parameter_grid_xg, cv=15)
gridsearch_xg.fit(X_train_scaled, y_train_encoded);

#Get best hyperparameters
gridsearch_xg.best_params_

In [None]:
#Part 2 -: Build a model with the help of best parameters
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier

# Recent XGBoost releases require class labels to be the consecutive integers
# 0..n_classes-1. Encode the labels for training and decode the predictions
# afterwards, so accuracy is still measured against the original label values.
label_encoder = LabelEncoder().fit(y_train)

# Read the winning hyperparameters from the fitted grid search instead of retyping them by hand.
xg = XGBClassifier(**gridsearch_xg.best_params_, random_state=573)
xg.fit(X_train_scaled, label_encoder.transform(y_train))

# to predict the target values (mapped back to the original labels)
pred_xg_test = label_encoder.inverse_transform(xg.predict(X_test_scaled))
pred_xg_train = label_encoder.inverse_transform(xg.predict(X_train_scaled))

# accuracy as a percentage, computed once and reused for both the record and the report
train_accuracy_xg = accuracy_score(y_train, pred_xg_train) * 100
test_accuracy_xg = accuracy_score(y_test, pred_xg_test) * 100

# NOTE: these appends accumulate; re-running this cell without re-running the cell
# that creates the lists adds a second XG Boost entry.
train_accuracies.append(train_accuracy_xg)
test_accuracies.append(test_accuracy_xg)

# to find the accuracy of the model on training and testing data
print("Accuracy on  Train data : {}".format(train_accuracy_xg))
print("Accuracy on  TEST data : {}".format(test_accuracy_xg))

# **Part 6 -: Compare the models**

In [None]:
# Model names, in the same order the accuracies were appended by the model cells.
label = ['SVC', 'Random Forest', 'XG Boost']
print(label)

# Round every recorded accuracy to two decimals for reporting.
train_accuracy = list(map(lambda score: round(score, 2), train_accuracies))
print(f"Train Accuracies {train_accuracy}")

test_accuracy = list(map(lambda score: round(score, 2), test_accuracies))
print(f"\nTest Accuracies {test_accuracy}")

In [None]:
# Accuracy table: one row per model, with its train and test accuracy.
accuracy_columns = {
    'Model': label,
    'Train Accuracy(%)': train_accuracy,
    'Test Accuracy(%)': test_accuracy,
}
Acc_df = pd.DataFrame(accuracy_columns)

# Shade the numeric cells with a blue gradient so higher values stand out.
Acc_df.style.background_gradient(cmap='Blues')

> ### Analysis -: As per the above output, I choose XG Boost as the best model, as its train and test accuracies are higher than those of the other models.