# Glass Classification

**Data Description**

1) RI: refractive index

2) Na: Sodium (unit of measurement: weight percent in the corresponding oxide, as are attributes 3-9)

3) Mg: Magnesium

4) Al: Aluminum

5) Si: Silicon

6) K: Potassium

7) Ca: Calcium

8) Ba: Barium

9) Fe: Iron

**Type of glass: (class attribute)**

1) building windows, float processed

2) building windows, non-float processed

3) vehicle windows, float processed

4) vehicle windows, non-float processed (none in this database)

5) containers

6) tableware

7) headlamps

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats

In [None]:
import warnings
warnings.filterwarnings("ignore")

In [None]:
np.random.seed(43)

In [None]:
plt.style.use("ggplot")

In [None]:
df = pd.read_csv("../input/glass/glass.csv")

In [None]:
df.head()

In [None]:
df.describe().T

In [None]:
from pandas.plotting import parallel_coordinates

In [None]:
plt.figure(figsize=(10,8))
parallel_coordinates(df, "Type")

In [None]:
import missingno as msno

In [None]:
msno.bar(df)

In [None]:
# Class distribution of the target. The column is passed by keyword so the
# call does not rely on seaborn treating the first positional argument as `x`.
sns.countplot(x="Type", data=df)

In [None]:
print(df.groupby("Type").min())
print("------------------------------------------------------------------")
print(df.groupby("Type").max())

In [None]:
col = df.columns

In [None]:
# One horizontal box plot per feature column, laid out on a 3x3 grid.
fig, ax = plt.subplots(3, 3, figsize=(10, 10))
# Select the features by name instead of breaking out of the loop at "Type",
# so the result does not depend on the target being the last column.
feature_cols = [c for c in df.columns if c != "Type"]
# zip() stops at the shorter sequence, so a frame with more than nine
# features can never index past the end of the grid.
for axis, c in zip(ax.ravel(), feature_cols):
    # Pass the values as `x=` only; combining a positional vector with
    # `data=df` conflicts with the keyword-only signature of recent seaborn.
    sns.boxplot(x=df[c], ax=axis)

In [None]:
#Detecting the no of outliers present in our data

def outlier_func(data, col):
    """Report IQR-based outliers for one column and plot its distribution.

    Prints the outlier counts, the quartiles, the IQR, the min/max, the
    skewness and the kurtosis of ``data[col]``, then draws the distribution
    with the mean and median marked and the regions beyond the outlier
    bounds shaded.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the column to analyse.
    col : str
        Name of a numeric column in ``data``.

    Returns
    -------
    None
        The function only prints and plots.
    """
    # Work on the frame that was passed in, not on the notebook-level `df`.
    values = data[col]

    Q1 = values.quantile(0.25)
    Q3 = values.quantile(0.75)  # third quartile is the 75th percentile
    IQR = Q3 - Q1

    # Outlier bounds: 1.5 * IQR beyond the quartiles.
    min_val = Q1 - 1.5 * IQR
    max_val = Q3 + 1.5 * IQR

    # Strict comparisons: when IQR is 0 both bounds coincide, and a
    # non-strict test would count a point sitting on them on both sides.
    n_below = int((values < min_val).sum())
    n_above = int((values > max_val).sum())

    print(f"There are {n_below + n_above} total number of outliers in which {n_below} datapoints are below the lower bound {min_val} and {n_above} are above the upper bound {max_val}\n")
    print(f"The IQR of '{col}' is: {IQR}")

    print(f"The Q1 Deviation of '{col}' is: {Q1}")
    print(f"The Q3 Deviation of '{col}' is: {Q3} \n")

    print("The min value is:", values.min())
    print("The max value is:", values.max(), "\n")

    print("The skewness is: ", scipy.stats.skew(values))
    print("The Kurtosis is: ", scipy.stats.kurtosis(values))

    # Visual representation of the distribution and its outlier regions.
    plt.figure(figsize=(8, 6))
    # NOTE: distplot is kept for compatibility with the seaborn version this
    # notebook targets; newer releases deprecate it in favour of histplot.
    sns.distplot(values, color='g')
    # Label the artists explicitly: positional legend labels are matched to
    # artists by order, which does not reliably hit the mean/median lines.
    plt.axvline(values.mean(), linestyle='--', color='k', label="Mean")
    plt.axvline(values.median(), linestyle='--', color='orange', label="Median")

    # Shade only the parts of the data range that lie beyond the bounds.
    span_label = "Outlier Bound"
    if values.min() < min_val:
        plt.axvspan(xmin=values.min(), xmax=min_val, alpha=0.15, color='r', label=span_label)
        span_label = "_nolegend_"  # avoid a duplicate legend entry
    if values.max() > max_val:
        plt.axvspan(xmin=max_val, xmax=values.max(), alpha=0.15, color='r', label=span_label)

    plt.legend()

In [None]:
outlier_func(df, "RI")

In [None]:
df["Ba"].value_counts()

In [None]:
df.drop(labels = 'Ba', axis = 1, inplace = True) # Let's remove 'Ba' from our dataset, as most of its values are 0

In [None]:
df

In [None]:
corrmat = df.corr()

In [None]:
plt.figure(figsize=(10,8))
sns.heatmap(corrmat, annot = True, cmap = 'Blues')

In [None]:
df.drop(labels = "RI", axis = 1, inplace = True)

In [None]:
plt.figure(figsize=(8,6))
sns.heatmap(df.corr(), annot = True, cmap = 'Blues')

In [None]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# Features: every column except the target, selected by name so the split
# stays correct if the number of feature columns changes.
X = df.drop(labels = "Type", axis = 1)

In [None]:
# Target: the glass type, selected by name rather than by column position.
y = df["Type"]

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y , test_size = 0.2, random_state = 0)

In [None]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

In [None]:
test_score = []
train_score = []

In [None]:
# Sweep k = 1..15 and record the accuracy of each fitted model on both splits.
for k in range(1, 16):
    knn = KNeighborsClassifier(k).fit(X_train, y_train)
    train_score.append(knn.score(X_train, y_train))
    test_score.append(knn.score(X_test, y_test))

In [None]:
# Train vs. test accuracy for each k; labelled so the two curves can be told apart.
plt.plot(range(1,16), train_score, label = "Train accuracy")
plt.plot(range(1,16), test_score, label = "Test accuracy")
plt.xlabel("K Neighbors")
plt.ylabel("Accuracy")
plt.legend()

As per the above plot, 1 neighbor looks ideal, but we know that using n = 1 will overfit our model, so we do not want that.

In [None]:
#Let's standardize the data and then try again
#We will also add an additional parameter later on (presumably `stratify` in train_test_split)
from sklearn.preprocessing import StandardScaler

In [None]:
se = StandardScaler()

In [None]:
X_col = X.columns

In [None]:
X_std = se.fit_transform(X)

In [None]:
X_std = pd.DataFrame(X_std, columns = X_col)

In [None]:
X_std.mean()

In [None]:
X_std.std()

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_std, y , test_size = 0.2, random_state = 0)

In [None]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

In [None]:
test_score = []
train_score = []

In [None]:
# Repeat the k = 1..15 sweep on the current split, recording both accuracies.
for k in range(1, 16):
    knn = KNeighborsClassifier(k).fit(X_train, y_train)
    train_score.append(knn.score(X_train, y_train))
    test_score.append(knn.score(X_test, y_test))

In [None]:
# Train vs. test accuracy for each k; labelled so the two curves can be told apart.
plt.plot(range(1,16), train_score, label = "Train accuracy")
plt.plot(range(1,16), test_score, linestyle = '--', marker = '*', label = "Test accuracy")
plt.xlabel("K Neighbors")
plt.ylabel("Accuracy")
plt.legend()

In [None]:
knn = KNeighborsClassifier(4)

In [None]:
knn.fit(X_train, y_train)

In [None]:
y_pred = knn.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score, classification_report

In [None]:
print(accuracy_score(y_test, y_pred) * 100)

In [None]:
# Distribution of the classes predicted by the k=4 model; the vector is passed
# by keyword so the call does not rely on positional handling in seaborn.
sns.countplot(x = y_pred)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_std, y , test_size = 0.2, random_state = 0, stratify = y)

In [None]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

In [None]:
neighbors = [x for x in range(1,16)]

In [None]:
# Choose k by 10-fold cross-validated accuracy.
# Cross-validate on the standardized training split only: using the raw `X`
# would tune k on unscaled features (unlike the model fitted afterwards) and
# would let the held-out test rows take part in model selection.
cross_score = []
for k in neighbors:
    KNN = KNeighborsClassifier(n_neighbors = k)
    scores = cross_val_score(KNN, X_train, y_train, cv = 10, scoring = 'accuracy')
    cross_score.append(scores.mean())

In [None]:
# Misclassification error (1 - mean cross-validated accuracy) for each k.
# Despite the name, this is not a mean squared error.
MSE = [1-x for x in cross_score]

In [None]:
plt.plot(neighbors, MSE)
plt.xlabel("K Neighbors")
plt.ylabel("Error")
plt.show()

In [None]:
#So based on this, our best value for k is 3
KNN_Model = KNeighborsClassifier(n_neighbors = 3)
KNN_Model_Fit = KNN_Model.fit(X_train, y_train)
KNN_Model_Predict = KNN_Model_Fit.predict(X_test)

In [None]:
print(round(accuracy_score(y_test, KNN_Model_Predict) * 100),"%")

In [None]:
print(classification_report(y_test, KNN_Model_Predict))

In [None]:
# Plot the predictions of the model evaluated just above; `y_pred` still
# holds the output of the earlier k=4 model.
sns.countplot(x = KNN_Model_Predict)

In [None]:
#Let's try standardizing the data after splitting it and see whether our accuracy improves
#Standardizing after the split (fitting the scaler on the training data only) avoids data leakage

In [None]:
X_train_n, X_test_n, y_train_n, y_test_n = train_test_split(X, y, test_size = 0.2, random_state =0, stratify=y)

In [None]:
# Shapes of the split created just above (the *_n variables), not of the
# earlier standardized split.
print(X_train_n.shape)
print(X_test_n.shape)
print(y_train_n.shape)
print(y_test_n.shape)

In [None]:
#Now try to standardize the data
se_new = StandardScaler()

In [None]:
# Learn the scaling parameters from the training split only...
X_train_std = se_new.fit_transform(X_train_n)
# ...and reuse them for the test split. Calling fit_transform here would
# re-fit the scaler on the test data, putting the two splits on different
# scales and leaking test statistics.
X_test_std = se_new.transform(X_test_n)

In [None]:
# Reuse k = 3 (the value chosen earlier) on the data standardized after splitting.
KNN_Model = KNeighborsClassifier(n_neighbors = 3)
# Fit against the labels of the same split the features came from.
KNN_Model_Fit = KNN_Model.fit(X_train_std, y_train_n)
KNN_Model_Predict = KNN_Model_Fit.predict(X_test_std)

In [None]:
# Score against the labels of the split the predictions were made on.
print(round(accuracy_score(y_test_n, KNN_Model_Predict) * 100),"%")

In [None]:
# Per-class metrics against the labels of the split the predictions were made on.
print(classification_report(y_test_n, KNN_Model_Predict))

In [None]:
# Plot the predictions of the model evaluated just above; `y_pred` still
# holds the output of the earlier k=4 model.
sns.countplot(x = KNN_Model_Predict)

Our accuracy score is not good because of the class imbalance. To fix this, we can use sampling techniques that address class imbalance, which I will be trying soon.

If you like this kernel, please do upvote. I am new to Machine Learning, so if you have any feedback, please feel free to share it with me.