# Problem statement

Prepare a model for glass classification using KNN

# Importing the libraries

In [None]:
from pandas import read_csv
import numpy as np
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
import seaborn as sn
import pandas as pd
from sklearn.model_selection import GridSearchCV,train_test_split
import numpy as np
import imblearn
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import RandomOverSampler

# Loading the dataset

In [None]:
# Location of the glass dataset inside the Kaggle input directory.
GLASS_CSV_PATH = "../input/glass/glass.csv"
glass = pd.read_csv(GLASS_CSV_PATH)

In [None]:
# Preview the first rows of the dataset (head() shows 5 rows by default).
glass.head()

We can peek at a few data points using the pandas `head` function. By default, `head` returns the first 5 rows.

# Data Insights

In [None]:
# Dimensions of the dataset as a (rows, columns) tuple.
glass.shape

In [None]:
# Print the column names, dtypes and non-null counts to check for missing values.
glass.info()

### Observations :-

##### We could see that there are no null values in our dataset

In [None]:
# Boolean mask flagging every row that repeats an earlier row.
dups = glass.duplicated()
duplicate_count = dups.sum()
print('Number of duplicate rows: %d' % duplicate_count)

There is one duplicate row, we will delete the duplicate row.

In [None]:
# Record the row count, drop exact duplicate rows, then report both counts.
rows_before = len(glass)
glass = glass.drop_duplicates()
rows_after = len(glass)

print('Number of rows before discarding duplicates = %d' % rows_before)
print('Number of rows after discarding duplicates = %d' % rows_after)

# Summary statistics

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
glass.describe()

### Observations :-

1. We could see there is a lot of difference between 50% (percentile) value and the max value for K(Potassium) and Ca(Calcium). So there is a chance of having an outlier in these 2 columns. We will further check using boxplots
2. We can see the min,max and standard deviations including 25,50 and 75 percentile values.

# Understanding the target variable

##### Our main objective is to classify the type of glass based on weight percentage. We have a column Type which has the values from 1 to 7 which is to determine the glass type



##### The value_counts() method shows how many samples there are for each glass type.

In [None]:
# Number of samples per glass type (class distribution of the target).
glass['Type'].value_counts()

##### We can see that the most frequent glass types in our data are type 2 and type 1, with 76 and 69 samples respectively (after removing the duplicate row). The value counts of all the other types are much lower, so the data is imbalanced. Datasets in which the classes are not evenly distributed are called imbalanced datasets. An imbalanced dataset can make the model's accuracy misleadingly high or low because a single class dominates the score.

In [None]:
# Bar chart of the number of samples per glass type.
sn.set(style='whitegrid', font_scale=1.4)
fig, ax = plt.subplots(figsize=(12, 7))
sn.countplot(x='Type', data=glass, palette='Pastel1', ax=ax)

# Data visualization

In [None]:
# Box plot of RI. The column is passed as ``x=`` explicitly: a positional
# vector triggers a FutureWarning in seaborn 0.11 and is interpreted as
# ``data`` from 0.12 onwards, which changes the plot orientation.
sn.boxplot(x=glass['RI'])

In [None]:
# Box plot of Na. The column is passed as ``x=`` explicitly: a positional
# vector triggers a FutureWarning in seaborn 0.11 and is interpreted as
# ``data`` from 0.12 onwards, which changes the plot orientation.
sn.boxplot(x=glass['Na'])

In [None]:
# Box plot of Mg. The column is passed as ``x=`` explicitly: a positional
# vector triggers a FutureWarning in seaborn 0.11 and is interpreted as
# ``data`` from 0.12 onwards, which changes the plot orientation.
sn.boxplot(x=glass['Mg'])

In [None]:
# Box plot of Al. The column is passed as ``x=`` explicitly: a positional
# vector triggers a FutureWarning in seaborn 0.11 and is interpreted as
# ``data`` from 0.12 onwards, which changes the plot orientation.
sn.boxplot(x=glass['Al'])

In [None]:
# Box plot of Si. The column is passed as ``x=`` explicitly: a positional
# vector triggers a FutureWarning in seaborn 0.11 and is interpreted as
# ``data`` from 0.12 onwards, which changes the plot orientation.
sn.boxplot(x=glass['Si'])

In [None]:
# Box plot of K. The column is passed as ``x=`` explicitly: a positional
# vector triggers a FutureWarning in seaborn 0.11 and is interpreted as
# ``data`` from 0.12 onwards, which changes the plot orientation.
sn.boxplot(x=glass['K'])

In [None]:
# Box plot of Ca. The column is passed as ``x=`` explicitly: a positional
# vector triggers a FutureWarning in seaborn 0.11 and is interpreted as
# ``data`` from 0.12 onwards, which changes the plot orientation.
sn.boxplot(x=glass['Ca'])

In [None]:
# Box plot of Ba. The column is passed as ``x=`` explicitly: a positional
# vector triggers a FutureWarning in seaborn 0.11 and is interpreted as
# ``data`` from 0.12 onwards, which changes the plot orientation.
sn.boxplot(x=glass['Ba'])

In [None]:
# Box plot of Fe. The column is passed as ``x=`` explicitly: a positional
# vector triggers a FutureWarning in seaborn 0.11 and is interpreted as
# ``data`` from 0.12 onwards, which changes the plot orientation.
sn.boxplot(x=glass['Fe'])

In [None]:
# Box plot of the target column. The column is passed as ``x=`` explicitly: a
# positional vector triggers a FutureWarning in seaborn 0.11 and is interpreted
# as ``data`` from 0.12 onwards, which changes the plot orientation.
sn.boxplot(x=glass['Type'])

### Observations :-

1. We can see there are outliers in all the columns except Mg. The Type column also shows an outlier because, as we already saw, the value counts are very low for type 7.
2. The median line for Mg and K is towards the upper quartile, which means the data is skewed. We will check the distplots to confirm this.

### Distplots

Dist plots are used to check the distribution of the data, find the peak value (the observation with the highest frequency) and check for skewness in the data.

In [None]:
# Distribution of RI.
# NOTE(review): distplot is deprecated since seaborn 0.11; consider histplot/displot once the installed version is confirmed.
sn.distplot(a=glass['RI'])

In [None]:
# Distribution of Na.
# NOTE(review): distplot is deprecated since seaborn 0.11; consider histplot/displot once the installed version is confirmed.
sn.distplot(a=glass['Na'])

In [None]:
# Distribution of Mg.
# NOTE(review): distplot is deprecated since seaborn 0.11; consider histplot/displot once the installed version is confirmed.
sn.distplot(a=glass['Mg'])

In [None]:
# Distribution of Al.
# NOTE(review): distplot is deprecated since seaborn 0.11; consider histplot/displot once the installed version is confirmed.
sn.distplot(a=glass['Al'])

In [None]:
# Distribution of Si.
# NOTE(review): distplot is deprecated since seaborn 0.11; consider histplot/displot once the installed version is confirmed.
sn.distplot(a=glass['Si'])

In [None]:
# Distribution of K.
# NOTE(review): distplot is deprecated since seaborn 0.11; consider histplot/displot once the installed version is confirmed.
sn.distplot(a=glass['K'])

In [None]:
# Distribution of Ca.
# NOTE(review): distplot is deprecated since seaborn 0.11; consider histplot/displot once the installed version is confirmed.
sn.distplot(a=glass['Ca'])

In [None]:
# Distribution of Ba.
# NOTE(review): distplot is deprecated since seaborn 0.11; consider histplot/displot once the installed version is confirmed.
sn.distplot(a=glass['Ba'])

In [None]:
# Distribution of Fe.
# NOTE(review): distplot is deprecated since seaborn 0.11; consider histplot/displot once the installed version is confirmed.
sn.distplot(a=glass['Fe'])

In [None]:
# Distribution of the target column.
# NOTE(review): distplot is deprecated since seaborn 0.11; consider histplot/displot once the installed version is confirmed.
sn.distplot(a=glass['Type'])

### Observations :-

1. There is very high positive skewness in RI, Na, Al, K, Ca, Ba, Fe and Type
2. There is negative skewness in Mg and Si

# Heatmap

##### Heatmap is a very effective technique to check the missing values in the dataset and to also understand if there is any correlation between the features of the data

In [None]:
# Feature column names and the name of the target column.
X = ['RI', 'Na', 'Mg', 'Al', 'Si', 'K', 'Ca', 'Ba', 'Fe']
Y = 'Type'

# Boolean mask of missing feature values, rendered as a heatmap.
missing_mask = glass[X].isna()
sn.heatmap(missing_mask)

### Observations :-

1. We don't have any missing values in our dataset. If it was present, there would be a different colour shade appearing on the red background. 

In [None]:
# Heatmap of the pairwise correlations between the feature columns.
sn.heatmap(data=glass[X].corr())

In [None]:
# Numeric table behind the correlation heatmap (Pearson is the pandas default).
glass[X].corr(method="pearson")

# Separating feature data and Label data  and train-test split

In [None]:
# Feature matrix: everything except the target, restricted to the nine
# measurement columns in a fixed order.
feature_columns = ['RI', 'Na', 'Mg', 'Al', 'Si', 'K', 'Ca', 'Ba', 'Fe']
X = glass.drop(columns=["Type"]).reindex(columns=feature_columns)

# Target vector: the glass type label.
Y = glass["Type"]

# Stratified 80/20 split so each glass type keeps its proportion in both parts.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=.2,
    random_state=30,
    stratify=Y,
)


In [None]:
# Inspect the training features.
X_train

In [None]:
# Inspect the training labels.
Y_train

In [None]:
# Inspect the held-out test features.
X_test

In [None]:
# Inspect the held-out test labels.
Y_test

# Grid Search for Algorithm Tuning

In [None]:
# Candidate values of k for the grid search: 1 through 39 inclusive.
n_neighbors = np.arange(1, 40)
param_grid = {"n_neighbors": n_neighbors}

In [None]:
# Display the parameter grid that will be searched.
param_grid

In [None]:
# Evaluate every candidate k with 10-fold cross-validation on the training
# split and report the best-scoring parameter setting.
model = KNeighborsClassifier()
grid = GridSearchCV(model, param_grid, cv=10).fit(X_train, Y_train)
print(grid.best_params_)

##### After applying GridSearch, we got the best K (n_neighbors) value as 1, so we will be using the k= 1 for KNN Classifier algorithm

### Visualizing the CV results

In [None]:
import matplotlib.pyplot as plt 
%matplotlib inline
# choose k between 1 to 41
k_range = range(1, 41)
k_scores = []
# use iteration to caclulator different k in models, then return the average accuracy based on the cross validation
for k in k_range:
    knn = KNeighborsClassifier(n_neighbors=k)
    scores = cross_val_score(knn, X_train, Y_train, cv=10)
    k_scores.append(scores.mean())
# plot to see clearly
plt.plot(k_range, k_scores)
plt.xlabel('Value of K for KNN')
plt.ylabel('Cross-Validated Accuracy')
plt.show()

##### We could see that the model accuracy is very good for k values smaller than 5 and as the value increases the accuracy goes on decreasing

# Using KNN Classifier for prediction

In [None]:
# Train a 1-nearest-neighbour classifier and score it on the held-out test set.
model = KNeighborsClassifier(n_neighbors=1)
model.fit(X_train, Y_train)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_true=Y_test, y_pred=y_pred)
print(accuracy)

##### We can see that the accuracy score which we have got for our model is 0.76 which is 76%. It is decent accuracy score. But the accuracy score can be misleading for imbalanced data. So we will use confusion matrix and classification report metrics further

In [None]:
# Confusion matrix of the test predictions (true labels first, predictions second).
# The result is stored under its own name: assigning it to `confusion_matrix`
# would shadow the imported sklearn function, so re-running this cell (or any
# later call to the function) would fail with "ndarray is not callable".
conf_matrix = confusion_matrix(Y_test, y_pred)
print(conf_matrix)

In [None]:
# Per-class precision, recall and F1 for the test predictions.
report = classification_report(Y_test, y_pred)
print(report)

##### The precision, recall and f1-score for type 3 are very low. For type 1, the precision is low but the recall and f1-score are good. Since the data is imbalanced, we can see the precision and recall values are affected. We will use an oversampling technique, because the dataset is very small and undersampling would cause data loss.

# Using Over Sampling for balancing the data

##### We will use SMOTE over sampling technique for oversampling the data

In [None]:
# SMOTE oversampler: 'not majority' resamples every class except the majority
# class; the fixed random_state makes the synthetic samples reproducible.
sm = SMOTE(sampling_strategy = 'not majority', random_state = 42)

In [None]:
# Oversample the full feature/label data and wrap the new labels in a
# DataFrame so the class counts can be inspected.
# NOTE(review): this resamples X and Y before any train/test split, so synthetic
# points can be interpolated from rows that later land in a test set (data
# leakage) — confirm this output is only used to inspect the class counts.
x_resample, y_resample = sm.fit_resample(X, Y)
y_df = pd.DataFrame(y_resample)

In [None]:
# Class counts after oversampling.
y_df.value_counts()

##### We could see the data is resampled now and all the type values are 76  now. Previously  type 1 and type 2  were having values 69 and 76 respectively, and other counts were very low. We will split the resampled data into training and test data and build a KNN model 

##### We will apply Standardization to make the scale free and to make data  consistent

In [None]:
# Split the ORIGINAL data first so the held-out test set contains only real,
# untouched samples. Splitting data that was already oversampled lets synthetic
# points interpolated from test rows leak into training and inflates the scores.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = .2, random_state = 40, stratify = Y)

# Balance the classes in the training portion only.
X_train, Y_train = sm.fit_resample(X_train, Y_train)

# Fit the scaler on the training data only, then apply the same transform to
# the test data so no test statistics influence the model.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
# NOTE(review): scores measured on this leak-free test set are expected to
# differ from numbers recorded with a split of pre-oversampled data.


# Using GridSearch for Algorithm Tuning after resampling

In [None]:
# Candidate values of k: 1 through 39 inclusive.
n_neighbors = np.arange(1, 40)
param_grid = {"n_neighbors": n_neighbors}

# 10-fold cross-validated grid search over k on the current training data.
model = KNeighborsClassifier()
grid = GridSearchCV(model, param_grid, cv=10).fit(X_train, Y_train)
print(grid.best_params_)

##### After applying GridSearch, we got the best K (n_neighbors) value as 1, so we will be using the k= 1 for KNN Classifier algorithm

### Visualizing the accuracy with different k values on sampled data

In [None]:
import matplotlib.pyplot as plt 
%matplotlib inline
# choose k between 1 to 41
k_range = range(1, 41)
k_scores = []
# use iteration to caclulator different k in models, then return the average accuracy based on the cross validation
for k in k_range:
    knn = KNeighborsClassifier(n_neighbors=k)
    scores = cross_val_score(knn, X_train, Y_train, cv=10)
    k_scores.append(scores.mean())
# plot to see clearly
plt.plot(k_range, k_scores)
plt.xlabel('Value of K for KNN')
plt.ylabel('Cross-Validated Accuracy')
plt.show()

##### The accuracy is high for low values of k (less than 5) and it decreases as we increase the value of k

# Using KNN with k=1 for model classification 

##### We had identified the k=1 is best parameter with GridSearch so using k as 1

In [None]:
# Train a 1-nearest-neighbour classifier on the current training data and
# score it on the test set.
model = KNeighborsClassifier(n_neighbors=1)
model.fit(X_train, Y_train)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_true=Y_test, y_pred=y_pred)
print(accuracy)

##### The accuracy is 0.89 which is 89% after applying sampling. But we will use confusion matrix and classification report to further check our accuracy

In [None]:
# Re-import so the name is guaranteed to be the sklearn function even if an
# earlier cell rebound `confusion_matrix` to a result array.
from sklearn.metrics import confusion_matrix

# Store the result under its own name so the function is not shadowed and this
# cell stays safe to re-run.
conf_matrix = confusion_matrix(Y_test, y_pred)
conf_matrix

In [None]:
# Per-class precision, recall and F1 for the test predictions.
report = classification_report(Y_test, y_pred)
print(report)

##### We could see the precision and recall values are more than 0.75 for all the 7 types which is very decent score. 