In [None]:
import numpy as np
import pandas as pd 

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

#SMOTE 
import imblearn
from imblearn.over_sampling import SMOTE

#LDA 
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.preprocessing import StandardScaler


from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score


from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

import warnings
# Suppress every warning category for the rest of the session so that library
# deprecation/convergence messages do not clutter cell output.
# NOTE(review): this also hides warnings that may point at real problems.
warnings.filterwarnings('ignore')

# Disable pandas' chained-assignment (SettingWithCopy) check.
pd.options.mode.chained_assignment = None

import os
# List the full path of every file available under the Kaggle input directory.
input_file_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in input_file_paths:
    print(input_path)


In [None]:
# Read the glass dataset from the Kaggle input folder and preview its first rows.
glass_csv_path = '../input/glass/glass.csv'
data = pd.read_csv(glass_csv_path)
data.head()

In [None]:
# Print a concise summary of the frame: column names, non-null counts,
# dtypes and memory usage.
data.info()

### **Checking null values**
Missing values in the dataset can cause problems. Before classification, the missing values problem should be solved.
There is no null value in the dataset.

In [None]:
# Count the missing entries in each column.
data.isna().sum()

The statistical results of the data can help us make some inferences about the dataset.
The `describe()` method displays summary statistics for each feature, such as the mean, the standard deviation, and the minimum and maximum values.

For example, the Si feature has the highest mean value.

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for each numeric column.
data.describe()

### **Checking duplicate values**
Repeated data may prevent the model from producing correct results.
The `duplicated()` method is used to check whether the dataset contains any repeated rows.


In [None]:
# Flag rows that are exact copies of an earlier row, then report how many there are.
dups = data.duplicated()
duplicate_row_count = dups.sum()
print('Number of duplicate rows: %d' % duplicate_row_count)

In [None]:
# Remove duplicated rows, reporting the row count before and after.
rows_before = data.shape[0]
print('Number of rows before discarding duplicates = %d' % rows_before)

data2 = data.drop_duplicates()
rows_after = data2.shape[0]
print('Number of rows after discarding duplicates = %d' % rows_after)

### **Checking dataset imbalance**

A dataset in which the classes are not represented by approximately the same number of samples is called an imbalanced dataset. On such data, a high accuracy value can be driven almost entirely by the dominant classes.

The `value_counts()` method shows how many samples there are for each glass type. Type 2 and Type 1 have more samples than the other types.

In [None]:
# Number of samples available for each glass type.
data2['Type'].value_counts()

In [None]:
# Bar chart of the number of samples per glass type (after de-duplication).
sns.set(font_scale = 1.8, style = 'whitegrid')
fig, ax = plt.subplots(figsize = (12, 7))
sns.countplot(data = data2, x = 'Type', palette = 'Pastel1', ax = ax)

### **Checking outliers**

In [None]:
# Name of the label column and the nine input features of the dataset.
target_class_name = 'Type'
features = ['RI', 'Na', 'Mg', 'Al', 'Si', 'K', 'Ca', 'Ba', 'Fe']

sns.set(style = 'whitegrid', font_scale = 1.2, palette = 'Paired')

# Create the 3x3 grid in a single call. The previous combination of
# plt.subplots(figsize=...) followed by plt.subplot(3, 3, i) first created one
# full-figure Axes and then stacked nine more on top of it; recent Matplotlib
# versions no longer remove the overlapped Axes, leaving a stray frame behind
# the grid.
fig, axes = plt.subplots(3, 3, figsize = (20, 15))

# One box plot per feature, grouped by glass type, to expose outliers.
for ax, feature in zip(axes.ravel(), features):
    sns.boxplot(x = target_class_name, y = feature, data = data2, ax = ax)


### **correlation coefficients**
Correlation measures the strength of the relationship between two variables. Absolute values above roughly 0.70–0.80 indicate a high correlation. Highly correlated features carry largely redundant information, which can be misleading for the model. 
Attention should be paid to feature selection while applying the model.


There is a high correlation of 0.81 between Ca and RI in the dataset.
##### Ba - Al -> 0.48
##### Ba - Na -> 0.33
There is a negative correlation between Si and RI with a value of -0.54.

In [None]:
# Pairwise correlation between the input features.
correlation = data2[features].corr()

# Boolean mask hiding the upper triangle (diagonal included), because the
# matrix is symmetric and the upper half only repeats the lower half.
mask = np.triu(np.ones_like(correlation, dtype = bool))

fig, ax = plt.subplots(figsize = (10, 10))
# `linewidths` is the documented seaborn parameter for the cell separators;
# the former `linewidth` spelling was only forwarded as an extra matplotlib
# keyword alongside seaborn's own `linewidths=0`.
sns.heatmap(correlation, mask = mask, cmap = 'YlGnBu', annot = True,
            linewidths = .5, square = True, ax = ax)

The data is split into 80% training data and 20% test data.

In [None]:
# Feature matrix: every column except the label, in a fixed column order.
feature_columns = ['RI','Na','Mg','Al','Si','K','Ca','Ba','Fe']
X = pd.DataFrame(data2.drop(columns = ["Type"]), columns = feature_columns)
# Target vector: the glass type.
y = data2['Type']

# Stratified hold-out split so each glass type keeps its proportion in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size = .2,
    stratify = y,
    random_state = 40,
)

### K-Nearest Neighbors Classification
It is a widely used supervised machine learning algorithm. The algorithm first fixes a value of k, which is the number of neighbors to look at. When a new sample arrives, its distance to the training samples is calculated and the k nearest ones are selected. Euclidean, Manhattan, Minkowski and Hamming functions can be used for the distance calculation. After the distances are calculated, they are sorted and the new sample is assigned to the class that is most common among its k nearest neighbors.

With the help of GridSearchCV, after calculating the best value in the range of (1, 50) of the number of neighbors to be given to the model, the accuracy value was calculated as 0.7674.

In [None]:
# Search n_neighbors over 1..49 with 10-fold cross-validation on the training data.
knn_params = {'n_neighbors': np.arange(1,50)}
knn = KNeighborsClassifier()
knn_cv_model = GridSearchCV(estimator = knn, param_grid = knn_params, cv = 10)
knn_cv_model.fit(X_train, y_train)

# Best-scoring neighbor count found by the search.
n_neig = knn_cv_model.best_params_['n_neighbors']

In [None]:
# Refit KNN with the selected neighbor count and score it on the held-out test set.
knn_tuned = KNeighborsClassifier(n_neighbors = n_neig)
knn_tuned.fit(X_train, y_train)

y_pred = knn_tuned.predict(X_test)
knn_accuracy = accuracy_score(y_true = y_test, y_pred = y_pred)
print('Accuracy score for KNN: {}'.format(knn_accuracy))

Accuracy is the number of correctly predicted data points out of all the data points.
As a performance measure, accuracy is inappropriate for imbalanced classification problems. Therefore, it is necessary to evaluate the results of other metrics together.

The confusion matrix that enables the calculation of the metrics that measure the success of the classification models was created.

In [None]:
#Confusion Matrix of KNN
# Sorted union of the true and predicted labels: the same ordering that
# confusion_matrix applies by default. Passing it explicitly lets the heatmap
# ticks show the real glass types instead of positional indices 0..n-1.
class_labels = np.union1d(y_test, y_pred)
conf_mat = confusion_matrix(y_test, y_pred, labels = class_labels)

# fmt='d' prints the integer counts as-is; `linewidths` is the documented
# seaborn parameter for the cell separators.
ax = sns.heatmap(conf_mat, annot = True, fmt = 'd', linewidths = 2, cbar = False, cmap = 'GnBu',
                 xticklabels = class_labels, yticklabels = class_labels)
ax.set_ylabel('True values')
ax.set_xlabel('Predicted values')
ax.set_title('Confusion Matrix')
plt.show()

An alternative to using classification accuracy is to use precision and recall metrics.

##### -> Precision is the fraction of positive class predictions that actually belong to the positive class.
##### -> Recall is the fraction of all positive examples in the dataset that were correctly predicted as positive.
##### -> F-Measure provides a single score that balances both the concerns of precision and recall in one number.

In [None]:
# Per-class precision, recall, F1-score and support for the KNN predictions.
knn_report = classification_report(y_test, y_pred)
print(knn_report)

### SMOTE (Synthetic Minority Over-sampling Technique)

One of the methods that can be used to eliminate the imbalance in the data set is resampling. It has two methods:

The first method is to increase the data belonging to the minority classes to obtain classes with an equal number of data. It is called oversampling.

Another method is to obtain a balanced data set by removing samples that belong to the majority class. It is called undersampling.


The oversampling method was preferred because the number of samples in the data set used is small and the undersampling process will cause data loss. In order to apply this method, the SMOTE technique in imbalanced-learn library was used and synthetic data belonging to minority classes were produced.


In [None]:
# Class distribution of the target before any resampling.
sns.set(font_scale = 1.8, style = 'whitegrid')
fig, ax = plt.subplots(figsize = (12, 7))
before_ax = sns.countplot(x = y, palette = 'Pastel1', ax = ax)
before_ax.set_title('Before SMOTE')

In [None]:
# Oversample every class except the majority one with synthetic samples.
# NOTE(review): this resamples the full dataset; a train/test split drawn from
# x_res / y_res would place synthetic neighbours of test points in training.
sm = SMOTE(random_state = 42, sampling_strategy = 'not majority')
x_res, y_res = sm.fit_resample(X, y)
y_res_df = pd.DataFrame(y_res)

# Class distribution after resampling.
sns.set(font_scale = 1.8, style = 'whitegrid')
fig, ax = plt.subplots(figsize = (12, 7))
after_ax = sns.countplot(x = y_res_df['Type'], palette = 'Pastel1', ax = ax)
after_ax.set_title('After SMOTE')


In [None]:
# Shapes of the resampled features and labels, followed by the per-class counts.
for resampled in (x_res, y_res):
    print(resampled.shape)

y_res.value_counts()

#### Standard Scaler

Situations such as the distribution of data and the scale differences between features are factors that affect the operation of algorithms. In this case, pulling the features into a common data range enables us to obtain more accurate results.

In [None]:
# Split the ORIGINAL (non-resampled) data first. Splitting the SMOTE output
# instead leaks information: synthetic points interpolated from samples that
# end up in the test set would be used for training, and the test set itself
# would contain synthetic samples, which inflates the reported scores.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .2, random_state = 40, stratify = y)

# Oversample the minority classes using the training portion only.
# SMOTE needs more samples per class than k_neighbors, so cap k_neighbors
# (default 5) by the size of the smallest training class.
smallest_class_size = int(y_train.value_counts().min())
smote_train = SMOTE(sampling_strategy = 'not majority', random_state = 42,
                    k_neighbors = max(1, min(5, smallest_class_size - 1)))
X_train, y_train = smote_train.fit_resample(X_train, y_train)

# Fit the scaler on the training data only and reuse its statistics for the
# test data so both end up on the same scale without peeking at the test set.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)


### LDA (Linear Discriminant Analysis)
It is used as a dimensionality reduction technique. It reduces the number of features in the data set while maximizing the separation between classes. The goal is to prevent overfitting and reduce computational costs.

In the data set, 9 attributes were reduced to 5 significant components with the help of LDA.

In [None]:
# Project the scaled features onto 5 linear discriminant components.
# The projection is learned from the training data and then applied to both sets.
lda = LDA(n_components = 5)
lda.fit(X_train, y_train)
X_train = lda.transform(X_train)
X_test = lda.transform(X_test)

for projected in (X_train, X_test):
    print(projected.shape)

### KNN Classification again

In [None]:
# Repeat the n_neighbors search (1..49, 10-fold CV) on the transformed training data.
knn_params = {'n_neighbors': np.arange(1,50)}
knn = KNeighborsClassifier()
knn_cv_model = GridSearchCV(estimator = knn, param_grid = knn_params, cv = 10)
knn_cv_model.fit(X_train, y_train)
n_neighbor = knn_cv_model.best_params_['n_neighbors']

In [None]:
# Refit KNN with the selected neighbor count and score it on the transformed test set.
knn_tuned = KNeighborsClassifier(n_neighbors = n_neighbor)
knn_tuned.fit(X_train, y_train)

knn_y_pred = knn_tuned.predict(X_test)
knn_acc = accuracy_score(y_true = y_test, y_pred = knn_y_pred)
print('Accuracy score for KNN: {}'.format(knn_acc))

In [None]:
#Confusion Matrix of KNN
# Sorted union of the true and predicted labels: the same ordering that
# confusion_matrix applies by default. Passing it explicitly lets the heatmap
# ticks show the real glass types instead of positional indices 0..n-1.
class_labels = np.union1d(y_test, knn_y_pred)
conf_mat = confusion_matrix(y_test, knn_y_pred, labels = class_labels)

# fmt='d' prints the integer counts as-is; `linewidths` is the documented
# seaborn parameter for the cell separators.
ax = sns.heatmap(conf_mat, annot = True, fmt = 'd', linewidths = 2, cbar = False, cmap = 'GnBu',
                 xticklabels = class_labels, yticklabels = class_labels)
ax.set_ylabel('True values')
ax.set_xlabel('Predicted values')
ax.set_title('Confusion Matrix')
plt.show()

In [None]:
# Per-class precision, recall, F1-score and support for the KNN predictions.
knn_report = classification_report(y_test, knn_y_pred)
print(knn_report)