# Models

# GLASS CLASSIFICATION 

> - 1.0 Introduction
    - 1.1 Importing libraries
    - 1.2 Collecting the data
  
> - 2.0 Preprocessing
    - 2.1 Dropping irrelevant features
    - 2.2 Outliers Handling
    - 2.3 Checking Class Imbalance
   
> - 3.0 Model Training
     - 3.0.1 Standardizing the Data
- 3.1 Logistic Regression
- 3.2 Naive Bayes
- 3.3 Random Forest
- 3.4 Extra Tree
- 3.5 Decision Tree
- 3.6 XGBoost
- 3.7 Gradient Boosting
- 3.8 KNN
- 3.9 Support Vector
- 3.10 Ada Boost
- 3.11 Ensembling: Gradient Boosting and Random Forest

> - 4.0 Final Result


# 1.0 Introduction

> In this notebook we classify types of glass according to their chemical composition.

## 1.1 Importing libraries

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix
from collections import Counter
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import Pipeline 
from time import time
import warnings

In [None]:
# Models
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import (XGBClassifier, plot_importance)
from sklearn.svm import SVC
from sklearn.ensemble import (RandomForestClassifier, AdaBoostClassifier, ExtraTreesClassifier, GradientBoostingClassifier)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

In [None]:
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline 
# Suppress every Python warning for the rest of the session. This also hides
# deprecation warnings emitted by the libraries used below.
warnings.filterwarnings('ignore')

## 1.2 Understanding the Data

In [None]:
import os

# Walk the Kaggle input directory recursively and print the full path of
# every file found, so the dataset location can be confirmed.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))

In [None]:
# Load the glass dataset CSV from the Kaggle input directory into a DataFrame.
df = pd.read_csv('/kaggle/input/glass/glass.csv')

In [None]:
# Preview the first rows of the raw data.
df.head()

In [None]:
# (number of rows, number of columns) of the raw data.
df.shape

In [None]:
# Column names, dtypes and non-null counts.
df.info()

# 2.0 Preprocessing

## 2.1 Dropping irrelevant features

In [None]:
# Work on a copy so the raw DataFrame `df` stays untouched.
# NOTE: no column is actually dropped in this cell; `df1` has the same columns as `df`.
df1 = df.copy()

## 2.2 Outliers Handling

In [None]:
# Predictor columns used for the box plots and the outlier check below.
features = ['RI', 'Na', 'Mg', 'Al', 'Si', 'K', 'Ca', 'Ba', 'Fe']

In [None]:
sns.set(style='darkgrid')

# Draw one box plot per predictor column so outliers can be inspected
# visually; each plot is shown as its own figure.
for feat in features:
    ax = sns.boxplot(data=df1, x=feat)
    plt.show()

### Function for showing observations containing more than 2 outliers

In [None]:
# Detect observations that are outliers in more than `n` features (default: more than 2)

def outlier_hunt(df, n=2):
    """Return the index labels of rows that are outliers in more than ``n`` columns.

    A value counts as an outlier for its column under Tukey's rule: it lies
    below ``Q1 - 1.5 * IQR`` or above ``Q3 + 1.5 * IQR``, where ``Q1``/``Q3``
    are the 25th/75th percentiles of that column and ``IQR = Q3 - Q1``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect. Every column is examined, so all columns must be
        numeric.
    n : int, default 2
        A row is reported only when it is an outlier in strictly more than
        ``n`` columns. The default reproduces the original hard-coded
        "more than 2" rule.

    Returns
    -------
    list
        Index labels of the flagged rows, in order of first appearance
        while scanning the columns left to right.
    """
    outlier_counts = Counter()

    # iterate over features (columns)
    for col in df.columns:
        # 1st and 3rd quartiles of the column
        q1, q3 = np.percentile(df[col], [25, 75])

        # Tukey fence distance: 1.5 times the interquartile range
        outlier_step = 1.5 * (q3 - q1)

        # rows whose value falls outside the fences for this column
        is_outlier = (df[col] < q1 - outlier_step) | (df[col] > q3 + outlier_step)

        # one hit per (row, column) pair that is an outlier
        outlier_counts.update(df.index[is_outlier.values])

    # keep only the rows flagged in more than n columns
    return [idx for idx, hits in outlier_counts.items() if hits > n]

# Count the rows flagged by outlier_hunt on the predictor columns and report it.
n_flagged = len(outlier_hunt(df1[features]))
print('The dataset contains %d observations with more than 2 outliers' % n_flagged)

## 2.3 Check Class Imbalance

In [None]:
# Target: the 'Type' column. Predictors: every remaining column.
y = df1['Type']
X = df1.drop(columns=['Type'])

In [None]:
# Bar chart of the number of samples per class, to check class balance.
# The column is passed explicitly as `x=`: current seaborn releases no longer
# accept the plotted variable as a bare positional argument.
sns.countplot(x=df['Type'])
plt.show()

We can see here that the data is very imbalanced.

## Splitting the Data

In [None]:
# Hold out 25% of the rows as a test set (fixed seed for reproducibility).
# `stratify=y` keeps the class proportions equal in both parts; with strongly
# imbalanced classes a purely random split can leave a rare class with too
# few training rows or no test rows at all.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=1, stratify=y)
print(X_train.shape, "\n", X_test.shape)

As noted above, the data is very imbalanced, so we use a technique called SMOTE (Synthetic Minority Over-sampling Technique) to oversample the minority classes. In this notebook it increased the accuracy by around 10%.
For more on SMOTE check out this wonderful article: https://machinelearningmastery.com/smote-oversampling-for-imbalanced-classification/ 

In [None]:
from imblearn.over_sampling import SMOTE

# Oversample the minority classes of the TRAINING split only; the test split
# is left untouched so it still reflects the original class distribution.
sm = SMOTE(random_state=1)
# `fit_resample` is the supported API; the old `fit_sample` alias was removed
# from imbalanced-learn.
X_train_oversampled, y_train_oversampled = sm.fit_resample(X_train, y_train)
X_train = X_train_oversampled
y_train = y_train_oversampled

In [None]:
# Number of training samples per class, ordered by class label.
pd.Series(y_train).value_counts().sort_index()

In [None]:
# Display the oversampled training features.
X_train_oversampled

In [None]:
# Display the training labels.
y_train

# 3.0 Models

### 3.0.1 Standard Scaling

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn per-feature mean and standard deviation from the training data only,
# then apply the same transformation to both the training and the test data.
sc = StandardScaler()
sc.fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

# 3.1 Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic regression and report mean accuracy on both splits.
lr_clf = LogisticRegression(random_state=0).fit(X_train, y_train)
lr_train_acc = lr_clf.score(X_train, y_train)
lr_test_acc = lr_clf.score(X_test, y_test)
print('Accuracy on training set:', lr_train_acc)
print('Accuracy on test set:', lr_test_acc)

# 3.2 Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB

# Fit a Gaussian naive Bayes classifier and report mean accuracy on both splits.
nb_clf = GaussianNB().fit(X_train, y_train)
nb_train_acc = nb_clf.score(X_train, y_train)
nb_test_acc = nb_clf.score(X_test, y_test)
print('Accuracy on training set:', nb_train_acc)
print('Accuracy on test set:', nb_test_acc)

# 3.3 RandomForest Classifier

### Hyperparameter Tuning

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Randomized hyperparameter search for the random forest.

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start=200, stop=2000, num=10)]
# Number of features to consider at every split.
# 'auto' is no longer accepted by scikit-learn (removed in 1.3); for forest
# classifiers it was an alias of 'sqrt', so it is replaced by 'log2' to keep
# two distinct options in the grid.
max_features = ['sqrt', 'log2']
# Maximum number of levels in tree (None = grow until leaves are pure)
max_depth = [int(x) for x in np.linspace(10, 110, num=11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print(random_grid)

rf = RandomForestClassifier()
# 100 random parameter combinations, each scored with 3-fold cross-validation
# on all available cores.
rf_random = RandomizedSearchCV(estimator=rf, param_distributions=random_grid, n_iter=100, cv=3, verbose=2, random_state=42, n_jobs=-1)
# Fit the random search model
rf_random.fit(X_train, y_train)
rf_random.best_params_

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with fixed hyperparameters.
# max_features='sqrt' replaces the removed 'auto' option; for a forest
# classifier both selected sqrt(n_features) candidates per split.
rf_clf = RandomForestClassifier(random_state=2, n_estimators=1000, min_samples_split=2, min_samples_leaf=1, max_features='sqrt', max_depth=50, bootstrap=False)
rf_clf.fit(X_train, y_train)
print('Accuracy on training set:',rf_clf.score(X_train,y_train))
print('Accuracy on test set:',rf_clf.score(X_test,y_test))

# 3.4 ExtraTree Classifier

### Hyperparameter Tuning

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Randomized hyperparameter search for the extra-trees classifier.
# NOTE: this cell reuses the names `rf`, `rf_random` and `random_grid`, so it
# overwrites the objects created by the random-forest search.

# Number of trees in the extra-trees ensemble
n_estimators = [int(x) for x in np.linspace(start=200, stop=2000, num=10)]
# Number of features to consider at every split.
# 'auto' is no longer accepted by scikit-learn (removed in 1.3); for forest
# classifiers it was an alias of 'sqrt', so it is replaced by 'log2' to keep
# two distinct options in the grid.
max_features = ['sqrt', 'log2']
# Maximum number of levels in tree (None = grow until leaves are pure)
max_depth = [int(x) for x in np.linspace(10, 110, num=11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print(random_grid)

rf = ExtraTreesClassifier()
# 100 random parameter combinations, each scored with 3-fold cross-validation
# on all available cores.
rf_random = RandomizedSearchCV(estimator=rf, param_distributions=random_grid, n_iter=100, cv=3, verbose=2, random_state=42, n_jobs=-1)
# Fit the random search model
rf_random.fit(X_train, y_train)
rf_random.best_params_

In [None]:
from sklearn.ensemble import ExtraTreesClassifier

# Extra-trees classifier with fixed hyperparameters.
# The original cell instantiated RandomForestClassifier here, so this section
# silently re-trained the random forest instead of an extra-trees model.
# max_features='sqrt' replaces the removed 'auto' option (same behaviour for
# forest classifiers).
et_clf = ExtraTreesClassifier(random_state=2, n_estimators=1000, min_samples_split=2, min_samples_leaf=1, max_features='sqrt', max_depth=50, bootstrap=False)
et_clf.fit(X_train, y_train)
print('Accuracy on training set:',et_clf.score(X_train,y_train))
print('Accuracy on test set:',et_clf.score(X_test,y_test))

# 3.5 Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Fit a single decision tree with default settings and report mean accuracy
# on both splits.
dec_clf = DecisionTreeClassifier().fit(X_train, y_train)
dec_train_acc = dec_clf.score(X_train, y_train)
dec_test_acc = dec_clf.score(X_test, y_test)
print('Accuracy on training set', dec_train_acc)
print('Accuracy on test set:', dec_test_acc)

# 3.6 XGBoost

In [None]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

# Recent XGBoost releases require class labels to be the consecutive integers
# 0..n_classes-1 and raise an error otherwise. Encode the labels for fitting
# and map the predictions back, so the reported accuracy is computed on the
# original labels.
xgb_label_encoder = LabelEncoder().fit(y_train)

xgb_clf = XGBClassifier()
xgb_clf.fit(X_train, xgb_label_encoder.transform(y_train))

# predict() already returns class indices, so no rounding is needed.
xgb_train_pred = xgb_label_encoder.inverse_transform(xgb_clf.predict(X_train).astype(int))
xgb_test_pred = xgb_label_encoder.inverse_transform(xgb_clf.predict(X_test).astype(int))
print('Accuracy score on training set: ', accuracy_score(y_true=y_train, y_pred=xgb_train_pred))
print('Accuracy score on testing set: ', accuracy_score(y_true=y_test, y_pred=xgb_test_pred))

# 3.7 Gradient Boosting

### Hyperparameter Tuning

In [None]:
# Randomized hyperparameter search for gradient boosting.

# Number of boosting stages
n_estimators = [int(x) for x in np.linspace(start=200, stop=2000, num=10)]

# Number of features to consider at every split.
# 'auto' is no longer accepted by scikit-learn (removed in 1.3); for
# classifiers it behaved like 'sqrt', so it is replaced by 'log2' to keep two
# distinct options in the grid.
max_features = ['sqrt', 'log2']

# Maximum number of levels in tree (None = grow until leaves are pure)
max_depth = [int(x) for x in np.linspace(10, 110, num=11)]
max_depth.append(None)

# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]

# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]

# Learning rate
learning_rate = [1, 0.5, 0.25, 0.1, 0.05, 0.01]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'learning_rate': learning_rate}
print(random_grid)

gb = GradientBoostingClassifier()
# 100 random parameter combinations, each scored with 3-fold cross-validation
# on all available cores.
gb_random = RandomizedSearchCV(estimator=gb, param_distributions=random_grid, n_iter=100, cv=3, verbose=2, random_state=42, n_jobs=-1)
# Fit the random search model
gb_random.fit(X_train, y_train)
gb_random.best_params_

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score

# Gradient boosting with fixed hyperparameters.
gb_clf = GradientBoostingClassifier(random_state=4, n_estimators=600, min_samples_split=2, min_samples_leaf=4, max_features='sqrt', max_depth=10, learning_rate=0.05)
gb_clf.fit(X_train, y_train)

# predict() returns class labels directly; the previous `.round()` had no
# effect on integer labels and would fail on non-numeric ones.
print('Accuracy score on training set: ', accuracy_score(y_true=y_train, y_pred=gb_clf.predict(X_train)))
print('Accuracy score on testing set: ', accuracy_score(y_true=y_test, y_pred=gb_clf.predict(X_test)))

# 3.8 KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest neighbours with k=5 and the Minkowski metric with p=2.
# These hyperparameters (k, metric, p) can be changed to experiment.
knn_clf = KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2)
knn_clf.fit(X_train, y_train)
knn_train_acc = knn_clf.score(X_train, y_train)
knn_test_acc = knn_clf.score(X_test, y_test)
print('Accuracy on training set:', knn_train_acc)
print('Accuracy on test set:', knn_test_acc)

# 3.9 Kernel SVM 

In [None]:
from sklearn.svm import SVC

# Support vector classifier with an RBF kernel; report mean accuracy on both splits.
kernel_svm_clf = SVC(kernel='rbf', random_state=0).fit(X_train, y_train)
svm_train_acc = kernel_svm_clf.score(X_train, y_train)
svm_test_acc = kernel_svm_clf.score(X_test, y_test)
print('Accuracy on training set:', svm_train_acc)
print('Accuracy on test set:', svm_test_acc)

# 3.10 AdaBoost

In [None]:
from sklearn.metrics import accuracy_score

# AdaBoost with default settings.
ada_clf = AdaBoostClassifier()
ada_clf.fit(X_train, y_train)

# predict() returns class labels directly; the previous `.round()` had no
# effect on integer labels and would fail on non-numeric ones.
print('Accuracy score on training set: ', accuracy_score(y_true=y_train, y_pred=ada_clf.predict(X_train)))
print('Accuracy score on testing set: ', accuracy_score(y_true=y_test, y_pred=ada_clf.predict(X_test)))

# 3.11 Ensembling

# 3.11.1 Gradient and RandomForest

In [None]:
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier

# Hard-voting (majority-vote) ensemble of the gradient boosting and random
# forest classifiers configured earlier in the notebook.
ensemble_members = [('gradient', gb_clf), ('randomforest', rf_clf)]
voting_clf = VotingClassifier(estimators=ensemble_members, voting='hard')
voting_clf.fit(X_train, y_train)

voting_train_acc = voting_clf.score(X_train, y_train)
voting_test_acc = voting_clf.score(X_test, y_test)
print('Accuracy on training set:', voting_train_acc)
print('Accuracy on test set:', voting_test_acc)

# 4.0 Pipeline

> Instead of the usual results table, I tried a different approach: building a pipeline for each of the best models and comparing them with cross-validation.

In [None]:
n_components = 5
n_estimators = 200
seed = 1

# One entry per model: (pipeline name, name of the estimator step, estimator).
# max_features='sqrt' replaces the removed 'auto' option in the random forest
# and extra-trees entries (same behaviour for forest classifiers).
model_specs = [
    ('NB', 'GNB', GaussianNB()),
    ('KNN', 'KNN', KNeighborsClassifier()),
    ('RF', 'RF', RandomForestClassifier(random_state=seed, n_estimators=1000, min_samples_split=2, min_samples_leaf=1, max_features='sqrt', max_depth=50, bootstrap=False)),
    ('AB', 'Ada', AdaBoostClassifier(random_state=seed, n_estimators=n_estimators)),
    ('ET', 'ET', ExtraTreesClassifier(random_state=seed, n_estimators=1000, min_samples_split=2, min_samples_leaf=1, max_features='sqrt', max_depth=50, bootstrap=False)),
    ('GB', 'GB', GradientBoostingClassifier(random_state=4, n_estimators=600, min_samples_split=2, min_samples_leaf=4, max_features='sqrt', max_depth=10, learning_rate=0.05)),
    ('LR', 'LR', LogisticRegression(random_state=seed)),
    ('DT', 'DT', DecisionTreeClassifier(random_state=seed)),
    ('SVC', 'KSVM', SVC(random_state=seed)),
    ('XGB', 'XGB', XGBClassifier(random_state=seed)),
    # Reuses the voting ensemble object built in section 3.11.
    ('GB&RF', 'VC', voting_clf),
]

# Wrap every estimator in its own pipeline that standardizes the features
# first (step 'sc') and then applies the estimator.
pipelines = [
    (name, Pipeline([('sc', StandardScaler()), (step_name, estimator)]))
    for name, step_name, estimator in model_specs
]

from sklearn.preprocessing import LabelEncoder

results, names, times = [], [], []
num_folds = 10
scoring = 'accuracy'

# `random_state` only has an effect when shuffling is enabled; current
# scikit-learn raises a ValueError when it is set without `shuffle=True`.
# With a fixed integer seed the splitter yields the same folds for every
# model, so it is created once outside the loop.
kfold = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=seed)

# Recent XGBoost releases only accept labels 0..n_classes-1. Accuracy does not
# depend on how classes are named, so the encoded labels are used for all models.
y_cv = LabelEncoder().fit_transform(y_train)

# Cross-validate every pipeline on the training data, recording the per-fold
# accuracies and the wall-clock time taken.
for name, model in pipelines:
    start = time()
    cv_results = cross_val_score(model, X_train, y_cv, cv=kfold, scoring=scoring,
                                 n_jobs=-1)
    t_elapsed = time() - start
    results.append(cv_results)
    names.append(name)
    times.append(t_elapsed)
    msg = "%s: %f (+/- %f) performed in %f seconds" % (name, 100*cv_results.mean(),
                                                       100*cv_results.std(), t_elapsed)
    print(msg)


# Box plot of the per-fold cross-validation scores, one box per model,
# labelled with the pipeline names.
fig, ax = plt.subplots(figsize=(12, 8))
fig.suptitle("Algorithms comparison")
ax.boxplot(results)
ax.set_xticklabels(names)
plt.show()