### Dataset Information

    RI: refractive index
    Na: Sodium (unit measurement: weight percent in corresponding oxide, as are attributes 4-10)
    Mg: Magnesium
    Al: Aluminum
    Si: Silicon
    K: Potassium
    Ca: Calcium
    Ba: Barium
    Fe: Iron
    
    Type of glass: 
        1 building_windows_float_processed
        2 building_windows_non_float_processed
        3 vehicle_windows_float_processed
        4 vehicle_windows_non_float_processed (none in this database)
        5 containers
        6 tableware
        7 headlamps

### Importing Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
# Show every column when a DataFrame is displayed instead of truncating wide frames.
pd.set_option('display.max_columns', None)
# Apply matplotlib's 'ggplot' style sheet to all figures created below.
plt.style.use('ggplot')

### Reading the data

In [None]:
# Load the glass dataset from the relative input directory into a DataFrame.
glass = pd.read_csv("../input/glass/glass.csv")
# Preview the first rows (rendered as the cell output).
glass.head()

### Features in data

In [None]:
# Names of the feature columns; iterated by the per-feature plots and tests below.
cols = ['RI', 'Na', 'Mg', 'Al', 'Si', 'K', 'Ca', 'Ba', 'Fe']
# Name of the class-label column, kept as a one-element list.
target = ["Type"]

### Dataset Info

In [None]:
# Print column dtypes, non-null counts and memory usage.
glass.info()

### Statistical Summary

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
glass.describe()

### Target Countplot

In [None]:
# Bar chart of the number of samples per glass type (class-balance check).
# The column is passed by keyword: recent seaborn releases interpret the first
# positional argument as `data`, not as the x variable.
sns.countplot(x="Type", data=glass)
plt.show()

### Checking distribution of the features

In [None]:
# Histogram of every feature with a fitted normal curve overlaid. The legend
# shows the sample skewness of the feature.
# NOTE: sns.distplot is deprecated in newer seaborn releases.
fig, ax = plt.subplots(3, 3, figsize=(16, 12))
ax = ax.flatten()
for i, col in enumerate(cols):
    skew = glass[col].skew()
    sns.distplot(
        glass[col],
        ax=ax[i],
        fit=stats.norm,
        kde=False,
        label='Skew = %.3f' %(skew),
    )
    ax[i].legend(loc='best')
plt.show()

    None of the features are normally distributed and some have outliers

    Note: Outlier treatment may be done to check its impact on classification

### Univariate Box Plot

In [None]:
# Box plot of every column except the last one on a shared axis.
# NOTE(review): assumes the last column of the CSV is the Type label — confirm.
glass.iloc[:,:-1].boxplot(figsize=(12,6))
plt.show()

### Observations:
    - Silicon is the main component of Glass making more than 70% of composition
    - Combined Silicon, Sodium and Calcium make up around 90%
    - Iron is the least important component

Above box plot confirms the outliers

    I prefer to build models without outlier treatment. Treating outliers can improve model performance in many cases,
    but it also changes the information in the data, which might misrepresent real/practical situations

### Bivariate Box plots

In [None]:
# One box plot per feature, split by glass type, laid out on a 3x3 grid.
fig, ax = plt.subplots(3, 3, figsize=(16, 12))
ax = ax.flatten()
for i, col in enumerate(cols):
    # x and y are passed by keyword: seaborn >= 0.12 accepts only `data`
    # positionally, so the positional form raises a TypeError there.
    sns.boxplot(x="Type", y=col, data=glass, ax=ax[i])
    # Use the legend box to show which feature this panel displays.
    ax[i].legend([col], loc='best')
plt.tight_layout()
plt.show()

### Inferences
    - Refractive index lies between 1.51 and 1.54
    - Type 6 and 7 have higher Na %
    - Type 1,2 and 3 have higher Mg %
    - Type 5 and 7 have higher Al %
    - Si % is similar in all types
    - Type 6 has no K composition
    - Type 5 and 6 have higher Ca composition
    - Ba is mostly used in Type 7
    - Fe is used in Type 1,2 and 3

### Scatter Matrix

In [None]:
# Scatter-plot matrix of all columns but the last; points are coloured by the
# values of the last column.
pd.plotting.scatter_matrix(glass.iloc[:,:-1], c=glass.iloc[:,-1], figsize=(20, 20), marker='o')
# NOTE(review): plt.legend acts on the current axes only, so these labels may
# not correspond to the colour coding of the points — verify the rendered legend.
plt.legend(glass["Type"].unique())
plt.show()

### Pairplot

In [None]:
# Pairwise scatter plots of all columns, coloured by glass type, with
# histograms on the diagonal.
sns.pairplot(glass, hue='Type', diag_kind='hist')
plt.show()

### Correlation Plot

In [None]:
# Pairwise correlations between all columns, target included.
corr = glass.corr()
# Hide the upper triangle and the diagonal: the matrix is symmetric, so the
# lower triangle already shows every pair once.
mask = np.triu(np.ones_like(corr, dtype=bool))
plt.figure(figsize=(8, 6))
sns.heatmap(corr, mask=mask, annot=True, fmt='.2f', cmap='YlGnBu')
plt.show()

    K and Ca have no correlation with Type, which means their values may be high for some types and low for others, causing a cancelling effect

### Feature Engineering - Based on the mean of K and Ca in classes

In [None]:
# Mean Ca content per glass type; used to choose the Ca threshold below.
glass.groupby("Type")["Ca"].mean()

In [None]:
# Mean K content per glass type; used to choose the K thresholds below.
glass.groupby("Type")["K"].mean()

In [None]:
# Binary indicator features: 1 when the threshold condition holds, 0 otherwise.
glass["Ca_morethan9"] = (glass["Ca"] > 9).astype(int)
glass["K_morethandot7"] = (glass["K"] > 0.7).astype(int)
glass["K_lessthandot4"] = (glass["K"] < 0.4).astype(int)

In [None]:
# Register the engineered indicator columns so later per-feature loops include them.
cols.extend(["Ca_morethan9", "K_morethandot7", "K_lessthandot4"])

### Statistical Importance Check for Variable

In [None]:
import statsmodels.api as sm
import statsmodels.stats as sms

# For every feature, fit an OLS model of the feature on Type and print the
# p-value reported for the Type term in the ANOVA table.
# NOTE(review): Type enters the formula as-is; if it is stored as integer codes
# it is treated as a numeric regressor. Use C(Type) if a one-way ANOVA across
# the glass classes is intended.
for col in cols:
    data = sm.formula.ols(col+"~ Type", data=glass).fit()
    # The first row of the ANOVA table is the Type term. Select it by position
    # with .iloc: the table is indexed by term name, and plain [0] on a
    # label-indexed Series is deprecated in recent pandas.
    pval = sms.anova.anova_lm(data)["PR(>F)"].iloc[0]
    print(f"Pval for {col}: {pval}")

#### K and Ca are not significant, but the new variables we have created are significant. KUDOS!!!

### Data Preprocessing & Evaluation Functions

In [None]:
# Random seed shared by the models and the cross-validation splitter below.
seed = 1

from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.model_selection import RandomizedSearchCV, cross_val_score, StratifiedKFold
from sklearn.metrics import classification_report, roc_auc_score, roc_curve
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier,\
                            BaggingClassifier,VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline, Pipeline

In [None]:
# split the data into train and test
def split_data(X, Y, seed=1, train_size=0.7):
    """Split features and labels into stratified train/test sets and scale them.

    Parameters
    ----------
    X : DataFrame of features.
    Y : class labels; also used to stratify the split.
    seed : random state passed to the splitter.
    train_size : fraction of the rows assigned to the training set.

    Returns
    -------
    tuple
        ``(xtrain, xtest, ytrain, ytest)`` where the two feature frames have
        been passed through ``preprocess``.
    """
    # Imported locally because the file's import cell does not bring
    # train_test_split into scope; without it this function raises NameError.
    from sklearn.model_selection import train_test_split

    xtrain, xtest, ytrain, ytest = train_test_split(
        X, Y, train_size=train_size, random_state=seed, stratify=Y
    )
    xtrain, xtest = preprocess(xtrain, xtest)
    return (xtrain, xtest, ytrain, ytest)

# preprocess the data for training
def preprocess(x1, x2=None):
    """Standardise feature frames with a ``StandardScaler`` fitted on ``x1``.

    Parameters
    ----------
    x1 : DataFrame used to fit the scaler; it is then transformed.
    x2 : optional second DataFrame, transformed with the scaler fitted on
        ``x1`` (it is never used for fitting).

    Returns
    -------
    DataFrame or tuple of DataFrame
        The scaled ``x1`` alone when ``x2`` is None, otherwise
        ``(scaled_x1, scaled_x2)``. Column names and the row index of each
        input are kept.
    """
    sc = StandardScaler()
    # Keep the original row index so the scaled rows stay aligned with their
    # labels after a shuffled split.
    x1_scaled = pd.DataFrame(sc.fit_transform(x1), columns=x1.columns, index=x1.index)
    if x2 is None:
        return x1_scaled
    x2_scaled = pd.DataFrame(sc.transform(x2), columns=x2.columns, index=x2.index)
    return (x1_scaled, x2_scaled)

# for model evaluation and training
def eval_model(model, X, Y, seed=1):
    """Fit ``model`` on a train split and print train/test metrics.

    Parameters
    ----------
    model : estimator exposing ``fit``, ``predict`` and ``predict_proba``.
    X : DataFrame of features.
    Y : class labels.
    seed : random state forwarded to ``split_data``.

    Returns
    -------
    None
        The one-vs-rest ROC AUC and the classification report are printed for
        both the training and the test split.
    """
    # Forward the seed: previously it was accepted but ignored, so every call
    # used split_data's default split.
    xtrain, xtest, ytrain, ytest = split_data(X, Y, seed=seed)
    model.fit(xtrain, ytrain)

    trainpred = model.predict(xtrain)
    trainpred_prob = model.predict_proba(xtrain)
    testpred = model.predict(xtest)
    testpred_prob = model.predict_proba(xtest)

    print("Train ROC AUC : %.4f"%roc_auc_score(ytrain, trainpred_prob, multi_class='ovr'))
    print("\nTrain classification report\n",classification_report(ytrain, trainpred))

    # TODO: make a bar chart showing, for each class, which other classes its
    # misclassified samples were assigned to.

    print("\nTest ROC AUC : %.4f"%roc_auc_score(ytest, testpred_prob, multi_class='ovr'))
    print("\nTest classification report\n",classification_report(ytest, testpred))
    
def plot_importance(columns, importance):
    """Show a bar chart with one bar per entry of ``columns``, sized by ``importance``."""
    plt.bar(x=columns, height=importance)
    plt.show()

### Separating the X and Y data

In [None]:
# Target vector and feature matrix (everything except the label column).
Y = glass["Type"]
X = glass.drop(columns=["Type"])
# Standardised copy of the features used for cross-validation below.
X_sc = preprocess(X)

### Creating array of models

In [None]:
# Candidate classifiers, all seeded where the estimator accepts a random state.
model_logr = LogisticRegression(random_state=seed, n_jobs=-1)
model_nb = GaussianNB()
model_dt = DecisionTreeClassifier(random_state=seed)
model_dt_bag = BaggingClassifier(model_dt, random_state=seed, n_jobs=-1)
model_ada = AdaBoostClassifier(random_state=seed)
model_gbc = GradientBoostingClassifier(random_state=seed)
model_rf = RandomForestClassifier(random_state=seed, n_jobs=-1)
model_xgb = XGBClassifier(random_state=seed)
model_lgbm = LGBMClassifier(random_state=seed, n_jobs=-1)
model_knn = KNeighborsClassifier(n_jobs=-1)

# (short name, estimator) pairs in the order they are evaluated and plotted.
models = [
    ('LR', model_logr),
    ('NB', model_nb),
    ('DT', model_dt),
    ('Bag', model_dt_bag),
    ('Ada', model_ada),
    ('GBC', model_gbc),
    ('RF', model_rf),
    ('XGB', model_xgb),
    ('LGBM', model_lgbm),
    ('KNN', model_knn),
]

### Running the algorithms

In [None]:
# 5-fold stratified cross-validation, shuffled with a fixed seed so the folds
# are reproducible and shared by every model.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)

results = []
names = []

# Score each candidate model with weighted F1 and keep the per-fold scores.
for name, model in models:
    scores = cross_val_score(model, X_sc, Y, scoring='f1_weighted', cv=cv, n_jobs=-1)
    mean_f1 = scores.mean()
    std = scores.std()
    # The metric is weighted F1 (see `scoring`), so it is labelled as such
    # rather than as ROC.
    print(f"{name} : Mean F1 (weighted) {mean_f1} STD:({std})")
    results.append(scores)
    names.append(name)

### Comparison of Models

In [None]:
# Box plot of the per-fold scores collected in `results`, one box per model.
fig, ax = plt.subplots(figsize=(12,6))
ax.boxplot(results)
# Label the boxes with the model short names, in the same order as `results`.
ax.set_xticklabels(names)
plt.show()

### Only Significant Variables

In [None]:
# Same setup as before, but without the raw K and Ca columns.
Y = glass["Type"]
X = glass.drop(columns=["Type", "K", "Ca"])
X_sc = preprocess(X)

In [None]:
results = []
names = []

# Score each candidate model with weighted F1 on the reduced feature set.
for name, model in models:
    scores = cross_val_score(model, X_sc, Y, scoring='f1_weighted', cv=cv, n_jobs=-1)
    mean_f1 = scores.mean()
    std = scores.std()
    # The metric is weighted F1 (see `scoring`), so it is labelled as such
    # rather than as ROC.
    print(f"{name} : Mean F1 (weighted) {mean_f1} STD:({std})")
    results.append(scores)
    names.append(name)

In [None]:
# Box plot of the per-fold scores collected in `results`, one box per model.
fig, ax = plt.subplots(figsize=(12,6))
ax.boxplot(results)
# Label the boxes with the model short names, in the same order as `results`.
ax.set_xticklabels(names)
plt.show()

<H3>The Random Forest model has the best performance, so we can work on it further and tune it to improve performance.
Model tuning can be performed using RandomizedSearchCV or Bayesian Optimization, which I will add later.</H3>

# Application of PCA
<b>PCA is a statistical method which can help identify the pattern in data and also help in dimensionality reduction. However, there is a misconception that PCA reduces the set of variables but this is not the case.
PCA transforms the variables into a new coordinate system with variables accounting for the maximum variance in the data.</b>
<p><b> Also remember that PCA requires standardized data as the variable scale can heavily impact the transformation and end up with total garbage</b></p>

<H3>A small tip: eigenvalues > 0.7 indicate strong variable importance</H3>

In [None]:
from sklearn.decomposition import PCA

# Standardise all features (label column excluded) before running PCA.
X = glass.drop(["Type"], axis=1)
X_std = preprocess(X)
pca = PCA(n_components=None)
# None means that we are selecting all the principal components. Once again, WE ARE NOT DROPPING ANY VARIABLES.
pca.fit(X_std)

In [None]:
# The eigenvalues: variance explained by each principal component, in
# component order (rendered as the cell output).
pca.explained_variance_

In [None]:
# Share of total variance explained by each component and its running total.
var_exp = pca.explained_variance_ratio_
cum_var = np.cumsum(var_exp)
print("Cummulative variance:\n", cum_var)

# Components are numbered from 1 on the x axis.
component_ids = range(1, len(var_exp)+1)
plt.plot(component_ids, cum_var, color='r', marker='^', label="Cummulative Variance")
plt.bar(component_ids, var_exp, color='r', label="Individual Variance")
plt.legend(loc='best')
plt.title("PCA components vs Variance Explained")
plt.show()

First two components represent almost 50% variance in the data.

The first 8 components represent more than 95% variance in the data.

In [None]:
# Project the standardised features onto the first 8 principal components.
pca8 = PCA(n_components = 8)
X_pca = pd.DataFrame(pca8.fit_transform(X_std))
# Preview the transformed data (rendered as the cell output).
X_pca.head()

In [None]:
results = []
names = []

# Score each candidate model with weighted F1 on the PCA-transformed features.
for name, model in models:
    scores = cross_val_score(model, X_pca, Y, scoring='f1_weighted', cv=cv, n_jobs=-1)
    mean_f1 = scores.mean()
    std = scores.std()
    # The metric is weighted F1 (see `scoring`), so it is labelled as such
    # rather than as ROC.
    print(f"{name} : Mean F1 (weighted) {mean_f1} STD:({std})")
    results.append(scores)
    names.append(name)

In [None]:
# Box plot of the per-fold scores collected in `results`, one box per model.
fig, ax = plt.subplots(figsize=(12,6))
ax.boxplot(results)
# Label the boxes with the model short names, in the same order as `results`.
ax.set_xticklabels(names)
plt.show()

In [None]:
# Project the standardised features onto the first 10 principal components.
# NOTE: despite the variable name `pca9`, this model keeps 10 components.
pca9 = PCA(n_components = 10)
X_pca = pd.DataFrame(pca9.fit_transform(X_std))
# Preview the transformed data (rendered as the cell output).
X_pca.head()

In [None]:
results = []
names = []

# Score each candidate model with weighted F1 on the PCA-transformed features.
for name, model in models:
    scores = cross_val_score(model, X_pca, Y, scoring='f1_weighted', cv=cv, n_jobs=-1)
    mean_f1 = scores.mean()
    std = scores.std()
    # The metric is weighted F1 (see `scoring`), so it is labelled as such
    # rather than as ROC.
    print(f"{name} : Mean F1 (weighted) {mean_f1} STD:({std})")
    results.append(scores)
    names.append(name)

In [None]:
# Box plot of the per-fold scores collected in `results`, one box per model.
fig, ax = plt.subplots(figsize=(12,6))
ax.boxplot(results)
# Label the boxes with the model short names, in the same order as `results`.
ax.set_xticklabels(names)
plt.show()

In [None]:
# A float n_components between 0 and 1 asks scikit-learn's PCA to keep as many
# components as needed to explain at least that fraction of the variance (99%).
# NOTE: this rebinds `pca9`, replacing the model created in the earlier cell.
pca9 = PCA(n_components = 0.99)
X_pca = pd.DataFrame(pca9.fit_transform(X_std))
# Preview the transformed data (rendered as the cell output).
X_pca.head()

In [None]:
results = []
names = []

# Score each candidate model with weighted F1 on the PCA-transformed features.
for name, model in models:
    scores = cross_val_score(model, X_pca, Y, scoring='f1_weighted', cv=cv, n_jobs=-1)
    mean_f1 = scores.mean()
    std = scores.std()
    # The metric is weighted F1 (see `scoring`), so it is labelled as such
    # rather than as ROC.
    print(f"{name} : Mean F1 (weighted) {mean_f1} STD:({std})")
    results.append(scores)
    names.append(name)

In [None]:
# Box plot of the per-fold scores collected in `results`, one box per model.
fig, ax = plt.subplots(figsize=(12,6))
ax.boxplot(results)
# Label the boxes with the model short names, in the same order as `results`.
ax.set_xticklabels(names)
plt.show()

### Thank You for viewing my Kernel, if you like it or have any suggestions you are welcome. Also please upvote the Kernel.