# Project 2

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Sklearn
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA

In [2]:
import os
from pathlib import Path
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
import seaborn as sns  

In [3]:
# Load the header-less data file and name its columns by position.
df = pd.read_csv('data/glass.data', header=None).rename(
    columns=dict(enumerate(
        ['Id', 'RI', 'Na', 'Mg', 'Al', 'Si', 'K', 'Ca', 'Ba', 'Fe', 'Type']
    ))
)

# Turn 'Type' into a categorical and replace its numeric codes with short labels.
# The trailing comments give the long name each abbreviation stands for.
df['Type'] = df['Type'].astype('category').cat.rename_categories({
    1: 'BW-FP',        # building_windows_float_processed
    2: 'BW-NFP',       # building_windows_non_float_processed
    3: 'VW-FP',        # vehicle_windows_float_processed
    4: 'VW-NFP',       # vehicle_windows_non_float_processed
    5: 'containers',
    6: 'tableware',
    7: 'headlamps',
})

# Target variable: pandas Series, NumPy array, and integer-encoded labels.
y = df['Type']
y_np = y.to_numpy()
label_encoder = LabelEncoder()
y_num = label_encoder.fit_transform(y)

# Feature matrices: X holds every numerical column, Xe leaves out 'RI' so that
# only the chemical elements remain; X_np is X as a NumPy array.
X = df.drop(columns=['Id', 'Type'])
Xe = X.drop(columns=['RI'])
X_np = X.to_numpy()

# Standardise column-wise: subtract the column mean, then divide by the
# unbiased standard deviation (ddof=1, i.e. N-1 in the denominator).
Xc = X.sub(X.mean(axis=0))
Xs = Xc.div(Xc.std(axis=0, ddof=1))

# Same centring and scaling, restricted to the chemical elements.
Xec = Xe.sub(Xe.mean(axis=0))
Xes = Xec.div(Xec.std(axis=0, ddof=1))

# Quick look at the first rows of the labelled frame.
print(df.head())

   Id       RI     Na    Mg    Al     Si     K    Ca   Ba   Fe   Type
0   1  1.52101  13.64  4.49  1.10  71.78  0.06  8.75  0.0  0.0  BW-FP
1   2  1.51761  13.89  3.60  1.36  72.73  0.48  7.83  0.0  0.0  BW-FP
2   3  1.51618  13.53  3.55  1.54  72.99  0.39  7.78  0.0  0.0  BW-FP
3   4  1.51766  13.21  3.69  1.29  72.61  0.57  8.22  0.0  0.0  BW-FP
4   5  1.51742  13.27  3.62  1.24  73.08  0.55  8.07  0.0  0.0  BW-FP
