In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import log_loss
from sklearn.model_selection import StratifiedKFold, cross_val_score, KFold
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Load the glass dataset and separate the features from the class label.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
GLASS_CSV = "C:/Users/Administrator.DAI-PC2/Desktop/ML/Day1/Glass.csv"
df = pd.read_csv(GLASS_CSV)

# Feature matrix: every column except the "Type" label.
X = df.drop(columns="Type")

# Integer-encode the string class labels; `le` is kept so the mapping can be inspected.
le = LabelEncoder()
y = le.fit_transform(df["Type"])

# Display the class names learned by the encoder.
le.classes_

array(['building_windows_float_processed',
       'building_windows_non_float_processed', 'containers', 'headlamps',
       'tableware', 'vehicle_windows_float_processed'], dtype=object)

In [3]:
# Inspect the loaded frame (features plus the "Type" label column).
df

Unnamed: 0,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe,Type
0,1.52101,13.64,4.49,1.10,71.78,0.06,8.75,0.00,0.0,building_windows_float_processed
1,1.51761,13.89,3.60,1.36,72.73,0.48,7.83,0.00,0.0,building_windows_float_processed
2,1.51618,13.53,3.55,1.54,72.99,0.39,7.78,0.00,0.0,building_windows_float_processed
3,1.51766,13.21,3.69,1.29,72.61,0.57,8.22,0.00,0.0,building_windows_float_processed
4,1.51742,13.27,3.62,1.24,73.08,0.55,8.07,0.00,0.0,building_windows_float_processed
...,...,...,...,...,...,...,...,...,...,...
209,1.51623,14.14,0.00,2.88,72.61,0.08,9.18,1.06,0.0,headlamps
210,1.51685,14.92,0.00,1.99,73.06,0.00,8.40,1.59,0.0,headlamps
211,1.52065,14.36,0.00,2.02,73.42,0.00,8.44,1.64,0.0,headlamps
212,1.51651,14.38,0.00,1.94,73.61,0.00,8.48,1.57,0.0,headlamps


In [14]:
# Hold out 30% of the rows for testing; stratify on y so class proportions are kept.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=24,
)

# Shuffled, stratified 5-fold splitter with a fixed seed for the cross-validated searches.
kfold = StratifiedKFold(n_splits=5, random_state=24, shuffle=True)

In [21]:
# Baseline: PCA followed by logistic regression, evaluated on the hold-out split.
# The 'SCL' step is a None placeholder (no scaling here) so that a grid search
# can later swap a scaler into that slot.

# Candidate scalers, estimators and the PCA transformer used by the pipelines.
std_scaler = StandardScaler().set_output(transform='pandas')
std_mm = MinMaxScaler()
# Seeded so random-forest results are reproducible between runs, consistent
# with the random_state=24 used for the train/test split and the CV folds.
rf = RandomForestClassifier(random_state=24)
lr = LogisticRegression()
prcomp = PCA().set_output(transform='pandas')

pipe = Pipeline([('SCL', None), ('PCA', prcomp), ('LR', lr)])
pipe.fit(X_train, y_train)

# Cumulative percentage of variance explained by the principal components.
print(np.cumsum(prcomp.explained_variance_ratio_ * 100))


# Hold-out accuracy from the hard class predictions.
y_pred = pipe.predict(X_test)
print(accuracy_score(y_test, y_pred))

# Hold-out log loss from the predicted class probabilities.
y_pred_prob = pipe.predict_proba(X_test)
print(log_loss(y_test, y_pred_prob))

[ 50.55869745  75.52242703  85.91797801  95.04984585  98.29043046
  99.82989809  99.97558486  99.99998716 100.        ]
0.5538461538461539
1.1635077214587315


In [18]:
# Search space for the PCA + logistic-regression pipeline: scaler choice,
# number of principal components, regularisation strength and multiclass scheme.
# NOTE(review): `multi_class` is deprecated in recent scikit-learn releases —
# confirm against the installed version.
params = {
    'SCL': [std_scaler, std_mm, None],
    'PCA__n_components': np.arange(4, 9),
    'LR__C': np.linspace(0.001, 3, 5),
    'LR__multi_class': ['ovr', 'multinomial'],
}

# Cross-validated search on the full data, scored by negative log loss.
gcv = GridSearchCV(
    estimator=pipe,
    param_grid=params,
    scoring='neg_log_loss',
    cv=kfold,
)
gcv.fit(X, y)

print(gcv.best_score_)
print(gcv.best_params_)

-0.9563637738159887
{'LR__C': 0.75075, 'LR__multi_class': 'multinomial', 'PCA__n_components': 8, 'SCL': StandardScaler()}


In [22]:
# Baseline: same unscaled PCA front end, with a random forest as the classifier.
pipe2 = Pipeline(steps=[('SCL', None), ('PCA', prcomp), ('RF', rf)])
pipe2.fit(X_train, y_train)

# Cumulative percentage of variance explained by the principal components.
print(np.cumsum(100 * prcomp.explained_variance_ratio_))

# Predict labels and class probabilities on the hold-out split,
# then report accuracy followed by log loss.
y_pred = pipe2.predict(X_test)
y_pred_prob = pipe2.predict_proba(X_test)
print(accuracy_score(y_test, y_pred))
print(log_loss(y_test, y_pred_prob))

[ 50.55869745  75.52242703  85.91797801  95.04984585  98.29043046
  99.82989809  99.97558486  99.99998716 100.        ]
0.6923076923076923
0.7812803407875275


In [23]:
# Search space for the PCA + random-forest pipeline: scaler choice and
# number of principal components to keep.
params = {
    'SCL': [std_scaler, std_mm, None],
    'PCA__n_components': np.arange(4, 9),
}

# Cross-validated search on the full data, scored by negative log loss.
gcv = GridSearchCV(
    estimator=pipe2,
    param_grid=params,
    scoring='neg_log_loss',
    cv=kfold,
)
gcv.fit(X, y)

print(gcv.best_score_)
print(gcv.best_params_)

-0.7048909011597477
{'PCA__n_components': 8, 'SCL': StandardScaler()}
