# Glass Identification Project
Id number: 1 to 214
RI: refractive index
Na: Sodium (unit measurement: weight percent in corresponding oxide, as are attributes 4-10)
Mg: Magnesium
Al: Aluminum
Si: Silicon
K: Potassium
Ca: Calcium
Ba: Barium
Fe: Iron
Type of glass (class attribute):
- 1: building_windows_float_processed
- 2: building_windows_non_float_processed
- 3: vehicle_windows_float_processed
- 4: vehicle_windows_non_float_processed (none in this database)
- 5: containers
- 6: tableware
- 7: headlamps

In [58]:
import warnings
warnings.filterwarnings('ignore')

In [63]:
!pip install -q huggingface_hub

In [64]:
!pip install -q joblib

In [65]:
!pip install ucimlrepo

Collecting ucimlrepo
  Downloading ucimlrepo-0.0.3-py3-none-any.whl (7.0 kB)
Installing collected packages: ucimlrepo
Successfully installed ucimlrepo-0.0.3


In [68]:
from ucimlrepo import fetch_ucirepo 
  
# Fetch the dataset with id=42 through ucimlrepo
# (presumably the UCI Glass Identification dataset named in the notebook title).
glass_identification = fetch_ucirepo(id=42) 
  
# Data (as pandas DataFrames): X holds the feature columns, Y the target column
X = glass_identification.data.features 
Y = glass_identification.data.targets 
  
# metadata 
#print(glass_identification.metadata) 
  
# Print the per-variable information table (name, role, type, description, units, missing values)
print(glass_identification.variables) 

             name     role         type demographic       description  \
0       Id_number       ID      Integer        None              None   
1              RI  Feature   Continuous        None  refractive index   
2              Na  Feature   Continuous        None            Sodium   
3              Mg  Feature   Continuous        None         Magnesium   
4              Al  Feature   Continuous        None          Aluminum   
5              Si  Feature   Continuous        None           Silicon   
6               K  Feature   Continuous        None         Potassium   
7              Ca  Feature   Continuous        None           Calcium   
8              Ba  Feature   Continuous        None            Barium   
9              Fe  Feature   Continuous        None              Iron   
10  Type_of_glass   Target  Categorical        None              None   

                                    units missing_values  
0                                    None             no  
1    

In [69]:
X.head()

Unnamed: 0,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe
0,1.52101,13.64,4.49,1.1,71.78,0.06,8.75,0.0,0.0
1,1.51761,13.89,3.6,1.36,72.73,0.48,7.83,0.0,0.0
2,1.51618,13.53,3.55,1.54,72.99,0.39,7.78,0.0,0.0
3,1.51766,13.21,3.69,1.29,72.61,0.57,8.22,0.0,0.0
4,1.51742,13.27,3.62,1.24,73.08,0.55,8.07,0.0,0.0


In [70]:
Y.head()

Unnamed: 0,Type_of_glass
0,1
1,1
2,1
3,1
4,1


In [71]:
df = X.copy()
df.head()

Unnamed: 0,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe
0,1.52101,13.64,4.49,1.1,71.78,0.06,8.75,0.0,0.0
1,1.51761,13.89,3.6,1.36,72.73,0.48,7.83,0.0,0.0
2,1.51618,13.53,3.55,1.54,72.99,0.39,7.78,0.0,0.0
3,1.51766,13.21,3.69,1.29,72.61,0.57,8.22,0.0,0.0
4,1.51742,13.27,3.62,1.24,73.08,0.55,8.07,0.0,0.0


In [72]:
df.columns.values

array(['RI', 'Na', 'Mg', 'Al', 'Si', 'K', 'Ca', 'Ba', 'Fe'], dtype=object)

In [73]:
len(df.columns.values)

9

In [74]:
# The distinct class labels present in the target column
Y['Type_of_glass'].unique()

array([1, 2, 3, 5, 6, 7], dtype=int64)

In [75]:
Y['Type_of_glass'].value_counts() / len(Y) * 100


2    35.514019
1    32.710280
7    13.551402
3     7.943925
5     6.074766
6     4.205607
Name: Type_of_glass, dtype: float64

# Creating a model
Splitting the data into training and test sets

In [76]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set; random_state is fixed at 65 for this split.
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=65) # 20% for testing

# Display the shapes of the training and test feature sets
x_train.shape, x_test.shape

((171, 9), (43, 9))

# The XGBoost algorithm - an ensemble algorithm based on decision trees

In [77]:
# Install
!pip install -q xgboost

In [78]:
import xgboost as xgb

In [79]:
# Show the distinct class labels present in the training targets.
# Iterating a DataFrame yields its column labels, not its values, so the
# labels are read from the 'Type_of_glass' column explicitly; print() is used
# because a notebook cell only displays its last expression.
print(sorted(y_train['Type_of_glass'].unique().tolist()))
y_train.head()

Unnamed: 0,Type_of_glass
208,7
17,1
92,2
165,5
196,7


In [84]:
def converte_clasficacao_XGBoost_para_classe(classificacao_glass_XGBoost):
    """Convert an XGBoost class index back to the original glass class label.

    The mapping is 0->1, 1->2, 2->3, 3->5, 4->6, 5->7 (there is no entry for
    glass class 4).

    Args:
        classificacao_glass_XGBoost: class index, expected to be one of 0..5.

    Returns:
        The glass class label that corresponds to the index.

    Raises:
        ValueError: if the index is not one of 0..5. Previously this case
            failed with an UnboundLocalError on the return statement.
    """
    # Position in this tuple is the XGBoost index; the value is the glass label.
    classes_glass = (1, 2, 3, 5, 6, 7)

    # Compare with == (rather than indexing or hashing) so that every value the
    # original if/elif chain accepted, e.g. NumPy integers, keeps working.
    for indice_xgboost, classificacao_glass in enumerate(classes_glass):
        if classificacao_glass_XGBoost == indice_xgboost:
            return classificacao_glass

    raise ValueError(
        f"Unknown XGBoost class index: {classificacao_glass_XGBoost!r} "
        "(expected one of 0..5)"
    )

In [83]:
def converte_clasficacao_classe_para_XGBoost(classificacao_glass):
    """Map a glass class label to its XGBoost class index.

    Labels 1, 2, 3, 5, 6, 7 become 0, 1, 2, 3, 4, 5 respectively. Any other
    value is returned unchanged.
    """
    # (glass label, XGBoost index) pairs, tested in this order with ==.
    pares = ((1, 0), (2, 1), (3, 2), (5, 3), (6, 4), (7, 5))

    for classe_original, indice_xgboost in pares:
        if classificacao_glass == classe_original:
            return indice_xgboost

    # No pair matched: hand the input back as-is.
    return classificacao_glass

In [85]:
labels = [1, 2, 3, 5, 6, 7]
print(labels)

[1, 2, 3, 5, 6, 7]


In [86]:
labels_XGBoost = [0, 1, 2, 3, 4, 5]
labels_XGBoost

[0, 1, 2, 3, 4, 5]

In [88]:
# Print the XGBoost index assigned to each glass class label
for classe in labels:
    indice_xgboost = converte_clasficacao_classe_para_XGBoost(classe)
    print(f'classe glass: {classe} - Classe XGBoost: {indice_xgboost}')

classe glass: 1 - Classe XGBoost: 0
classe glass: 2 - Classe XGBoost: 1
classe glass: 3 - Classe XGBoost: 2
classe glass: 5 - Classe XGBoost: 3
classe glass: 6 - Classe XGBoost: 4
classe glass: 7 - Classe XGBoost: 5


# Preparing the data for the XGBoost algorithm - converting/transforming the data

In [89]:
# Encode every training target as its XGBoost class index.
# Iterating y_train.values yields one entry per training sample.
y = [converte_clasficacao_classe_para_XGBoost(classe) for classe in y_train.values]

# Quick look at the first five encoded targets
print(y[:5])

[5, 0, 1, 3, 5]


In [90]:
set(y)

{0, 1, 2, 3, 4, 5}

# Training the model

In [91]:

%%time
# Step size handed to XGBoost through its native `eta` parameter name.
# NOTE(review): 1.1 is above the [0, 1] range the XGBoost parameter docs give for eta — confirm this is intended.
learning_rate = 1.1   
# Build a classifier with 200 boosting rounds and fit it on the raw feature
# array and the re-encoded targets `y`.
xgb_classifier = xgb.XGBClassifier(eta = learning_rate , n_estimators=200)
xgb_classifier.fit(x_train.values, y)
# Last expression of the cell: displays the fitted estimator
xgb_classifier

CPU times: total: 1.33 s
Wall time: 415 ms


XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eta=1.1, eval_metric=None,
              feature_types=None, gamma=None, grow_policy=None,
              importance_type=None, interaction_constraints=None,
              learning_rate=None, max_bin=None, max_cat_threshold=None,
              max_cat_to_onehot=None, max_delta_step=None, max_depth=None,
              max_leaves=None, min_child_weight=None, missing=nan,
              monotone_constraints=None, multi_strategy=None, n_estimators=200,
              n_jobs=None, num_parallel_tree=None, ...)

# Model Data

In [96]:
# Bind the fitted classifier to `model`, the name the next cell uses
# (model.get_params()); `modela` stays bound too for backward compatibility.
model = modela = xgb_classifier

In [97]:
model.get_params()

{'objective': 'multi:softprob',
 'base_score': None,
 'booster': None,
 'callbacks': None,
 'colsample_bylevel': None,
 'colsample_bynode': None,
 'colsample_bytree': None,
 'device': None,
 'early_stopping_rounds': None,
 'enable_categorical': False,
 'eval_metric': None,
 'feature_types': None,
 'gamma': None,
 'grow_policy': None,
 'importance_type': None,
 'interaction_constraints': None,
 'learning_rate': None,
 'max_bin': None,
 'max_cat_threshold': None,
 'max_cat_to_onehot': None,
 'max_delta_step': None,
 'max_depth': None,
 'max_leaves': None,
 'min_child_weight': None,
 'missing': nan,
 'monotone_constraints': None,
 'multi_strategy': None,
 'n_estimators': 200,
 'n_jobs': None,
 'num_parallel_tree': None,
 'random_state': None,
 'reg_alpha': None,
 'reg_lambda': None,
 'sampling_method': None,
 'scale_pos_weight': None,
 'subsample': None,
 'tree_method': None,
 'validate_parameters': None,
 'verbosity': None,
 'eta': 1.1}

In [122]:
import pandas as pd
import numpy as np
import seaborn as sb
import matplotlib.pyplot as plt

In [60]:
df=pd.read_csv('glass.csv')

In [21]:
df.head()

Unnamed: 0,RI,Na,Mg,AI,Si,K,Ca,Ba,Fe,Type
0,1.52101,13.64,4.49,1.1,71.78,0.06,8.75,0.0,0.0,1
1,1.51761,13.89,3.6,1.36,72.73,0.48,7.83,0.0,0.0,1
2,1.51618,13.53,3.55,1.54,72.99,0.39,7.78,0.0,0.0,1
3,1.51766,13.21,3.69,1.29,72.61,0.57,8.22,0.0,0.0,1
4,1.51742,13.27,3.62,1.24,73.08,0.55,8.07,0.0,0.0,1


In [22]:
df.tail()

Unnamed: 0,RI,Na,Mg,AI,Si,K,Ca,Ba,Fe,Type
209,1.51623,14.14,0.0,2.88,72.61,0.08,9.18,1.06,0.0,7
210,1.51685,14.92,0.0,1.99,73.06,0.0,8.4,1.59,0.0,7
211,1.52065,14.36,0.0,2.02,73.42,0.0,8.44,1.64,0.0,7
212,1.51651,14.38,0.0,1.94,73.61,0.0,8.48,1.57,0.0,7
213,1.51711,14.23,0.0,2.08,73.36,0.0,8.62,1.67,0.0,7


In [26]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 214 entries, 0 to 213
Data columns (total 9 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   RI      214 non-null    float64
 1   Na      214 non-null    float64
 2   Mg      214 non-null    float64
 3   AI      214 non-null    float64
 4   Si      214 non-null    float64
 5   K       214 non-null    float64
 6   Ca      214 non-null    float64
 7   Ba      214 non-null    float64
 8   Fe      214 non-null    float64
dtypes: float64(9)
memory usage: 15.2 KB


In [14]:
def normalize(df):
    """Return a min-max scaled copy of *df*, column by column.

    Each column is rescaled to the [0, 1] interval with
    (value - min) / (max - min). The input frame is not modified.

    Args:
        df: pandas DataFrame whose columns are all numeric.

    Returns:
        A new DataFrame with the same index and columns as *df*. A column
        whose values are all equal (max == min) is set to 0.0, since the
        plain formula would divide 0 by 0.
    """
    result = df.copy()
    # `columns` (plural) is the DataFrame attribute that holds the column labels.
    for feature_name in df.columns:
        min_value = df[feature_name].min()
        value_range = df[feature_name].max() - min_value
        if value_range == 0:
            # Constant column: avoid 0/0 and map every value to 0.0.
            result[feature_name] = 0.0
        else:
            result[feature_name] = (df[feature_name] - min_value) / value_range
    return result

In [27]:
df.describe()

Unnamed: 0,RI,Na,Mg,AI,Si,K,Ca,Ba,Fe
count,214.0,214.0,214.0,214.0,214.0,214.0,214.0,214.0,214.0
mean,1.518365,13.40785,2.684533,1.444907,72.650935,0.497056,8.956963,0.175047,0.057009
std,0.003037,0.816604,1.442408,0.49927,0.774546,0.652192,1.423153,0.497219,0.097439
min,1.51115,10.73,0.0,0.29,69.81,0.0,5.43,0.0,0.0
25%,1.516522,12.9075,2.115,1.19,72.28,0.1225,8.24,0.0,0.0
50%,1.51768,13.3,3.48,1.36,72.79,0.555,8.6,0.0,0.0
75%,1.519157,13.825,3.6,1.63,73.0875,0.61,9.1725,0.0,0.1
max,1.53393,17.38,4.49,3.5,75.41,6.21,16.19,3.15,0.51


In [31]:
df.head()

Unnamed: 0,RI,Na,Mg,AI,Si,K,Ca,Ba,Fe
0,1.52101,13.64,4.49,1.1,71.78,0.06,8.75,0.0,0.0
1,1.51761,13.89,3.6,1.36,72.73,0.48,7.83,0.0,0.0
2,1.51618,13.53,3.55,1.54,72.99,0.39,7.78,0.0,0.0
3,1.51766,13.21,3.69,1.29,72.61,0.57,8.22,0.0,0.0
4,1.51742,13.27,3.62,1.24,73.08,0.55,8.07,0.0,0.0


In [32]:
# Store `label` in the frame as a 'Type' column.
# NOTE(review): `label` is not defined in any cell visible here — confirm where it comes from
# (presumably the class labels, e.g. Y['Type_of_glass']).
df['Type']=label

In [33]:
df.head(1)

Unnamed: 0,RI,Na,Mg,AI,Si,K,Ca,Ba,Fe,Type
0,1.52101,13.64,4.49,1.1,71.78,0.06,8.75,0.0,0.0,1


In [35]:
from scipy import stats

# Keep only the rows in which every column has an absolute z-score below 3
abs_z_scores = np.abs(stats.zscore(df))
within_3_sigma = (abs_z_scores < 3).all(axis=1)
df = df[within_3_sigma]

In [36]:
len(df)

194

In [100]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [109]:
from sklearn.model_selection import train_test_split

In [123]:
# Load the glass dataset from the local CSV file into a single pandas DataFrame
# (presumably the Kaggle copy of the dataset) and preview the first rows.
glass_df = pd.read_csv("glass.csv")
glass_df.head()

Unnamed: 0,RI,Na,Mg,AI,Si,K,Ca,Ba,Fe,Type
0,1.52101,13.64,4.49,1.1,71.78,0.06,8.75,0.0,0.0,1
1,1.51761,13.89,3.6,1.36,72.73,0.48,7.83,0.0,0.0,1
2,1.51618,13.53,3.55,1.54,72.99,0.39,7.78,0.0,0.0,1
3,1.51766,13.21,3.69,1.29,72.61,0.57,8.22,0.0,0.0,1
4,1.51742,13.27,3.62,1.24,73.08,0.55,8.07,0.0,0.0,1
