In [2]:
import pandas as pd
import numpy as np
from keras.callbacks import EarlyStopping
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
import tensorflow as tf
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

In [3]:
# Read Train.csv from the Glass_Quality_Participants_Data directory into a DataFrame.
trainData = pd.read_csv('Glass_Quality_Participants_Data/Train.csv')

In [None]:
# Number of columns = 16  Train
# Number of rows = 1358   Train

# Number of columns = 15  Test
# Number of rows = 583   Test

In [None]:
# Columns
# Index(['grade_A_Component_1', 'grade_A_Component_2', 'max_luminosity',
#        'thickness', 'xmin', 'xmax', 'ymin', 'ymax', 'pixel_area', 'log_area',
#        'x_component_1', 'x_component_2', 'x_component_3', 'x_component_4',
#        'x_component_5', 'class'],
#       dtype='object')

In [4]:
# Every column except the last is a model input; the last column is the target.
features, labels = trainData.iloc[:, :-1], trainData.iloc[:, -1]

In [5]:
# Build the (n_samples, 1) label column straight from `trainData` so this cell
# is idempotent. Previously `labels` was reassigned from itself, so running the
# cell a second time failed because a NumPy array has no `.values` attribute.
labels = trainData.iloc[:, -1].to_numpy().reshape(-1, 1)
labels.shape

(1358, 1)

In [6]:
def ohencoder(labels):
    """One-hot encode a two-class label column.

    Parameters
    ----------
    labels : iterable of indexable rows
        Only the first element of each row is inspected.

    Returns
    -------
    numpy.ndarray
        One row per input row: ``[0, 1]`` when the row's first element
        equals 2, ``[1, 0]`` for any other value.
    """
    encoded_rows = []
    for row in labels:
        is_class_two = row[0] == 2
        encoded_rows.append([0, 1] if is_class_two else [1, 0])
    return np.array(encoded_rows)

# One-hot encode the (n, 1) label array: value 2 -> [0, 1], anything else -> [1, 0].
ohencoded_labels = ohencoder(labels)

In [None]:
# Display the shape of the one-hot encoded label array.
ohencoded_labels.shape

In [7]:
# Cast every feature column to float.
features = features.astype('float')

In [None]:
# Print column dtypes and non-null counts for the feature frame.
features.info()

In [None]:
# The x_component_* and grade_A_Component_* columns are categorical variables.
# There are no missing values.

In [None]:
# Plotting import and the IPython magic that renders figures inline.
import matplotlib.pyplot as plt
%matplotlib inline

# Histograms of the xmax, xmin, ymin and ymax columns.
features.hist(column=["xmax", "xmin","ymin","ymax",])

In [None]:
# Histograms of the max_luminosity, pixel_area and log_area columns.
features.hist(column=["max_luminosity","pixel_area","log_area"])

In [None]:
# Pairwise correlation matrix of the feature columns.
features.corr()

In [None]:
# pixel_area and log_area
# xmin and xmax
# ymin and ymax
# Each of the pairs above has a very high positive correlation.

In [8]:
# Remove pixel_area, xmax and ymax from the feature frame.
redundant_columns = ['pixel_area', 'xmax', 'ymax']
features = features.drop(columns=redundant_columns)

In [9]:
# Pull out the five continuous columns that will be standardised.
tfeatures = features[['max_luminosity', 'thickness', 'xmin', 'ymin', 'log_area']]

In [10]:
# Fit the scaler on the numeric columns and standardise them in one step.
# NOTE(review): the scaler is fitted before the train/validation split made
# later in the notebook, so validation rows contribute to its statistics.
scaler = StandardScaler()
tfeatures = scaler.fit_transform(tfeatures)

In [11]:
# Wrap the scaled array back into a DataFrame with the original column names.
tfeatures = pd.DataFrame(
    data=tfeatures,
    columns=['max_luminosity', 'thickness', 'xmin', 'ymin', 'log_area'],
)

In [12]:
# Remove the unscaled numeric columns; their scaled versions live in `tfeatures`.
unscaled_columns = ['max_luminosity', 'thickness', 'xmin', 'ymin', 'log_area']
features = features.drop(columns=unscaled_columns)

In [13]:
# Place the remaining columns and the scaled numeric columns side by side.
normalized_features = pd.concat([features, tfeatures], axis='columns')

In [None]:
# Preview only the first rows instead of dumping the whole frame into the output.
normalized_features.head()

Merging categoricals

In [14]:
from numpy import unique

categorical_columns = ['grade_A_Component_1','grade_A_Component_2','x_component_1','x_component_2','x_component_3','x_component_4','x_component_5']

f = normalized_features.copy()

# Concatenate the seven categorical columns into one string key per row
# (each value is cast to int first, so 1.0 contributes "1").
# This is vectorised by column name: the earlier row-wise lambda used x[0]..x[6],
# i.e. positional integer indexing on a label-indexed Series, which pandas has
# deprecated.
merged_keys = (
    f.loc[:, categorical_columns]
    .astype(int)
    .astype(str)
    .apply(''.join, axis=1)
    .astype('string')
)

# Map every distinct key to an integer code. The fitted `labelencoder` is kept
# so the same mapping can be applied to other data later.
labelencoder = LabelEncoder()
cat_features = pd.DataFrame(
    {'newf': labelencoder.fit_transform(merged_keys)},
    index=f.index,
)

# Number of distinct merged categories.
n_labels = len(unique(cat_features['newf']))
n_labels

11

In [15]:
# The seven categorical columns are now represented by `newf`, so drop them.
merged_source_columns = [
    'grade_A_Component_1',
    'grade_A_Component_2',
    'x_component_1',
    'x_component_2',
    'x_component_3',
    'x_component_4',
    'x_component_5',
]
normalized_features = normalized_features.drop(columns=merged_source_columns)


In [16]:
# Final training frame: scaled numeric columns plus the encoded `newf` column.
norm_cat_features = pd.concat([normalized_features, cat_features], axis='columns')

In [None]:
# Preview only the first rows instead of dumping the whole frame into the output.
norm_cat_features.head()

In [17]:
# Sorted distinct codes of the merged categorical column, and how many there are.
unique_newf = np.unique(norm_cat_features['newf'])
len(unique_newf)

11

In [107]:
from keras.models import Model
# Embedding is imported from the public `keras.layers` namespace; the private
# `keras.layers.embeddings` module does not exist in newer Keras releases.
from keras.layers import Input, Dense, Concatenate, Reshape, Dropout, Embedding

# Parallel lists: model inputs and the tensor each branch produces.
inputs = []
embeddings = []

# Categorical branch: one integer code per sample.
newf_cat = Input(shape=(1,))

# The vocabulary size follows the number of encoded categories (`n_labels`)
# instead of a literal 11, so it cannot drift from the encoder. `input_length`
# is dropped because the Input layer already fixes the sequence length to 1.
embedding = Embedding(n_labels, 6)(newf_cat)
# (batch, 1, 6) -> (batch, 6) so it can be concatenated with the dense branch.
embedding = Reshape(target_shape=(6,))(embedding)
inputs.append(newf_cat)
embeddings.append(embedding)


In [108]:
# Every column other than the encoded categorical `newf` feeds the numeric branch.
othercols = [col for col in norm_cat_features.columns if col != 'newf']
othercols

['max_luminosity', 'thickness', 'xmin', 'ymin', 'log_area']

In [109]:
# Numeric branch. The input width is taken from `othercols` rather than a
# literal 5, so it always matches the columns fed to the model.
input_numeric = Input(shape=(len(othercols),))
# No activation is given, so this Dense layer is a linear projection to 16 units.
embedding_numeric = Dense(16)(input_numeric)
inputs.append(input_numeric)
embeddings.append(embedding_numeric)

In [110]:
# Join the categorical embedding and the numeric projection, then classify
# with three small ReLU layers (dropout after the first two).
x = Concatenate()(embeddings)
x = Dense(15, activation='relu')(x)
x = Dropout(.5)(x)
x = Dense(10, activation='relu')(x)
x = Dropout(.35)(x)
x = Dense(10, activation='relu')(x)
# Two-way softmax matching the two-column one-hot labels.
output = Dense(2, activation='softmax')(x)

model = Model(inputs, output)

# The metric is named by its string identifier instead of a `tf.keras` metric
# object: the layers and Model come from the standalone `keras` package, and
# mixing objects from the two packages is not supported across versions.
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['categorical_accuracy'],
)

In [111]:
my_callbacks = [
    # Stop when validation loss has not improved for 3 consecutive epochs and
    # roll back to the best epoch's weights. Without restore_best_weights the
    # model keeps the weights from the last epoch run, not the best one.
    EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True),
]

In [112]:
# Aliases used by the split below (no copy is made).
np_features, np_labels = norm_cat_features, ohencoded_labels

In [113]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    np_features,
    np_labels,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Shapes of the training and hold-out feature frames.
X_train.shape, X_test.shape

In [114]:
# The model has two inputs, in this order: the encoded categorical column,
# then the numeric columns.
train_input_list = [X_train['newf'].values, X_train[othercols].values]
test_input_list = [X_test['newf'].values, X_test[othercols].values]

# Fitting the model

In [273]:
# Train with early stopping; 4000 is only an upper bound on the epoch count.
# validation_data is passed as an (inputs, targets) tuple, the form Keras
# documents, rather than as a list.
history = model.fit(
    train_input_list,
    y_train,
    epochs=4000,
    batch_size=50,
    callbacks=my_callbacks,
    validation_data=(test_input_list, y_test),
)

Train on 1086 samples, validate on 272 samples
Epoch 1/4000
Epoch 2/4000
Epoch 3/4000
Epoch 4/4000
Epoch 5/4000
Epoch 6/4000
Epoch 7/4000
Epoch 8/4000
Epoch 9/4000
Epoch 10/4000
Epoch 11/4000


# Test-set predictions (submission)

In [274]:
# Read Test.csv from the Glass_Quality_Participants_Data directory into a DataFrame.
testData = pd.read_csv('Glass_Quality_Participants_Data/Test.csv')

In [275]:
# Remove the same three columns that were dropped from the training features.
dropped_columns = ['pixel_area', 'xmax', 'ymax']
testfeatures = testData.drop(columns=dropped_columns)

In [276]:
# Cast every test feature column to float.
testfeatures = testfeatures.astype('float')

In [277]:
numeric_columns = ['max_luminosity','thickness','xmin','ymin','log_area']

# Standardise the numeric test columns with the scaler that was fitted on the
# training data. Fitting a fresh StandardScaler on the test set would scale it
# with its own mean and variance, so the model would receive inputs on a
# different scale from the ones it was trained on.
tfeatures = pd.DataFrame(
    scaler.transform(testfeatures.loc[:, numeric_columns]),
    columns=numeric_columns,
)

# Swap the raw numeric columns for their scaled versions.
testfeatures = testfeatures.drop(columns=numeric_columns)

normalized_features = pd.concat([testfeatures, tfeatures], axis=1)

In [278]:
categorical_columns = ['grade_A_Component_1','grade_A_Component_2','x_component_1','x_component_2','x_component_3','x_component_4','x_component_5']

f = normalized_features.copy()

# Concatenate the seven categorical columns into one string key per row,
# built the same way as for the training data. This is vectorised by column
# name; the earlier lambda used positional x[0]..x[6] on a label-indexed
# Series, which pandas has deprecated.
merged_keys = (
    f.loc[:, categorical_columns]
    .astype(int)
    .astype(str)
    .apply(''.join, axis=1)
    .astype('string')
)

# Encode with the LabelEncoder fitted on the training data. Re-fitting a new
# encoder on the test set would hand out integer codes independently, so the
# same key could point at a different embedding row than it did in training.
# `transform` raises ValueError if the test set contains a key that never
# appeared in training, which is preferable to silently mis-mapping it.
cat_features = pd.DataFrame(
    {'newf': labelencoder.transform(merged_keys)},
    index=f.index,
)

In [279]:
# Replace the seven categorical columns with the single encoded `newf` column.
merged_source_columns = [
    'grade_A_Component_1',
    'grade_A_Component_2',
    'x_component_1',
    'x_component_2',
    'x_component_3',
    'x_component_4',
    'x_component_5',
]
normalized_features = normalized_features.drop(columns=merged_source_columns)
norm_cat_features = pd.concat([normalized_features, cat_features], axis='columns')

In [None]:
# Preview only the first rows instead of dumping the whole frame into the output.
norm_cat_features.head()

In [280]:
# Same two-input layout as used for training: encoded categorical column first,
# numeric columns second.
val_input_list = [
    norm_cat_features['newf'].values,
    norm_cat_features[othercols].values,
]

In [281]:
# Run the trained model on the test inputs; each row holds the two softmax outputs.
testpreds = model.predict(val_input_list)

In [282]:
# Name the two output columns '1' and '2'.
# NOTE(review): ohencoder puts label 2 at index 1 and every other label at
# index 0, so column '1' presumably means class 1 -- confirm the label set.
testpreds_df = pd.DataFrame(testpreds, columns=['1','2'])

In [283]:
# Write the predictions to an Excel file without the index column.
testpreds_df.to_excel('submission8.xlsx', index=False)