In [2]:
import pandas as pd
import numpy as np
from keras.callbacks import EarlyStopping
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
import tensorflow as tf
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

In [3]:
# Load the labelled training CSV into a DataFrame.
TRAIN_CSV_PATH = 'Glass_Quality_Participants_Data/Train.csv'
trainData = pd.read_csv(TRAIN_CSV_PATH)

In [None]:
# Number of columns = 16  Train
# Number of rows = 1358   Train

# Number of columns = 15  Test
# Number of rows = 583   Test

In [None]:
# Columns
# Index(['grade_A_Component_1', 'grade_A_Component_2', 'max_luminosity',
#        'thickness', 'xmin', 'xmax', 'ymin', 'ymax', 'pixel_area', 'log_area',
#        'x_component_1', 'x_component_2', 'x_component_3', 'x_component_4',
#        'x_component_5', 'class'],
#       dtype='object')

In [4]:
# Positional split: the last column is the target, every column before it is an input.
n_input_columns = trainData.shape[1] - 1
features = trainData.iloc[:, :n_input_columns]
labels = trainData.iloc[:, n_input_columns]

In [5]:
# Turn the label Series into an (n_samples, 1) column vector; ohencoder reads row[0].
labels = labels.to_numpy().reshape(len(labels), 1)
labels.shape

(1358, 1)

In [6]:
def ohencoder(labels):
    """One-hot encode a column of binary class labels.

    Parameters
    ----------
    labels : iterable of indexable rows
        Each row's first element is the class value.

    Returns
    -------
    numpy.ndarray
        One row per label: ``[0, 1]`` when the class value equals 2,
        ``[1, 0]`` for any other value.
    """
    encoded_rows = []
    for row in labels:
        if row[0] == 2:
            encoded_rows.append([0, 1])
        else:
            encoded_rows.append([1, 0])
    return np.array(encoded_rows)

# Encode the label column vector as two-column one-hot rows.
ohencoded_labels = ohencoder(labels)

In [None]:
# Sanity check: one two-element row per sample is expected.
ohencoded_labels.shape

In [7]:
# Cast every input column to floating point.
features = features.astype(float)

In [None]:
# Print the per-column dtype and non-null summary of the inputs.
features.info()

In [None]:
# The x_component_* and grade_A_Component_* columns are categorical variables
# No missing values

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

features.hist(column=["xmax", "xmin","ymin","ymax",])

In [None]:
# Histograms of the luminosity and the two area columns.
area_columns = ["max_luminosity", "pixel_area", "log_area"]
features.hist(column=area_columns)

In [None]:
# Pairwise correlation matrix of the input columns.
features.corr()

In [None]:
# pixel_area and log_area
# xmin and xmax
# ymin and ymax
# The above pairs have a very high degree of positive correlation

In [8]:
# Remove pixel_area, xmax and ymax from the inputs.
# Not idempotent: re-running this cell raises KeyError because the columns are already gone.
features = features.drop(['pixel_area', 'xmax', 'ymax'], axis=1)

In [9]:
# Pull out the five columns that will be standardised.
tfeatures = features[['max_luminosity', 'thickness', 'xmin', 'ymin', 'log_area']]

In [10]:
# Standardise the selected columns and keep the fitted scaler in `scaler`.
# NOTE(review): the statistics are computed on the full training frame, i.e. before the
# train_test_split further down, so the hold-out rows influence the scaling.
scaler = StandardScaler()
tfeatures = scaler.fit_transform(tfeatures)

In [11]:
# Wrap the scaled array back into a DataFrame with the original column names.
scaled_column_names = ['max_luminosity', 'thickness', 'xmin', 'ymin', 'log_area']
tfeatures = pd.DataFrame(data=tfeatures, columns=scaled_column_names)

In [12]:
# Drop the unscaled versions of the standardised columns, leaving only the remaining inputs.
features = features.drop(['max_luminosity', 'thickness', 'xmin', 'ymin', 'log_area'], axis=1)

In [13]:
# Column-wise join: untouched inputs first, standardised inputs last.
frames_to_join = [features, tfeatures]
normalized_features = pd.concat(frames_to_join, axis='columns')

In [None]:
# Sanity check of the assembled input matrix (rows, columns).
normalized_features.shape

In [None]:
# Correlation matrix of the inputs after dropping and standardising columns.
normalized_features.corr()

In [14]:
# Plain NumPy inputs and one-hot targets for model training.
np_features, np_labels = normalized_features.to_numpy(), ohencoded_labels

In [15]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows; the fixed random_state makes the split repeatable.
split_parts = train_test_split(np_features, np_labels, random_state=42, test_size=0.2)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Shapes of the training and hold-out input matrices.
X_train.shape, X_test.shape

In [16]:
from keras import models, layers, optimizers

# Small feed-forward classifier with a two-way softmax output matching the one-hot labels.
# The input width is taken from the training matrix instead of a hard-coded 12, so the
# model stays consistent if the set of input columns changes.
n_input_features = X_train.shape[1]

model = models.Sequential()
model.add(layers.Dense(10, activation='relu', input_dim=n_input_features))
model.add(layers.Dropout(0.5))
model.add(layers.Dense(5, activation='relu'))
model.add(layers.Dense(8, activation='relu'))
model.add(layers.Dense(2, activation='softmax'))

# `optimizers.Adam` is the canonical class name (the lowercase `adam` alias is not
# available in current Keras). The learning rate is passed positionally because the
# keyword was renamed from `lr` to `learning_rate` between Keras versions.
# The metric is given by name so no tf.keras object is mixed into a standalone-keras model.
model.compile(optimizer=optimizers.Adam(1e-3),
              loss='categorical_crossentropy',
              metrics=['categorical_accuracy'])

In [17]:
# Stop when the validation loss has not improved for 5 consecutive epochs.
# restore_best_weights=True rolls the model back to the best epoch; without it the model
# keeps the weights from the last (i.e. a worse) epoch.
my_callbacks = [
    EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True),
]

In [197]:
# Train with early stopping. validation_data is passed as the documented (x, y) tuple;
# a list can be interpreted as multiple inputs by some Keras versions.
# NOTE(review): the hold-out split also drives early stopping here, so metrics computed
# on X_test afterwards are optimistic.
history = model.fit(
    X_train, y_train,
    epochs=400,
    batch_size=20,
    callbacks=my_callbacks,
    validation_data=(X_test, y_test),
)

Train on 1086 samples, validate on 272 samples
Epoch 1/400
Epoch 2/400
Epoch 3/400
Epoch 4/400
Epoch 5/400
Epoch 6/400
Epoch 7/400
Epoch 8/400
Epoch 9/400
Epoch 10/400
Epoch 11/400
Epoch 12/400
Epoch 13/400
Epoch 14/400


In [None]:
# Softmax outputs of the model for the hold-out rows (one row per sample, one column per class).
ypreds = model.predict(X_test)

In [None]:
# Display the raw hold-out predictions.
ypreds

In [None]:
# Tabulate the hold-out predictions with one column per class, named '1' and '2'.
class_column_names = ['1', '2']
samplesubmission = pd.DataFrame(data=ypreds, columns=class_column_names)

In [None]:
# Display the tabulated hold-out predictions.
samplesubmission

In [None]:
def ontocat(arr):
    """Convert rows of per-class scores (or one-hot rows) into class indices.

    The index of the largest entry in each row is returned, so both exact
    one-hot rows (``[1, 0]`` -> 0, ``[0, 1]`` -> 1) and softmax probabilities
    (``[0.9, 0.1]`` -> 0) are handled. Comparing the first entry to exactly 1
    would send almost every probability row to class 1.

    Parameters
    ----------
    arr : array-like of shape (n_samples, n_classes)

    Returns
    -------
    numpy.ndarray
        Integer class index per row; an empty integer array for empty input.
    """
    scores = np.asarray(arr)
    if scores.size == 0:
        # np.argmax raises on empty input; return an empty result instead.
        return np.array([], dtype=int)
    return np.argmax(scores, axis=1)

# Collapse the hold-out predictions and the one-hot hold-out targets to class indices.
catypreds = ontocat(ypreds)
catytest = ontocat(y_test)

# Validation

In [198]:
# Load the test CSV into a DataFrame.
TEST_CSV_PATH = 'Glass_Quality_Participants_Data/Test.csv'
testData = pd.read_csv(TEST_CSV_PATH)

In [199]:
# Drop the same three columns that were removed from the training inputs.
testfeatures = testData.drop(['pixel_area', 'xmax', 'ymax'], axis=1)

In [200]:
# Cast every test input column to floating point, as was done for the training inputs.
testfeatures = testfeatures.astype(float)

In [201]:
# Apply the training-time preprocessing to the test inputs.
scaled_columns = ['max_luminosity', 'thickness', 'xmin', 'ymin', 'log_area']

tfeatures = testfeatures.loc[:, scaled_columns]

# Reuse the scaler that was fitted on the training features. Fitting a new StandardScaler
# on the test set would standardise it with different means/variances than the data the
# model was trained on, silently shifting every scaled input.
tfeatures = scaler.transform(tfeatures)

# Keep the test frame's index so the column-wise concat below aligns row for row.
tfeatures = pd.DataFrame(tfeatures, columns=scaled_columns, index=testfeatures.index)

testfeatures = testfeatures.drop(columns=scaled_columns)

# Same column order as in training: untouched inputs first, standardised inputs last.
normalized_features = pd.concat([testfeatures, tfeatures], axis=1)

In [202]:
# Plain NumPy matrix of the preprocessed test inputs.
test_input_frame = normalized_features
np_testfeatures = test_input_frame.to_numpy()

In [203]:
# Softmax outputs of the trained model for the test rows.
testpreds = model.predict(np_testfeatures)

In [204]:
# Tabulate the test predictions with one column per class, named '1' and '2'.
submission_columns = ['1', '2']
testpreds_df = pd.DataFrame(data=testpreds, columns=submission_columns)

In [205]:
# Write the submission spreadsheet without the DataFrame index column.
testpreds_df.to_excel(excel_writer='submission6.xlsx', index=False)