In [1]:
#%matplotlib inline
import matplotlib.pyplot as plt
import warnings
warnings.simplefilter('ignore')
import numpy as np
import pandas as pd

In [2]:
diamonds = pd.read_csv('diamonds.csv')
diamonds.head()

Unnamed: 0.1,Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,1,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,2,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,3,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,4,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,5,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [3]:
diamonds["price"].max()

18823

In [4]:
diamonds["price"].min()

326

In [5]:
diamonds["price"]

0         326
1         326
2         327
3         334
4         335
         ... 
53935    2757
53936    2757
53937    2757
53938    2757
53939    2757
Name: price, Length: 53940, dtype: int64

## Create Bins for the Diamond Prices

In [6]:
# Create bins in which to place values based upon Diamond Price
bins = [0, 499, 999, 2499, 4999, 7499, 9999, 14999, 19999]

In [7]:
label_groups = [1,2,3,4,5,6,7,8]

In [8]:
# Create labels for these bins
group_labels = ["0-500", "500-1000", "1000-2500", "2500-5000", "5000-7500", "7500-10000", "1000-15000", "15000-20000"]

In [9]:
# Slice the data and place it into bins
diamonds["bins"] = pd.cut(diamonds["price"], bins, labels=label_groups)
diamonds.head()

Unnamed: 0.1,Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z,bins
0,1,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43,1
1,2,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31,1
2,3,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31,1
3,4,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63,1
4,5,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75,1


In [10]:
#.values.reshape(-1,1)

## Stratify the Dataframe

In [11]:
from sklearn.model_selection import StratifiedShuffleSplit

split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

for train_index, test_index in split.split(diamonds, diamonds["bins"]):
    strat_train_set = diamonds.loc[train_index]
    strat_test_set = diamonds.loc[test_index]

strat_test_set.count()

Unnamed: 0    10788
carat         10788
cut           10788
color         10788
clarity       10788
depth         10788
table         10788
price         10788
x             10788
y             10788
z             10788
bins          10788
dtype: int64

In [18]:
data = [strat_train_set, strat_test_set]
data = pd.concat(data)
data

Unnamed: 0.1,Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z,bins
37140,37141,0.40,Very Good,F,VVS2,62.9,55.0,972,4.72,4.75,2.98,2
38148,38149,0.50,Very Good,I,VS1,63.1,56.0,1013,5.07,5.01,3.18,3
34466,34467,0.38,Ideal,F,VS1,62.0,54.0,865,4.65,4.68,2.89,2
14746,14747,1.14,Ideal,H,VS2,61.6,56.0,5937,6.72,6.68,4.13,5
10291,10292,1.00,Premium,D,SI2,61.6,60.0,4758,6.37,6.33,3.91,4
...,...,...,...,...,...,...,...,...,...,...,...,...
17609,17610,1.02,Ideal,F,VS1,60.7,56.0,7091,6.53,6.61,3.99,5
37819,37820,0.32,Ideal,F,VVS2,61.3,56.0,1002,4.42,4.45,2.72,3
11822,11823,1.12,Good,H,SI1,59.8,61.0,5094,6.72,6.85,4.06,5
6315,6316,1.00,Good,E,SI2,64.0,54.0,4026,6.31,6.26,4.01,4


In [23]:
X = data[["carat", "cut", "color", "clarity"]]
y = data["bins"]
print(X.shape, y.shape)

(53940, 4) (53940,)


## Dummy Encoding (Binary Encoded Data)

In [24]:
data = X.copy()

data_binary_encoded = pd.get_dummies(data, columns=["cut", "color", "clarity"])
data_binary_encoded.head()

Unnamed: 0,carat,cut_Fair,cut_Good,cut_Ideal,cut_Premium,cut_Very Good,color_D,color_E,color_F,color_G,...,color_I,color_J,clarity_I1,clarity_IF,clarity_SI1,clarity_SI2,clarity_VS1,clarity_VS2,clarity_VVS1,clarity_VVS2
37140,0.4,0,0,0,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
38148,0.5,0,0,0,0,1,0,0,0,0,...,1,0,0,0,0,0,1,0,0,0
34466,0.38,0,0,1,0,0,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0
14746,1.14,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
10291,1.0,0,0,0,1,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0


## Scaling and Normalization

In [25]:
#Normalization

from sklearn.model_selection import train_test_split

X = data_binary_encoded

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

X_train.head()

Unnamed: 0,carat,cut_Fair,cut_Good,cut_Ideal,cut_Premium,cut_Very Good,color_D,color_E,color_F,color_G,...,color_I,color_J,clarity_I1,clarity_IF,clarity_SI1,clarity_SI2,clarity_VS1,clarity_VS2,clarity_VVS1,clarity_VVS2
27234,1.64,0,0,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
31867,0.3,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
16980,1.04,0,0,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
778,0.83,0,0,0,1,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
5731,1.09,0,0,0,0,1,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [26]:
from sklearn.preprocessing import StandardScaler

# Create a StandardScater model and fit it to the training data
X_scaler = StandardScaler().fit(X_train)

In [27]:
# Transform the training and testing data using the X_scaler

X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [28]:
from tensorflow.keras.utils import to_categorical

In [29]:
# One-hot encoding
y_train_categorical = to_categorical(y_train)
y_test_categorical = to_categorical(y_test)

In [None]:
nodes = [21,42,63, 1]

In [None]:
hidden_layers = [0,1,2]

In [None]:
epochs_count = [20,40,60]

In [None]:
train_data_node_count = []
train_data_accuracy = []
train_data_loss = []
train_data_epochs = []

test_data_node_count = []
test_data_accuracy = []
test_data_loss = []
test_data_epochs = []


In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
model = Sequential()

k = 0

for i in range(3):
    count = nodes[i]
    
    if i == 0:
        print("No Hidden Layers: \n")
        for j in range(3):
            current_epochs_count = epochs_count[j]

            print(f"Epochs count: {current_epochs_count} \n")
            for i in range(len(nodes)):
                print(f"Model: {k}")
                #create 4 models with varying node sizes 21,42,63,126
                model.add(Dense(units=count, activation='relu', input_dim=21))
                model.add(Dense(units=9, activation='softmax'))
                #print(model.summary())

                # Compile the model
                model.compile(optimizer='adam',
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])

                # Fit the model to the training data
                model.fit(
                    X_train_scaled,
                    y_train_categorical,
                    epochs=current_epochs_count,
                    shuffle=True,
                    verbose=0
                )

                # Print the number of nodes for this model
                print(f"Number of Nodes: {nodes[i]} \n")

                # Print the training data accuracy
                model_loss, model_accuracy = model.evaluate(X_train_scaled, y_train_categorical, verbose=2)
                print(f"Normal Neural Network - Loss: {model_loss}, Train Data Accuracy: {model_accuracy}  \n")

                # Append aquired data to lists
                train_data_node_count.append(nodes[i])
                train_data_accuracy.append(model_accuracy)
                train_data_loss.append(model_loss)
                train_data_epochs.append(current_epochs_count)

                # Print the test data accuracy
                model_loss, model_accuracy = model.evaluate(X_test_scaled, y_test_categorical, verbose=2)
                print(f"Normal Neural Network - Loss: {model_loss}, Test Data Accuracy: {model_accuracy} \n")

                # Append aquired data to lists
                test_data_node_count.append(nodes[i])
                test_data_accuracy.append(model_accuracy)
                test_data_loss.append(model_loss)
                test_data_epochs.append(current_epochs_count)
                
                # Save the Model
                model.save(f"Models/diamond_model{k}_trained.h5")
                k += 1
            
            
    if i == 1:
        print("One Hidden Layer: \n")
        for j in range(3):
            current_epochs_count = epochs_count[j]

            print(f"Epochs count: {current_epochs_count} \n")
            for i in range(len(nodes)):
                print(f"Model: {k}")
                model.add(Dense(units=count, activation='relu', input_dim=21))
                model.add(Dense(units=count, activation='relu'))
                model.add(Dense(units=9, activation='softmax'))
                #print(model.summary())

                # Compile the model
                model.compile(optimizer='adam',
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])

                # Fit the model to the training data
                model.fit(
                    X_train_scaled,
                    y_train_categorical,
                    epochs=current_epochs_count,
                    shuffle=True,
                    verbose=0
                )

                # Print the number of nodes for this model
                print(f"Number of Nodes: {nodes[i]} \n")

                # Print the training data accuracy
                model_loss, model_accuracy = model.evaluate(X_train_scaled, y_train_categorical, verbose=2)
                print(f"Normal Neural Network - Loss: {model_loss}, Train Data Accuracy: {model_accuracy}  \n")

                # Append aquired data to lists
                train_data_node_count.append(nodes[i])
                train_data_accuracy.append(model_accuracy)
                train_data_loss.append(model_loss)
                train_data_epochs.append(current_epochs_count)

                # Print the test data accuracy
                model_loss, model_accuracy = model.evaluate(X_test_scaled, y_test_categorical, verbose=2)
                print(f"Normal Neural Network - Loss: {model_loss}, Test Data Accuracy: {model_accuracy} \n")

                # Append aquired data to lists
                test_data_node_count.append(nodes[i])
                test_data_accuracy.append(model_accuracy)
                test_data_loss.append(model_loss)
                test_data_epochs.append(current_epochs_count)
                
                # Save the Model
                model.save(f"Models/diamond_model{k}_trained.h5")
                k += 1
    
    if i == 2:
        print("Two Hidden Layers: \n")
        for j in range(3):
            current_epochs_count = epochs_count[j]

            print(f"Epochs count: {current_epochs_count} \n")
            for i in range(len(nodes)):
                print(f"Model: {k}")

                model.add(Dense(units=count, activation='relu', input_dim=21))
                model.add(Dense(units=count, activation='relu'))
                model.add(Dense(units=count, activation='relu'))
                model.add(Dense(units=9, activation='softmax'))
                #print(model.summary())

                # Compile the model
                model.compile(optimizer='adam',
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])

                # Fit the model to the training data
                model.fit(
                    X_train_scaled,
                    y_train_categorical,
                    epochs=current_epochs_count,
                    shuffle=True,
                    verbose=0
                )

                # Print the number of nodes for this model
                print(f"Number of Nodes: {nodes[i]} \n")

                # Print the training data accuracy
                model_loss, model_accuracy = model.evaluate(X_train_scaled, y_train_categorical, verbose=2)
                print(f"Deep Learning - Loss: {model_loss}, Train Data Accuracy: {model_accuracy}  \n")

                # Append aquired data to lists
                train_data_node_count.append(nodes[i])
                train_data_accuracy.append(model_accuracy)
                train_data_loss.append(model_loss)
                train_data_epochs.append(current_epochs_count)

                # Print the test data accuracy
                model_loss, model_accuracy = model.evaluate(X_test_scaled, y_test_categorical, verbose=2)
                print(f"Deep Learning - Loss: {model_loss}, Test Data Accuracy: {model_accuracy} \n")

                # Append aquired data to lists
                test_data_node_count.append(nodes[i])
                test_data_accuracy.append(model_accuracy)
                test_data_loss.append(model_loss)
                test_data_epochs.append(current_epochs_count)
                
                # Save the Model
                model.save(f"Models/diamond_model{k}_trained.h5")
                k += 1
                
# Print the lists
print(f"Train Data Node Count: {train_data_node_count} \n")
print(f"Train Data Accuracy: {train_data_accuracy} \n")
print(f"Train Data Data Loss: {train_data_loss} \n")
print(f"Train Data Epochs Count: {train_data_epochs} \n")

print(f"Test Data Node Count: {test_data_node_count} \n")
print(f"Test Data Accuracy: {test_data_accuracy} \n")
print(f"Test Data Data Loss: {test_data_loss} \n")
print(f"Test Data Epochs Count: {test_data_epochs} \n")

In [None]:
# Print the lists
print(f"Train Data Node Count: {train_data_node_count} \n")
print(f"Train Data Accuracy: {train_data_accuracy} \n")
print(f"Train Data Data Loss: {train_data_loss} \n")
print(f"Train Data Epochs Count: {train_data_epochs} \n")
print(f"Test Data Node Count: {test_data_node_count} \n")
print(f"Test Data Accuracy: {test_data_accuracy} \n")
print(f"Test Data Data Loss: {test_data_loss} \n")
print(f"Test Data Epochs Count: {test_data_epochs} \n")

In [None]:
hidden_layers = [0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2,2,2,2]

In [None]:
total_nodes = [21, 42, 63, 126, 21, 42, 63, 126, 21, 42, 63, 126, 42, 84, 126, 252, 42, 84, 126, 252, 42, 84, 126, 252,63, 126, 189, 378, 63, 126, 189, 378, 63, 126, 189, 378]

In [None]:
train_data_node_count = [21, 42, 63, 126, 21, 42, 63, 126, 21, 42, 63, 126, 21, 42, 63, 126, 21, 42, 63, 126, 21, 42, 63, 126, 21, 42, 63, 126, 21, 42, 63, 126, 21, 42, 63, 126] 

train_data_accuracy = [0.8646397, 0.8661476, 0.8684959, 0.8629341, 0.8625634, 0.86758125, 0.8682981, 0.8683475, 0.86530715, 0.8593993, 0.86852056, 0.85000616, 0.82489187, 0.79886293, 0.84194785, 0.7897417, 0.86281055, 0.84459275, 0.5981708, 0.24367817, 0.24367817, 0.24367817, 0.2353232, 0.24367817, 0.2353232, 0.24367817, 0.24367817, 0.24367817, 0.24367817, 0.2353232, 0.24367817, 0.24367817, 0.24367817, 0.24367817, 0.24367817, 0.24367817] 

train_data_loss = [0.32638762755706924, 0.31687921332187347, 0.31224977897560474, 0.3193319055106168, 0.31694140674790616, 0.30918416789665076, 0.3105929220544881, 0.310247789906595, 0.3143123491543242, 0.3322642105668249, 0.320961675899843, 0.49307033180704285, 0.4448682196604986, 0.49764072974832313, 0.3564974833272942, 0.5004219041215726, 0.33888261037532336, 0.3538104200430197, 0.8435268622189631, 1.8319895689455927, 1.831906346460898, 1.8318545534177286, 1.8322802179514766, 1.8320177968567048, 1.832375104102315, 1.8321261641976565, 1.8329056692954386, 1.8325805363232532, 1.8320385142348632, 1.832089532336616, 1.8320831213491306, 1.8320261195185983, 1.8319670956857625, 1.832040193434213, 1.8320767065220778, 1.83210375088871] 

train_data_epochs = [20, 20, 20, 20, 40, 40, 40, 40, 60, 60, 60, 60, 20, 20, 20, 20, 40, 40, 40, 40, 60, 60, 60, 60, 20, 20, 20, 20, 40, 40, 40, 40, 60, 60, 60, 60] 

test_data_node_count = [21, 42, 63, 126, 21, 42, 63, 126, 21, 42, 63, 126, 21, 42, 63, 126, 21, 42, 63, 126, 21, 42, 63, 126, 21, 42, 63, 126, 21, 42, 63, 126, 21, 42, 63, 126] 

test_data_accuracy = [0.86481273, 0.86384875, 0.86637, 0.86414534, 0.86154985, 0.8660734, 0.867779, 0.8684464, 0.8595477, 0.8577679, 0.86414534, 0.84553206, 0.8263997, 0.79814607, 0.8428624, 0.7896181, 0.86036336, 0.8461253, 0.59058213, 0.23604004, 0.23604004, 0.23604004, 0.24100854, 0.23604004, 0.24100854, 0.23604004, 0.23604004, 0.23604004, 0.23604004, 0.24100854, 0.23604004, 0.23604004, 0.23604004, 0.23604004, 0.23604004, 0.23604004] 

test_data_loss = [0.3279775432051665, 0.32547561949078224, 0.318984039432259, 0.3284233139760856, 0.32702030120047987, 0.31973137202685437, 0.3247508165568478, 0.3256604062304216, 0.332950400086213, 0.3432109690371115, 0.3394853159039384, 0.5329931833189773, 0.446375084939956, 0.5037969000656869, 0.36398571616064235, 0.49918112209606136, 0.3489040634788581, 0.3637865602638618, 0.849710601062124, 1.8268041317048318, 1.8267153571082524, 1.826545627381477, 1.8262304564604372, 1.8271198802936683, 1.8263489943236655, 1.8269844997534719, 1.8274736148143282, 1.8276972956510664, 1.8262701239195143, 1.82635777277729, 1.8267180785188686, 1.827012778318587, 1.8268577996209059, 1.8266934674803663, 1.8269574710692482, 1.8263713490048028] 

test_data_epochs = [20, 20, 20, 20, 40, 40, 40, 40, 60, 60, 60, 60, 20, 20, 20, 20, 40, 40, 40, 40, 60, 60, 60, 60, 20, 20, 20, 20, 40, 40, 40, 40, 60, 60, 60, 60] 

In [None]:
trained_model_dataframe = pd.DataFrame({
    "Hidden Layers":hidden_layers,
    "Epochs Count":train_data_epochs,
    "Total Node Count":total_nodes,
    "Train Data Node Count":train_data_node_count,
    "Train Data Loss":train_data_loss,
    "Train Data Accuracy":train_data_accuracy,
    "Test Data Loss":test_data_loss,
    "Test Data Accuracy":test_data_accuracy
    #"Test Data Node Count":test_data_node_count,
    #"Test Data Epochs Count":test_data_epochs
})
trained_model_dataframe

In [None]:
np.corrcoef([total_nodes, hidden_layers, train_data_epochs], y=train_data_accuracy, rowvar=True, bias=False, ddof=None)

In [None]:
np.corrcoef([total_nodes, hidden_layers, train_data_epochs], y=test_data_accuracy, rowvar=True, bias=False, ddof=None)

In [None]:
np.corrcoef(hidden_layers, y=test_data_accuracy, rowvar=True, bias=False, ddof=None)

In [None]:
trained_model

In [None]:
# first, create a normal neural network with 2 inputs, 6 hidden nodes, and 2 outputs
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

model = Sequential()
model.add(Dense(units=63, activation='relu', input_dim=21))
model.add(Dense(units=63, activation='relu'))
model.add(Dense(units=63, activation='relu'))
model.add(Dense(units=9, activation='softmax'))

In [None]:
model.summary()

In [None]:
# Compile the model
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

In [None]:
# Fit the model to the training data
model.fit(
    X_train_scaled,
    y_train_categorical,
    epochs=100,
    shuffle=True,
    verbose=2
)

In [None]:
model_loss, model_accuracy = model.evaluate(
    X_train_scaled, y_train_categorical, verbose=2)
print(
    f"Normal Neural Network - Loss: {model_loss}, Accuracy: {model_accuracy}")

In [None]:
model_loss, model_accuracy = model.evaluate(
    X_test_scaled, y_test_categorical, verbose=2)
print(
    f"Normal Neural Network - Loss: {model_loss}, Accuracy: {model_accuracy}")