In [None]:
import numpy as np
import pandas as pd
import keras
import sklearn
from keras.layers import Dense, Dropout
from keras.models import Sequential
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from keras.metrics import mean_squared_error
#plt.style.use('dark_background')
import warnings
warnings.filterwarnings('ignore')
#sns.set(style = 'ticks', context = 'talk')
%matplotlib inline

In [None]:
# Load the diamonds CSV into a DataFrame and preview its first rows.
csv_path = '../input/diamonds/diamonds.csv'
df = pd.read_csv(csv_path)
df.head()

It looks like 'Unnamed: 0' holds the row IDs, so we can just remove that column, as it is of no use to us.

In [None]:
# Drop the 'Unnamed: 0' column (columns= is the explicit form of axis = 1).
# errors = 'ignore' keeps this cell safe to re-run: once the column is gone,
# a second run would otherwise raise KeyError.
df = df.drop(columns = ['Unnamed: 0'], errors = 'ignore')
df.head()

Let's visualize our data before making any other changes.

In [None]:
# Count of diamonds per 'cut' value.
# sns.factorplot was renamed to sns.catplot and later removed from seaborn;
# kind is passed explicitly, so the resulting plot is unchanged.
sns.catplot(x = 'cut', data = df, kind = 'count', aspect = 3)

In [None]:
# Count of diamonds per 'clarity' value.
# sns.factorplot was renamed to sns.catplot and later removed from seaborn;
# kind is passed explicitly, so the resulting plot is unchanged.
sns.catplot(x = 'clarity', data = df, kind = 'count', aspect = 3)

In [None]:
# Count of diamonds per 'color' value.
# sns.factorplot was renamed to sns.catplot and later removed from seaborn;
# kind is passed explicitly, so the resulting plot is unchanged.
sns.catplot(x = 'color', data = df, kind = 'count', aspect = 3)

In [None]:
# Box plot of 'price' for each 'cut' value.
# sns.factorplot was renamed to sns.catplot and later removed from seaborn;
# kind is passed explicitly, so the resulting plot is unchanged.
sns.catplot(x = 'cut', y = 'price', data = df, kind = 'box', aspect = 3)

In [None]:
# Box plot of 'price' for each 'clarity' value.
# sns.factorplot was renamed to sns.catplot and later removed from seaborn;
# kind is passed explicitly, so the resulting plot is unchanged.
sns.catplot(x = 'clarity', y = 'price', data = df, kind = 'box', aspect = 3)

In [None]:
# Box plot of 'price' for each 'color' value.
# sns.factorplot was renamed to sns.catplot and later removed from seaborn;
# kind is passed explicitly, so the resulting plot is unchanged.
sns.catplot(x = 'color', y = 'price', data = df, kind = 'box', aspect = 3)

In [None]:
# Heatmap of pairwise correlations between the numeric columns.
# 'cut', 'color' and 'clarity' are still string columns at this point, and
# recent pandas versions raise on non-numeric columns in DataFrame.corr()
# instead of silently skipping them, so restrict to numeric columns explicitly.
plt.figure(figsize = (12,12))
numeric_corr = df.select_dtypes(include = 'number').corr()
sns.heatmap(data = numeric_corr, square = True, annot = True, cmap = 'BuPu')

Good! Now let's see what else we can do to the data.
It seems that 'cut', 'clarity', and 'color' are object (string) columns, so we should try to convert them into numeric category codes. Let's confirm first by checking the dtypes and whether there are any null values.

In [None]:
# Show the dtype of every column in df.
df.dtypes

3 object columns. We need to convert those after we check for null values.

In [None]:
# Count the missing values in each column.
df.isna().sum()

All good. Time to convert.

In [None]:
# Replace each string-valued categorical column with integer codes.
# One encoder is re-fitted per column, so after the loop it holds the classes
# of the last column processed ('clarity').
# NOTE(review): LabelEncoder assigns codes in sorted label order, which may not
# match any quality ranking of these grades — confirm this is acceptable.
encoder = LabelEncoder()

for column in ('cut', 'color', 'clarity'):
    df[column] = encoder.fit_transform(df[column])

df.dtypes

In [None]:
# Preview the first rows of df.
df.head()

Now that our data looks good enough, let's separate the features from the target and create our training sets.

In [None]:
# 'price' is the target; every other column is a feature.
y = df['price']
X = df.drop(columns = ['price'])

# Preview both, separated by one blank line.
print(X.head(), '', y.head(), sep = '\n')

In [None]:
# Report the dimensions of the feature matrix and the target vector.
for label, data in (('X shape: ', X), ('y shape: ', y)):
    print(label, data.shape)

Create the train/test splits now.

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# split repeatable across runs.
split = train_test_split(X, y, test_size = 0.2, random_state = 6)
X_train, X_test, y_train, y_test = split

Normalize our data

In [None]:
# Standardize the features to zero mean / unit variance.
# The statistics come from the training split ONLY and are reused for the test
# split: scaling the test set with its own mean/std would put the two splits on
# different scales and leak test-set information into the evaluation.
# They are captured before X_train is overwritten so both splits use the same values.
train_mean = X_train.mean()
train_std = X_train.std()

X_train = (X_train - train_mean) / train_std
X_test = (X_test - train_mean) / train_std

Create keras model

In [None]:
# Fully connected regression network: four ReLU hidden layers, each followed
# by dropout, and a single output unit with no activation for the price.
model = Sequential([
    Dense(100, input_dim = X.shape[1], activation = 'relu'),
    Dropout(0.3),
    Dense(80, activation = 'relu'),
    Dropout(0.25),
    Dense(90, activation = 'relu'),
    Dropout(0.3),
    Dense(64, activation = 'relu'),
    Dropout(0.22),
    Dense(1),
])

# Mean squared error is the training loss; MSE and MAE are also tracked as metrics.
model.compile(optimizer = 'adam', loss = 'mean_squared_error', metrics = ['mse','mae'])

# 15% of the training data is held out for validation; output is silenced.
history = model.fit(
    X_train,
    y_train,
    validation_split = 0.15,
    batch_size = 15,
    epochs = 100,
    verbose = 0,
)

In [None]:
# Training vs. validation MSE per epoch.
fig, ax = plt.subplots()
for key in ('mse', 'val_mse'):
    ax.plot(history.history[key])
ax.set_title('Mean_Squared_Error')
ax.set_xlabel('epochs')
ax.set_ylabel('MSE')
ax.legend(['Training', 'Validation'], loc = 'upper right')
plt.show()

In [None]:
# Training vs. validation MAE per epoch.
fig, ax = plt.subplots()
for key in ('mae', 'val_mae'):
    ax.plot(history.history[key])
ax.set(title = 'Mean Absolute Error', xlabel = 'Epochs', ylabel = 'MAE')
ax.legend(['Training', 'Validation'], loc = 'upper right')
plt.show()

In [None]:
# Evaluate on the held-out test split. The model was compiled with
# metrics = ['mse','mae'], so index 1 is read as the MSE entry
# (presumably index 0 is the loss — confirm against the Keras version in use).
scores = model.evaluate(x = X_test, y = y_test, verbose = 0)
print('Mean_squared_error of testing model: ', scores[1])

In [None]:
# scores[2] is the third value returned by model.evaluate — presumably the MAE,
# given metrics = ['mse','mae'] at compile time.
print('Mean Absolute Error of testing model: ', scores[2])

In [None]:
# Predict on the test split; flatten() collapses the 2-D prediction array
# into 1-D so it can be plotted against y_test.
y_pred = model.predict(X_test).flatten()

# Same range on both axes so the identity line runs corner to corner.
axis_limits = [0, 22000]

fig, ax = plt.subplots(subplot_kw = {'aspect': 'equal'})
ax.scatter(y_test, y_pred)
ax.set_xlabel('True Prices')
ax.set_ylabel('Predicted Prices')
ax.set_xlim(axis_limits)
ax.set_ylim(axis_limits)
# Red diagonal: points on it are predictions equal to the true price.
ax.plot(axis_limits, axis_limits, color = 'red')

plt.show()

In [None]:
# Signed prediction error (predicted minus true) on the test split.
error = y_pred - y_test

# Distribution of the errors in 25 bins.
fig, ax = plt.subplots()
ax.hist(error, bins = 25)
ax.set(xlabel = 'Prediction Error', ylabel = 'Count')

plt.show()