# Link to the data set:
https://www.kaggle.com/sootersaalu/amazon-top-50-bestselling-books-2009-2019

## Importing the libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## Read the data, Show first 5 rows

In [None]:
# Load the bestsellers CSV from the Kaggle input directory into a DataFrame,
# then preview its first five rows.
df = pd.read_csv(
    filepath_or_buffer="../input/amazon-top-50-bestselling-books-2009-2019/bestsellers with categories.csv"
)
df.head(n=5)

## See how many books made the list in each year (2009 - 2019)

In [None]:
# Tally how many rows the data set holds for each value of 'Year'.
year_column = df['Year']
year_column.value_counts()

In [None]:
# Bar chart counting the rows that fall into each 'Genre' value.
sns.countplot(data=df, x='Genre')
plt.title(label='Non Fiction VS Fiction')
plt.show()

## Histogram (Reviews, Price, Year, Genre)

In [None]:
# 2x2 grid of histograms; each panel is described by
# (column name, number of bins, bar colour) and the column name doubles
# as the panel title.
fig, ax = plt.subplots(2, 2, figsize=(16, 10))

panel_specs = [
    ('Reviews', 100, 'r'),
    ('Price', 100, 'b'),
    ('Year', 50, 'orange'),
    ('Genre', 5, 'pink'),
]

# ``ax.flat`` walks the grid row by row: [0,0], [0,1], [1,0], [1,1].
for panel, (column, n_bins, colour) in zip(ax.flat, panel_specs):
    panel.hist(df[column], bins=n_bins, color=colour)
    panel.set_title(column, size=20)

plt.show()

In [None]:
# Scatter plot of 'Price' (x-axis) against 'User Rating' (y-axis),
# drawn on a dedicated 8x6 figure.
price_fig, price_ax = plt.subplots(figsize=(8, 6))
price_ax.scatter('Price', 'User Rating', data=df, color='purple')
price_ax.set_title('Price VS User Rating', size=20)
price_ax.set_xlabel('Price', size=15)
price_ax.set_ylabel('User Rating', size=15)
plt.show()

# Plot the Price column

In [None]:
# Scatter every price (x-axis) against its row position in the frame
# (y-axis).
row_positions = list(range(len(df['Price'])))
plt.scatter(df['Price'], y=row_positions, color='#42b7bd')
plt.title('Price column', fontsize=20)
plt.xlabel('Price', fontsize=15)
plt.ylabel('index in the data frame', fontsize=15)
plt.show()

## Explore the Reviews column

In [None]:
# Line plot of the 'Reviews' values in row order.
review_counts = df['Reviews']
plt.plot(review_counts, color='purple')
plt.title('Reviews column', fontsize=20)
plt.xlabel('Index in the data frame', fontsize=15)
plt.ylabel('Number of reviews', fontsize=15)
plt.show()

## Top 10 books with the most reviews

In [None]:
# Keep one row per 'Name', take the ten rows with the largest 'Reviews'
# values, and keep only the first and the fourth-from-last columns
# (presumably the title and the review count -- confirm against the CSV
# column order).
unique_titles = df.drop_duplicates(subset='Name')
most_reviewed = unique_titles.nlargest(10, 'Reviews')
books = most_reviewed.iloc[:, [0, -4]].reset_index(drop=True)
print(f'The most reviewed books:')
books

## Explore the User rating column

In [None]:
# Histogram of the 'User Rating' column split into 35 bins.
user_ratings = df['User Rating']
user_ratings.hist(bins=35)
plt.title(label='User rating Hist')
plt.show()

## Top 10 highest user-rated books

In [None]:
# Keep one row per 'Name', take the ten rows with the largest
# 'User Rating' values, and keep only the columns at positions 0 and 2
# (presumably the title and the rating -- confirm against the CSV column
# order).
distinct_books = df.drop_duplicates(subset='Name')
best_rated = distinct_books.nlargest(10, 'User Rating')
ratings = best_rated.iloc[:, [0, 2]].reset_index(drop=True)
print(f'The most User rated books:')
ratings

## Model building

# Define X and y

In [None]:
# Define the X: every column from position 2 onward, without the price.
# ``drop`` returns a new frame, so the edits below work on an independent
# object instead of mutating a slice of ``df`` in place (the in-place
# drop / column assignment on ``df.iloc[...]`` can raise pandas'
# SettingWithCopyWarning).
X = df.iloc[:, 2:].drop(columns='Price')
# NOTE(review): if 'User Rating' is among the remaining columns (the
# ``.iloc[:, [0, 2]]`` selection used for the top-rated books suggests it
# sits at position 2), any model fitted on X to predict y2 sees its own
# target as a feature -- confirm, and drop that column for such a model.
# Map the genre column to ('Non Fiction': 0, 'Fiction': 1); any other
# value becomes NaN.
X['Genre'] = X['Genre'].map({'Non Fiction': 0, 'Fiction': 1})
# Convert the data frame to a numpy array
X = X.values
# Define the targets as numpy arrays: y is the price, y2 the user rating
y = df['Price'].values
y2 = df['User Rating'].values
# Show the shapes of X and y
X.shape, y.shape

## Normalize the data with Min Max Scaler

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Learn the per-column minimum and maximum of X, then rescale X with
# them; ``sc`` keeps the fitted scaler available for later use.
sc = MinMaxScaler()
sc.fit(X)
X = sc.transform(X)

## Train the Random Forest Regressor

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Two separate 10-tree forests over the same features:
# ``regressor`` is fitted on the price (y), ``reg`` on the user rating (y2).
regressor, reg = (RandomForestRegressor(n_estimators=10) for _ in range(2))
regressor.fit(X, y)
reg.fit(X, y2)

## Predict on X and show the errors

In [None]:
# For the User Rating column
from sklearn import metrics

# Predictions are made on the same X that ``reg`` was fitted on, so the
# figures printed below are errors on the training data.
y_pred = reg.predict(X)
Result = pd.DataFrame({'Actual': y2, 'Predicted': y_pred})

# The squared error is computed once and reused for its square root.
rating_mae = metrics.mean_absolute_error(y2, y_pred)
rating_mse = metrics.mean_squared_error(y2, y_pred)
print('Mean Absolute Error:', rating_mae)
print('Mean Squared Error:', rating_mse)
print('Root Mean Squared Error:', np.sqrt(rating_mse))

Result.head()

In [None]:
# For the Price column
# Predictions are made on the same X that ``regressor`` was fitted on, so
# the figures printed below are errors on the training data.
y_pred = regressor.predict(X)
Result = pd.DataFrame({'Actual': y, 'Predicted': y_pred})

# The squared error is computed once and reused for its square root.
price_mae = metrics.mean_absolute_error(y, y_pred)
price_mse = metrics.mean_squared_error(y, y_pred)
print('Mean Absolute Error:', price_mae)
print('Mean Squared Error:', price_mse)
print('Root Mean Squared Error:', np.sqrt(price_mse))

Result.head()

## Importing keras 

In [None]:
import keras
from keras.models import Sequential
from keras import layers

## Build the model

In [None]:
# Feed-forward network: two 64-unit ReLU layers followed by a single
# linear output unit.
model = Sequential([
    layers.Dense(64, activation='relu'),
    layers.Dense(64, activation='relu'),
    layers.Dense(1),
])

## Compile the model

In [None]:
# Train with Adam at a learning rate of 0.1, minimising the mean
# absolute error.
adam = keras.optimizers.Adam(learning_rate=0.1)
model.compile(optimizer=adam, loss='mean_absolute_error')

## Train the model

## For the Price column

In [None]:
# Fit the network on the price target for 100 epochs, holding out half of
# the rows for validation; ``history`` records the per-epoch losses.
history = model.fit(x=X, y=y, epochs=100, validation_split=0.5)

## Plot the loss VS the validation loss

In [None]:
def plot_loss(history, ylim=(0, 10)):
    """Plot the training loss against the validation loss per epoch.

    Args:
        history: Object returned by ``model.fit``; its ``history`` dict
            must contain the ``'loss'`` and ``'val_loss'`` series.
        ylim: Lower and upper bound of the y-axis. Defaults to ``(0, 10)``.
    """
    plt.plot(history.history['loss'], label='loss')
    plt.plot(history.history['val_loss'], label='val_loss')
    plt.ylim(ylim)
    plt.xlabel('Epoch')
    # The model is compiled with a mean-absolute-error loss, so label the
    # axis accordingly (the former "[MPG]" unit had nothing to do with
    # this data set).
    plt.ylabel('Error [MAE]')
    plt.legend()
    plt.grid(True)


plot_loss(history)

## For User Rating

In [None]:
# Train a fresh network for the user-rating target. Calling ``model.fit``
# again would continue from the weights already fitted to the price, so
# the architecture is cloned (new, re-initialised weights) and compiled
# with the same optimizer settings and loss instead. ``model`` itself
# stays the price model.
rating_model = keras.models.clone_model(model)
rating_model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=0.1),
    loss='mean_absolute_error')
history = rating_model.fit(
    X, y2,
    epochs=100,
    validation_split=0.5)

## Plot the loss VS the validation loss

In [None]:
# Draw the loss / validation-loss curves recorded in the latest ``history``.
plot_loss(history)