<a id="1.1"></a>
<h3 style="background-color:orange;font-family:newtimeroman;font-size:200%;text-align:center;border-radius: 15px 50px;">Libraries And Utilities</h3>


In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import plotly.express as ex
from plotly.subplots import make_subplots
import plotly.graph_objs as go
import plotly.offline as pyo
pyo.init_notebook_mode()
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('darkgrid')
import re
import string
from wordcloud import STOPWORDS,WordCloud
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import os
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error,r2_score
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
def rmse(y,y_hat):
    """Return the root mean squared error between ``y`` and ``y_hat``.

    Computed as the square root of ``mean_squared_error(y, y_hat)``.
    """
    mse = mean_squared_error(y, y_hat)
    return np.sqrt(mse)

# Notebook-wide matplotlib defaults: figure size and base font size.
plt.rcParams['figure.figsize'] = (20, 11)
plt.rcParams['font.size'] = 12

<a id="1.2"></a>
<h3 style="background-color:orange;font-family:newtimeroman;font-size:200%;text-align:center;border-radius: 15px 50px;">Data Importation And Missing Value Assessment</h3>


In [None]:
# Location of the bestsellers CSV inside the Kaggle input directory.
DATA_PATH = '/kaggle/input/amazon-top-50-bestselling-books-2009-2019/bestsellers with categories.csv'

a_data = pd.read_csv(DATA_PATH)
# Preview the first three rows.
a_data.head(n=3)


In [None]:
# Print column dtypes and non-null counts (the missing-value check for this section).
a_data.info()

In [None]:
# Text preprocessing: normalise the book titles, then score each title with VADER.

# SentimentIntensityAnalyzer raises LookupError when the VADER lexicon is not
# installed; fetch it only when missing so the notebook runs on a fresh kernel.
try:
    nltk.data.find('sentiment/vader_lexicon.zip')
except LookupError:
    nltk.download('vader_lexicon', quiet=True)

# Lower-case every title and strip all ASCII punctuation characters.
punctuation_remover = str.maketrans('', '', string.punctuation)
a_data['Name'] = a_data['Name'].apply(lambda title: title.lower().translate(punctuation_remover))

sid = SentimentIntensityAnalyzer()
# One dict of scores per title; the keys read below are 'pos', 'neu', 'neg', 'compound'.
title_scores = a_data['Name'].apply(sid.polarity_scores)

# Append the four score columns in a fixed order; a later cell slices columns
# by position, so the order matters.
sentiment_columns = [
    ('Positive Sentiment', 'pos'),
    ('Neutral Sentiment', 'neu'),
    ('Negative Sentiment', 'neg'),
    ('Compound Sentiment', 'compound'),
]
for column_name, score_key in sentiment_columns:
    a_data[column_name] = title_scores.apply(lambda scores, key=score_key: scores[key])

<a id="2"></a>
<h1 style="background-color:orange;font-family:newtimeroman;font-size:300%;text-align:center;border-radius: 15px 50px;">Exploratory Data Analysis</h1>


In [None]:
# Summary statistics for every column except the last five (which include the
# four sentiment scores appended during preprocessing).
summary_source = a_data.iloc[:, :-5]
info = summary_source.describe()

# numeric_only=True is required: the frame still holds string columns, and
# recent pandas versions raise TypeError instead of silently skipping them.
info.loc['median'] = summary_source.median(numeric_only=True)
info.loc['skew'] = summary_source.skew(numeric_only=True)
info.loc['kurtosis'] = summary_source.kurt(numeric_only=True)

info

<a id="1.1"></a>
<h3 style="background-color:orange;font-family:newtimeroman;font-size:200%;text-align:center;border-radius: 15px 50px;">Numeric Variables Distributions</h3>


In [None]:
# Histogram (15 bins) with density estimate of the user ratings.
ax = plt.gca()
sns.distplot(a_data['User Rating'], bins=15, ax=ax)
ax.set_title('Distribution of ratings across the dataset', fontsize=19)
plt.show()

In [None]:
# Price distribution before and after outlier removal, drawn as two panels of
# ONE figure. plt.show() is called only once at the end; calling it between the
# panels would push the second panel into a separate, half-empty figure.
fig, (ax_before, ax_after) = plt.subplots(2, 1)

sns.distplot(a_data['Price'], color='red', ax=ax_before)
ax_before.set_title('Distribution of prices across the dataset', fontsize=19)

# Remove outliers: keep only books priced below 60. The explicit copy makes the
# filtered frame independent, so later column assignments do not trigger
# SettingWithCopyWarning.
a_data = a_data.query('Price < 60').copy()

sns.distplot(a_data['Price'], color='red', ax=ax_after)
ax_after.set_title('Distribution of prices across the dataset (After Outlier Removal)', fontsize=19)

fig.tight_layout()
plt.show()

In [None]:
# Review-count distribution before and after a natural-log transform, drawn as
# two panels of one figure (a single plt.show() at the end keeps them together).
fig, (ax_raw, ax_log) = plt.subplots(2, 1)

sns.distplot(a_data['Reviews'], color='teal', ax=ax_raw)
ax_raw.set_title('Distribution of review counts across the dataset', fontsize=19)

# Replace 'Reviews' with its natural log for the rest of the notebook.
# assign() builds a new frame instead of writing into a filtered one.
# NOTE: this cell is not idempotent - re-running it applies the log again.
a_data = a_data.assign(Reviews=np.log(a_data['Reviews']))

sns.distplot(a_data['Reviews'], color='teal', ax=ax_log)
ax_log.set_title('Distribution of review counts across the dataset (After Log Transform)', fontsize=19)

fig.tight_layout()
plt.show()

In [None]:
# Share of each genre in the dataset; labels and percentages are drawn inside the slices.
fig = ex.pie(
    a_data,
    names='Genre',
    title='Proportion Of Different Genres in Our Dataset',  # fixed typo: was "Geners"
    hover_data=['Genre'],
)
fig.update_traces(textposition='inside', textinfo='percent+label')
fig.show()

<a id="1.1"></a>
<h3 style="background-color:orange;font-family:newtimeroman;font-size:200%;text-align:center;border-radius: 15px 50px;">Year Based Analysis</h3>


In [None]:
# Genre x Year table of prices (pivot_table aggregates with its default), shown as an annotated heatmap.
pivot = a_data.pivot_table(
    index='Genre',
    columns='Year',
    values='Price',
)
plt.title('Prices Of Each Genre Over The Years')
sns.heatmap(data=pivot, annot=True)

### We can see that both genres, even though they are in different price ranges, are experiencing a similar decrease in price over the years.

In [None]:
# Per-year mean and median of every numeric column; reused by the following cells.
# numeric_only=True is required because the frame still contains string columns,
# which recent pandas versions refuse to average.
gby_year = a_data.groupby('Year').mean(numeric_only=True)
gby_year_median = a_data.groupby('Year').median(numeric_only=True)

tr1 = go.Scatter(x=gby_year.index, y=gby_year['Price'], name='Mean')
tr2 = go.Scatter(x=gby_year_median.index, y=gby_year_median['Price'], name='Median')

layout = dict(title='Average Book Price Over The Years', yaxis_title='Mean Price', xaxis_title='Year')
fig = go.Figure(data=[tr1, tr2], layout=layout)

fig.show()

### There is a clear trend: the average book price is decreasing over the years.

In [None]:
# Yearly mean/median of the 'Reviews' column. That column was replaced by its
# natural log in an earlier cell, so the labels say "log" rather than raw counts.
tr1 = go.Scatter(x=gby_year.index, y=gby_year['Reviews'], name='Mean')
tr2 = go.Scatter(x=gby_year_median.index, y=gby_year_median['Reviews'], name='Median')

layout = dict(
    title='Average Book Review Count (Log) Over The Years',
    yaxis_title='Mean Log Number Of Reviews',
    xaxis_title='Year',
)
go.Figure(data=[tr1, tr2], layout=layout)

### There is a clear trend: the average review count increases over the years.

In [None]:
# Yearly mean and median user rating, one line each.
rating_column = 'User Rating'
tr1 = go.Scatter(name='Mean', x=gby_year.index, y=gby_year[rating_column])
tr2 = go.Scatter(name='Median', x=gby_year_median.index, y=gby_year_median[rating_column])

layout = {
    'title': 'Average Book User Rating Over The Years',
    'yaxis_title': 'Mean Rating',
    'xaxis_title': 'Year',
}
go.Figure(data=[tr1, tr2], layout=layout)

### There is a clear trend: the average rating score increases over the years.

In [None]:
# Number of distinct values in the 'Author' column.
unique_author_count = a_data['Author'].unique().size
print('There are -{}- unique number of authors in our dataset!'.format(unique_author_count))

<a id="1.1"></a>
<h3 style="background-color:orange;font-family:newtimeroman;font-size:200%;text-align:center;border-radius: 15px 50px;">Text Based Analysis</h3>


In [None]:
# The ten most frequent authors, counted once and reused for both axes.
top_authors = a_data['Author'].value_counts()[:10]

ax = sns.barplot(x=top_authors.values, y=top_authors.index, palette='mako')
ax.set_title('Top 10 Authors', fontsize=16)
ax.set_xlabel('Number Of Books in Our Dataset', fontsize=14)
plt.show()

In [None]:
# Compound sentiment against each numeric feature, one stacked panel per feature.
compound_sentiment = a_data['Compound Sentiment']
for panel_index, feature in enumerate(['Price', 'Reviews', 'User Rating'], start=1):
    plt.subplot(3, 1, panel_index)
    sns.scatterplot(x=compound_sentiment, y=a_data[feature])
plt.show()

### We can clearly see that there is no significant correlation between the compound sentiment score and our numeric features (price, review counts, and rating).

In [None]:
# Compare each numeric feature's distribution for titles with positive vs
# negative compound sentiment (titles scoring exactly 0 are excluded).
positive_titles = a_data[a_data['Compound Sentiment'] > 0]
negative_titles = a_data[a_data['Compound Sentiment'] < 0]

# All three panels belong to one figure, so plt.show() is called once at the
# end; showing after every panel would split them into separate figures.
for panel_index, feature in enumerate(['Price', 'Reviews', 'User Rating'], start=1):
    plt.subplot(3, 1, panel_index)
    ax = sns.distplot(positive_titles[feature], label='Positive Compound')
    ax = sns.distplot(negative_titles[feature], label='Negative Compound')
    plt.legend()

plt.tight_layout()
plt.show()

### We can clearly see that the distributions of our numeric features (price, review counts, and rating) do not differ significantly between positive and negative compound sentiment.

In [None]:
# Word cloud built from all (preprocessed) book titles joined into one text.
all_titles_text = ' '.join(a_data.Name.values)
title_cloud = WordCloud(width=900, height=600, stopwords=STOPWORDS).generate(all_titles_text)

plt.imshow(title_cloud)
plt.axis('off')
plt.title('Most Frequent Words In Our Book Names', fontsize=16)
plt.show()

<a id="1.1"></a>
<h3 style="background-color:orange;font-family:newtimeroman;font-size:200%;text-align:center;border-radius: 15px 50px;">Using The Book Name Feature as A Predictor For Price</h3>


In [None]:
# Turn the book titles into TF-IDF vectors, then compress them with truncated SVD.
NUMBER_OF_COMPONENTS = 180

vectorizer = TfidfVectorizer()
sp_matrix = vectorizer.fit_transform(a_data['Name'])

# Seed the SVD so the decomposition (and everything built on it) is
# reproducible between runs; 42 matches the seed used elsewhere in the notebook.
svd_truncer = TruncatedSVD(n_components=NUMBER_OF_COMPONENTS, random_state=42)
dec_mat = svd_truncer.fit_transform(sp_matrix)

In [None]:
# Cumulative explained variance as a function of the number of SVD components.
cu_sum = np.cumsum(svd_truncer.explained_variance_ratio_)

# cu_sum[0] is the variance explained by ONE component, so the x axis starts at 1.
component_counts = np.arange(1, len(cu_sum) + 1)
tr1 = go.Scatter(x=component_counts, y=cu_sum)
layout = dict(
    yaxis_title='Explained Variance',
    xaxis_title='# Of Components',
    title='Explained Variance Of Name Tfidf Matrix Using {} Components'.format(NUMBER_OF_COMPONENTS),
)
go.Figure(data=[tr1], layout=layout)

In [None]:
# Hold out a test split and compare two regressors that predict Price from the
# SVD-reduced title features.
train_x, test_x, train_y, test_y = train_test_split(dec_mat, a_data['Price'], random_state=42)

LR_Pipe = Pipeline(steps=[('model', LinearRegression())])
LR_Pipe.fit(train_x, train_y)
RF_Pipe = Pipeline(steps=[('model', RandomForestRegressor(random_state=42))])
RF_Pipe.fit(train_x, train_y)

lr_predictions = LR_Pipe.predict(test_x)
rf_predictions = RF_Pipe.predict(test_x)

# The data vectors are passed by keyword: newer seaborn releases make x and y
# keyword-only, so positional vectors raise a TypeError.
plt.subplot(2, 1, 1)
plt.title('Linear Regression Residual Plot')
sns.residplot(x=lr_predictions, y=test_y)

plt.subplot(2, 1, 2)
plt.title('Random Forest Residual Plot')
sns.residplot(x=rf_predictions, y=test_y)
plt.show()

### We will use the linear regression model for our meta feature: compared to the random forest model, the linear regression residuals appear to show less heteroskedasticity.


In [None]:
# Build the stacking meta feature: refit the linear model on all rows and store
# its price predictions as 'LR_Pred'.
L_Encoder = LabelEncoder()
LR_Pipe.fit(dec_mat, a_data['Price'])
# The prediction must come from LR_Pipe (the model just refitted), not RF_Pipe.
# NOTE(review): these are in-sample predictions, so the feature leaks the target
# into the next model - consider out-of-fold predictions instead.
a_data['LR_Pred'] = LR_Pipe.predict(dec_mat)

# Encode the genre labels as integers for the final model.
a_data['Genre'] = L_Encoder.fit_transform(a_data['Genre'])


<a id="1.1"></a>
<h3 style="background-color:orange;font-family:newtimeroman;font-size:200%;text-align:center;border-radius: 15px 50px;">Price Prediction Using Stacking And Random Forest Model</h3>


In [None]:
# Final stacked model: random forest on year, genre, title sentiment and the
# linear-regression meta feature.
X = a_data[['Year', 'Genre', 'Compound Sentiment', 'LR_Pred']]
Y = a_data['Price']
RF_Pipe.fit(X, Y)

# Predict once and reuse the result for both the plot and the metric.
# NOTE(review): the model is evaluated on the rows it was fitted on, so the
# RMSE below is an in-sample figure.
final_predictions = RF_Pipe.predict(X)

plt.title('Final Price Prediction Residuals Using Random Forest')
# x and y are passed by keyword; newer seaborn releases reject positional vectors.
ax = sns.residplot(x=final_predictions, y=Y)
textstr = f'RMSE: {np.round(rmse(Y, final_predictions), 2)}'
props = dict(boxstyle='round', facecolor='tab:blue', alpha=0.5)
# Place the RMSE box in the top-left corner, in axes-relative coordinates.
ax.text(0.05, 0.95, textstr, transform=ax.transAxes, fontsize=29,
        verticalalignment='top', bbox=props)
plt.show()

In [None]:
# Evaluation dashboard: a table of actual vs predicted prices on the left and
# three stacked scatter plots (actual, predicted, residuals) on the right.
output = pd.DataFrame({"Actual": Y, 'Prediction': RF_Pipe.predict(X)})
residuals = output["Prediction"] - output["Actual"]
mean_residual = residuals.mean()
sample_index = np.arange(0, len(output))

fig = make_subplots(
    rows=3, cols=2, subplot_titles=('', 'Actual', 'Predictions', 'Residuals'),
    vertical_spacing=0.09,
    # The table spans all three rows of the first column.
    specs=[[{"type": "table", "rowspan": 3}, {"type": "scatter"}],
           [None,                            {"type": "scatter"}],
           [None,                            {"type": "scatter"}]],
)

fig.add_trace(
    go.Scatter(x=sample_index, y=output["Actual"], mode="markers"),
    row=1, col=2
)

fig.add_trace(
    go.Scatter(x=sample_index, y=output["Prediction"], mode="markers"),
    row=2, col=2
)

fig.add_trace(
    go.Scatter(x=sample_index, y=residuals, mode="markers"),
    row=3, col=2
)

fig.add_trace(
    go.Table(
        header=dict(
            # Take the header from the frame so it always matches the cell
            # order; the previous hard-coded header had the two labels swapped.
            values=list(output.columns),
            font=dict(size=10),
            align="left"
        ),
        cells=dict(
            values=[output[k].tolist() for k in output.columns],
            align="left")
    ),
    row=1, col=1
)

# Horizontal line at the mean residual, drawn on the third scatter panel's axes.
fig.add_shape(type="line",
    x0=0, y0=mean_residual, x1=len(output["Prediction"]), y1=mean_residual,
    line=dict(
        color="Red",
        width=2,
        dash="dashdot",
    ),
        name='Mean',
        xref='x3',
        yref='y3'
)

fig.update_layout(
    height=800,
    showlegend=False,
    title_text="Prediction Evaluation",
)

fig.show()

In [None]:
# Collect title, year, actual price and model prediction, then write them to CSV.
predicted_prices = RF_Pipe.predict(X)
output = pd.DataFrame({
    'Book Name': a_data['Name'],
    'Year': a_data["Year"],
    'Actual Price': a_data['Price'],
    'Predicted Price': predicted_prices,
})
output.to_csv('Price_Prediction.csv', index=False)