<a id="1"></a>

<h1 style="background-color:skyblue;font-family:newtimeroman;font-size:250%;text-align:center">Libraries And Utilities</h1>


In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import plotly.express as ex
import plotly.graph_objs as go
import plotly.offline as pyo
from plotly.subplots import make_subplots
pyo.init_notebook_mode()
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('darkgrid')
from wordcloud import WordCloud,STOPWORDS
import nltk 
import re
import string
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from keras import Sequential
from keras.layers import Dense,LSTM,Embedding
from sklearn.metrics import mean_squared_error
from keras.utils.vis_utils import plot_model
import tensorflow as tf
import os
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import TruncatedSVD
from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA

def rmse(y,yhat):
    """Return the root-mean-squared error between targets and predictions.

    Parameters
    ----------
    y : array-like or scalar
        Observed values.
    yhat : array-like or scalar
        Predicted values, broadcastable against ``y``.

    Returns
    -------
    numpy.floating
        ``sqrt(mean((y - yhat) ** 2))`` taken over every element. For scalar
        inputs this equals the absolute difference. Empty inputs yield ``nan``.
    """
    residuals = np.asarray(y, dtype=float) - np.asarray(yhat, dtype=float)
    # The mean must be taken before the square root; without it the result
    # is just the element-wise absolute error, not an RMSE.
    return np.sqrt(np.mean(residuals ** 2))

def set_seed(seed=31415):
    """Seed NumPy and TensorFlow and export the seed-related environment variables.

    Parameters
    ----------
    seed : int, default 31415
        Value passed to ``np.random.seed`` and ``tf.random.set_seed`` and
        written (as a string) to ``PYTHONHASHSEED``.
    """
    # Environment flags: the hash seed mirrors `seed`; TF_DETERMINISTIC_OPS is set to '1'.
    os.environ.update({
        'PYTHONHASHSEED': str(seed),
        'TF_DETERMINISTIC_OPS': '1',
    })
    # Library-level random generators.
    np.random.seed(seed)
    tf.random.set_seed(seed)
# Seed NumPy/TensorFlow with the default seed before any stochastic step runs.
set_seed()
# Default matplotlib figure size (width, height in inches) for every figure below.
plt.rc('figure',figsize=(17,13))


### Loading The Data

In [None]:
# Location of the raw CSV inside the Kaggle input directory.
DATA_PATH = '/kaggle/input/highly-rated-children-books-and-stories/children_stories.Csv'

# The file is read as ISO-8859-1 rather than the pandas default (UTF-8).
b_data = pd.read_csv(DATA_PATH, encoding="ISO-8859-1")
b_data.head(3)


<a id="2"></a>

<h1 style="background-color:skyblue;font-family:newtimeroman;font-size:250%;text-align:center">Preprocessing And Feature Engineering</h1>


In [None]:
def get_min_age(sir):
    """Extract the minimum target age from an age-category label.

    Parameters
    ----------
    sir : str
        Category label. Ranged labels are expected to look like
        ``"<word> <low>-<high>"`` (e.g. ``"Age 2-5"``); open-ended labels
        like ``"<word> <low>+"`` (e.g. ``"Age 3+"``).

    Returns
    -------
    int
        The lower bound of a ranged label, ``0`` for an open-ended label
        that mentions ``months``, otherwise the first integer found in the
        label.

    Raises
    ------
    ValueError
        If an open-ended label contains no digits, or the lower bound of a
        ranged label is not an integer.
    IndexError
        If a ranged label has no second space-separated token.
    """
    # Collapse runs of spaces and trim the ends so token positions are stable.
    label = re.sub(' +', ' ', sir).strip()

    if '-' in label:
        age_token = label.split(' ')[1].strip()
        return int(age_token.split('-')[0])

    # `str.find` returns -1 (truthy) when the substring is absent, so a bare
    # `if sir.find('months')` fires for every label without "months".
    # Test membership explicitly instead.
    if 'months' in label:
        return 0

    first_number = re.search(r'\d+', label)
    if first_number is None:
        raise ValueError('no age found in category label: {!r}'.format(sir))
    return int(first_number.group())
def get_max_age(sir):
    """Extract the maximum target age from an age-category label.

    Parameters
    ----------
    sir : str
        Category label; a ranged label is expected to carry ``<low>-<high>``
        as its second space-separated token.

    Returns
    -------
    int
        The upper bound of a ranged label, or ``99`` when the label contains
        no ``-`` at all.
    """
    # Guard clause: labels without a dash get the fixed upper bound of 99.
    if '-' not in sir:
        return 99

    collapsed = re.sub(' +', ' ', sir)
    age_range = collapsed.split(' ')[1].strip()
    bounds = age_range.split('-')
    return int(bounds[1])
    
# --- Preprocessing and feature engineering ---------------------------------
# Lower-case the free-text columns so later token counts are case-insensitive.
b_data['names'] = b_data['names'].str.lower()
b_data['desc'] = b_data['desc'].str.lower()

# Numeric age bounds parsed from the category label.
b_data['Min_Age'] = b_data['cats'].apply(get_min_age)
b_data['Max_Age'] = b_data['cats'].apply(get_max_age)

# Tokenise each title once (collapse repeated spaces, trim, split on single
# spaces) and derive both title features from the same token lists.
title_words = b_data['names'].apply(lambda title: re.sub(' +', ' ', title).strip().split(' '))
b_data['Book Name Nb Words'] = title_words.apply(len)
b_data['Book Name Avg Word Length'] = title_words.apply(
    lambda words: np.mean(np.array([len(word) for word in words]))
)
b_data['Range Length'] = b_data['Max_Age'] - b_data['Min_Age']


# --- Sentiment analysis -----------------------------------------------------
sid = SIA()
# Score each description after reducing it to space-joined \w+ tokens.
sentiment_scores = b_data['desc'].apply(
    lambda text: sid.polarity_scores(' '.join(re.findall(r'\w+', text.lower())))
)
# One column per polarity score; 1e-6 is added to every score.
for column, key in (('Positive Sentiment', 'pos'),
                    ('Neutral Sentiment', 'neu'),
                    ('Negative Sentiment', 'neg')):
    b_data[column] = sentiment_scores.apply(lambda scores, key=key: scores[key] + 1*(10**-6))

In [None]:
# Preview the first five rows of the frame as it stands at this point in the notebook.
b_data.head(5)

<a id="3"></a>

<h1 style="background-color:skyblue;font-family:newtimeroman;font-size:250%;text-align:center">Exploratory Data Analysis</h1>


In [None]:
# Work on a copy so plotting never touches the engineered frame.
f_data = b_data.copy()

# Drawing order is kept (negative, positive, neutral) so colours stay stable.
sentiment_columns = ['Negative Sentiment', 'Positive Sentiment', 'Neutral Sentiment']

# Top panel: density of each sentiment score; bottom panel: its cumulative form.
fig, (ax_pdf, ax_cdf) = plt.subplots(2, 1)
for column in sentiment_columns:
    # An explicit label lets the legend identify each curve.
    sns.kdeplot(f_data[column], bw=0.1, label=column, ax=ax_pdf)
    sns.kdeplot(f_data[column], bw=0.1, cumulative=True, label=column, ax=ax_cdf)

ax_pdf.set_title('Distribution Of Sentiments Across Our Book Descriptions', fontsize=19, fontweight='bold')
ax_cdf.set_title('CDF Of Sentiments Across Our Book Descriptions', fontsize=19, fontweight='bold')
for ax in (ax_pdf, ax_cdf):
    ax.set_xlabel('Sentiment Value', fontsize=19)
    ax.legend()
plt.show()

In [None]:
# Per-minimum-age means of the numeric features. `numeric_only=True` keeps the
# aggregation from raising on the text columns (names, desc, cats, ...) under
# pandas >= 2.0, where non-numeric columns are no longer dropped silently.
min_age_g = b_data.groupby(by='Min_Age').mean(numeric_only=True).reset_index()

# Distribution of the average title word length for each minimum target age.
fig = ex.box(b_data,x='Min_Age',y='Book Name Avg Word Length')
fig.update_layout(xaxis=dict(dtick=1),title='Average Book Name Word Length Over Min Target Ages')
fig.show()

In [None]:
# Blue -> yellow -> red diverging scale. Plotly expects the stops in ascending
# order, starting at 0 and ending at 1, so they are listed that way here.
colorscale = [
    [0.0               , "rgb(49,54,149)"],
    [0.1111111111111111, "rgb(69,117,180)"],
    [0.2222222222222222, "rgb(116,173,209)"],
    [0.3333333333333333, "rgb(171,217,233)"],
    [0.4444444444444444, "rgb(224,243,248)"],
    [0.5555555555555556, "rgb(254,224,144)"],
    [0.6666666666666666, "rgb(253,174,97)"],
    [0.7777777777777778, "rgb(244,109,67)"],
    [0.8888888888888888, "rgb(215,48,39)"],
    [1.0               , "rgb(165,0,38)"],
]

# Correlations are only defined for numeric columns; selecting them up front
# keeps `corr` from raising on the text columns under pandas >= 2.0.
numeric_features = b_data.select_dtypes(include=['number', 'bool'])

fig = make_subplots(rows=2, cols=1, shared_xaxes=True,
                    subplot_titles=('Pearson Correlation', 'Spearman Correlation'))

for row, method in enumerate(('pearson', 'spearman'), start=1):
    corr = numeric_features.corr(method)
    fig.add_trace(
        go.Heatmap(
            x=corr.columns, y=corr.index, z=corr.values,
            name=method,
            # A fixed [-1, 1] range makes the two panels share one colour
            # mapping, so the single colour bar (drawn on the last panel) is
            # valid for both.
            zmin=-1, zmax=1,
            showscale=(row == 2),
            xgap=1, ygap=1,
            colorscale=colorscale,
        ),
        row=row, col=1
    )

fig.update_layout(height=700, width=900, title_text="Correlation Between Engineered Book Features")
fig.show()

In [None]:
# One concatenated string of book titles per minimum target age (1..11).
min_ages = range(1, 12)
min_age_descs = [' '.join(b_data.loc[b_data['Min_Age'] == age, 'names']) for age in min_ages]

# 3x4 grid for 11 clouds. The size is set once here: assigning to
# `fig.set_figwidth` (instead of calling it) only overwrote the method and
# never changed the width.
fig, axes = plt.subplots(3, 4, figsize=(20, 18), facecolor=None)

# Hide every frame up front; this also blanks the unused 12th panel.
for ax in axes.flat:
    ax.axis('off')

for ax, age, text in zip(axes.flat, min_ages, min_age_descs):
    ax.set_title('Book Title WC Min Age = {}'.format(age), fontsize=16, fontweight='bold')
    try:
        cloud = WordCloud(stopwords=STOPWORDS, background_color='white').generate(text)
    except ValueError:
        # WordCloud raises when no usable word is left (e.g. no book has this
        # minimum age); leave the panel empty rather than abort the figure.
        ax.text(0.5, 0.5, 'No titles', ha='center', va='center', transform=ax.transAxes)
        continue
    ax.imshow(cloud)

fig.tight_layout(pad=0.5)


In [None]:
# One concatenated string of book descriptions per minimum target age (1..11).
min_ages = range(1, 12)
min_age_descs = [' '.join(b_data.loc[b_data['Min_Age'] == age, 'desc']) for age in min_ages]

# 3x4 grid for 11 clouds. The size is set once here: assigning to
# `fig.set_figwidth` (instead of calling it) only overwrote the method and
# never changed the width.
fig, axes = plt.subplots(3, 4, figsize=(20, 18), facecolor=None)

# Hide every frame up front; this also blanks the unused 12th panel.
for ax in axes.flat:
    ax.axis('off')

for ax, age, text in zip(axes.flat, min_ages, min_age_descs):
    ax.set_title('Description WC Min Age = {}'.format(age), fontsize=16, fontweight='bold')
    try:
        cloud = WordCloud(stopwords=STOPWORDS, background_color='white').generate(text)
    except ValueError:
        # WordCloud raises when no usable word is left (e.g. no book has this
        # minimum age); leave the panel empty rather than abort the figure.
        ax.text(0.5, 0.5, 'No descriptions', ha='center', va='center', transform=ax.transAxes)
        continue
    ax.imshow(cloud)

fig.tight_layout(pad=0.5)


<a id="4"></a>

<h1 style="background-color:skyblue;font-family:newtimeroman;font-size:250%;text-align:center">Vectorization And Decomposition</h1>


In [None]:
# Number of SVD components kept for each text field.
desc_number_of_components = 210
title_number_of_components = 225


def _count_svd(texts, n_components):
    """Bag-of-words encode `texts`, then reduce the counts with TruncatedSVD.

    Returns the fitted vectorizer, the count matrix, the fitted SVD and the
    reduced matrix, in that order.
    """
    vectorizer = CountVectorizer()
    counts = vectorizer.fit_transform(texts)
    svd = TruncatedSVD(n_components=n_components)
    reduced = svd.fit_transform(counts)
    return vectorizer, counts, svd, reduced


# Descriptions are processed before titles, matching the original fit order.
desc_vecotrizer, desc_matrix, desc_svd, svd_desc_matrix = _count_svd(
    b_data['desc'], desc_number_of_components
)
title_vecotrizer, title_matrix, title_svd, svd_title_matrix = _count_svd(
    b_data['names'], title_number_of_components
)


In [None]:
# Cumulative share of variance captured as components are added.
desc_ex_var = np.cumsum(desc_svd.explained_variance_ratio_)

# desc_ex_var[k - 1] is the variance explained by the first k components, so
# the x axis runs 1..n rather than 0..n-1.
tr1 = go.Scatter(x=np.arange(1, len(desc_ex_var) + 1), y=desc_ex_var)
go.Figure(data=[tr1],layout=dict(title='Description Counts Explained Variance Using {} Components'.format(desc_number_of_components),xaxis_title='# Components',yaxis_title='Total Variance Explained'))

In [None]:
# Cumulative share of variance captured as components are added.
title_ex_var = np.cumsum(title_svd.explained_variance_ratio_)

# title_ex_var[k - 1] is the variance explained by the first k components, so
# the x axis runs 1..n rather than 0..n-1.
tr1 = go.Scatter(x=np.arange(1, len(title_ex_var) + 1), y=title_ex_var)
go.Figure(data=[tr1],layout=dict(title='Title Counts Explained Variance Using {} Components'.format(title_number_of_components),xaxis_title='# Components',yaxis_title='Total Variance Explained'))

<a id="5"></a>

<h1 style="background-color:skyblue;font-family:newtimeroman;font-size:250%;text-align:center">Model Selection And Evaluation</h1>


# $\text{Dense Relu Neural Network Construction}$

In [None]:
# Fully connected ReLU network: SVD description features in, two outputs out
# (the network is later fitted against the Min_Age / Max_Age columns).
model = Sequential()
model.add(Dense(units=100, activation='relu', input_dim=desc_number_of_components))
for width in (50, 20, 2):
    model.add(Dense(units=width, activation='relu'))

# Mean-squared-error objective, with mean absolute error tracked as a metric.
model.compile(loss='mse', optimizer='adam', metrics=['mae'])

In [None]:
# Draw the network architecture, annotated with layer names and shapes.
plot_model(model,show_shapes=True, show_layer_names=True)

In [None]:
# Regression targets: both age bounds are predicted jointly.
age_targets = b_data[['Min_Age', 'Max_Age']]
# NOTE(review): the model is fitted on every row with no hold-out split, so
# the evaluation cells below report training-set performance only.
history = model.fit(svd_desc_matrix, age_targets, epochs=550, verbose=False)

# $\text{Dense Relu Neural Network Evaluation}$

In [None]:
# One row per training epoch, one column per tracked quantity (loss, mae).
H_diff = pd.DataFrame(history.history)
epochs = np.arange(len(H_diff))

fig = go.Figure()
for column, label in (('loss', 'Loss'), ('mae', 'Mean Abs Error')):
    fig.add_trace(go.Scatter(x=epochs, y=H_diff[column], name=label))

# Keras records one history entry per epoch, so the x axis is labelled as such.
fig.update_layout(title='Model Training Evaluation',xaxis_title='Epoch',yaxis_title='Value')
fig.show()

In [None]:
# Run inference once and reuse the result for both output columns.
raw_predictions = model.predict(svd_desc_matrix)

# Ages are whole numbers, so round to the nearest integer. The builtin `int`
# replaces `np.int`, an alias for the same type that was removed in NumPy 1.24.
predictions = pd.DataFrame({
    'Min_Age': np.round(raw_predictions[:, 0]).astype(int),
    'Max_Age': np.round(raw_predictions[:, 1]).astype(int),
})

In [None]:
# Compare the sorted actual and predicted ages, one panel per target.
fig = make_subplots(rows=2,cols=1)

# The x axis indexes books, so its length must follow the data set; the
# training history (one entry per epoch) has an unrelated length.
book_rank = np.arange(len(b_data))

for row, (column, label) in enumerate((('Min_Age', 'Min Age'), ('Max_Age', 'Max Age')), start=1):
    fig.add_trace(go.Scatter(
        x=book_rank,
        y=b_data[column].sort_values().values,
        name='Actual {}'.format(label)
    ),row=row,col=1)
    fig.add_trace(go.Scatter(
        x=book_rank,
        y=predictions[column].sort_values().values,
        name='Predicted {}'.format(label)
    ),row=row,col=1)

fig.update_layout(title='Model Prediction Evaluation')
# update_xaxes / update_yaxes apply the titles to both subplots.
fig.update_xaxes(title_text='Book Rank (Sorted By Age)')
fig.update_yaxes(title_text='Age')
fig.show()