<a id="1.1"></a>
<h3 style="background-color:orange;font-family:newtimeroman;font-size:200%;text-align:center;border-radius: 15px 50px;">Libraries And Utilities</h3>


In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import plotly.express as ex
import plotly.graph_objs as go
import plotly.figure_factory as ff
from plotly.subplots import make_subplots
import plotly.offline as pyo
pyo.init_notebook_mode()
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('darkgrid')
import string
import re
from wordcloud import WordCloud,STOPWORDS
import nltk
from sklearn.metrics import mean_squared_error
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA
from sklearn.model_selection import train_test_split
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer as CVTZ
from sklearn.linear_model import LinearRegression
from keras import Sequential,Model
from keras.layers import Dense
from keras.layers.merge import concatenate
from sklearn.model_selection import train_test_split


import tensorflow as tf
import collections
def set_seed(seed=31415):
    """Seed NumPy and TensorFlow and export the related environment flags.

    Args:
        seed: Value passed to ``np.random.seed`` and ``tf.random.set_seed``;
            its string form is also stored in ``PYTHONHASHSEED``.
    """
    # NOTE(review): PYTHONHASHSEED is presumably only honoured at interpreter
    # start-up, so setting it here may not affect the running kernel -- confirm.
    os.environ.update({
        'PYTHONHASHSEED': str(seed),
        'TF_DETERMINISTIC_OPS': '1',
    })
    np.random.seed(seed)
    tf.random.set_seed(seed)


# Seed once, with the default value, before any stochastic work happens.
set_seed()

def RMSE(Y,YHAT):
    """Return the root of ``mean_squared_error(Y, YHAT)``.

    Args:
        Y: First array-like of values handed to ``mean_squared_error``.
        YHAT: Second array-like of values handed to ``mean_squared_error``.

    Returns:
        The square root of the mean squared error.
    """
    squared_error = mean_squared_error(Y, YHAT)
    return np.sqrt(squared_error)

# Default size applied to every matplotlib figure created below.
plt.rcParams['figure.figsize'] = (20, 11)

<a id="1.2"></a>
<h3 style="background-color:orange;font-family:newtimeroman;font-size:200%;text-align:center;border-radius: 15px 50px;">Data Importation And Missing Value Assessment</h3>


In [None]:
# Only the columns used in the analysis are read from the CSV.
ARTICLES_CSV = '/kaggle/input/internet-articles-data-with-users-engagement/articles_data.csv'
SELECTED_COLUMNS = [
    'source_name',
    'author',
    'title',
    'description',
    'published_at',
    'top_article',
    'engagement_reaction_count',
    'engagement_comment_count',
    'engagement_share_count',
]

n_data = pd.read_csv(ARTICLES_CSV, usecols=SELECTED_COLUMNS)
n_data.head()

In [None]:
# Left panel: heatmap flagging every missing cell (1 = missing, 0 = present).
# Right panel: table with the number of missing values per feature.
fig = make_subplots(
    rows=2, cols=2,
    shared_xaxes=True,
    vertical_spacing=0.03,
    specs=[[{"type": "heatmap",'rowspan':2},{"type": "table",'rowspan':2}],
           [None             ,None],
          ]
)

# Compute the missing-value mask once and reuse it for both panels.
missing_mask = n_data.isna()

fig.add_trace(
    go.Heatmap(
        z=missing_mask.T.astype(int),
        x=np.arange(0,len(n_data)),
        y=n_data.columns
    ),
    row=1, col=1
)

fig.add_trace(
    go.Table(
    header=dict(values=list(['Feature Name','Number Of Missing']),
                fill_color='royalblue',
                font_color='white',
                font_size=13,
                align='left'),
    cells=dict(values=[n_data.columns,missing_mask.sum()],
               fill_color='azure',
               font_size=14,
               align='left')),
    row=1, col=2
)

fig.update_layout(
    height=800,
    showlegend=False,
    title_text="Proportion Of Missing Values",
)
# The heatmap rows are the dataset's features and its columns are row positions.
fig.update_xaxes(title_text="Row Index")
fig.update_yaxes(title_text="Feature")
fig.show()

<p style="text-align: center;"><span style='font-size: 24px; font-family: "Times New Roman", Times, serif;'>It is important to note that there were some redundant features in my opinion that were ignored during the data loading stage together with features that had a high missing value count.</span></p>
<p><br></p>

<h1 style="background-color:orange;font-family:newtimeroman;font-size:200%;text-align:center;border-radius: 15px 50px;">Data Preprocessing and Feature Engineering</h1>


In [None]:
# Parse the publication timestamp, then derive calendar features with the
# vectorised ``.dt`` accessor instead of a per-row Python lambda.
n_data.published_at = pd.to_datetime(n_data.published_at)

published = n_data.published_at.dt
n_data['Day_Of_Week'] = published.dayofweek
n_data['Month'] = published.month
n_data['Year'] = published.year



In [None]:
# NOTE: this cell overwrites engagement_reaction_count with its logarithm, so
# it must be executed exactly once per kernel session.
reaction_fig, reaction_axes = plt.subplots(3, 1)

reaction_raw = n_data.engagement_reaction_count
# The small offset keeps zero counts finite under the logarithm.
reaction_log = np.log(reaction_raw + 0.00001)

reaction_axes[0].set_title("Distribution Of Engagement Reaction Counts Before Log Transformation")
sns.kdeplot(x=reaction_raw, ax=reaction_axes[0])

reaction_axes[1].set_title("Distribution Of Engagement Reaction Counts After Log Transformation")
sns.kdeplot(x=reaction_log, ax=reaction_axes[1])

n_data.engagement_reaction_count = reaction_log

# Pass the series as ``x=`` explicitly: a positional Series is interpreted
# differently across seaborn versions.
sns.boxplot(x=reaction_log, ax=reaction_axes[2])
plt.show()

In [None]:
# NOTE: this cell overwrites engagement_comment_count with its logarithm, so
# it must be executed exactly once per kernel session.
comment_fig, comment_axes = plt.subplots(3, 1)

comment_raw = n_data.engagement_comment_count
# The small offset keeps zero counts finite under the logarithm.
comment_log = np.log(comment_raw + 0.00001)

comment_axes[0].set_title("Distribution Of Engagement Comment Counts Before Log Transformation")
sns.kdeplot(x=comment_raw, ax=comment_axes[0])

comment_axes[1].set_title("Distribution Of Engagement Comment Counts After Log Transformation")
sns.kdeplot(x=comment_log, ax=comment_axes[1])

n_data.engagement_comment_count = comment_log

# Pass the series as ``x=`` explicitly: a positional Series is interpreted
# differently across seaborn versions.
sns.boxplot(x=comment_log, ax=comment_axes[2])
plt.show()

In [None]:
# NOTE: this cell overwrites engagement_share_count with its logarithm, so
# it must be executed exactly once per kernel session.
share_fig, share_axes = plt.subplots(3, 1)

share_raw = n_data.engagement_share_count
# The small offset keeps zero counts finite under the logarithm.
share_log = np.log(share_raw + 0.00001)

share_axes[0].set_title("Distribution Of Engagement Share Counts Before Log Transformation")
sns.kdeplot(x=share_raw, ax=share_axes[0])

share_axes[1].set_title("Distribution Of Engagement Share Counts After Log Transformation")
sns.kdeplot(x=share_log, ax=share_axes[1])

n_data.engagement_share_count = share_log

# Pass the series as ``x=`` explicitly: a positional Series is interpreted
# differently across seaborn versions.
sns.boxplot(x=share_log, ax=share_axes[2])
plt.show()

<p style="text-align: center;"><span style='font-size: 24px; font-family: "Times New Roman", Times, serif;'>Interestingly, we see that the Engagement Reaction Count feature distribution is very jagged and has large outliers, but after performing a log transformation we get a smooth bimodal distribution.</span></p>
<p style="text-align: center;"><span style='font-size: 24px; font-family: "Times New Roman", Times, serif;'>What is really beautiful is that all the count features provided in the dataset follow the same effect and turn into bimodal distributions after being log normalized.</span></p>
<p><br></p>

In [None]:
# Normalise the two free-text columns: lowercase them, drop rows where either
# is missing, and keep only word characters separated by single spaces.
TEXT_COLUMNS = ['title', 'description']


def _keep_word_tokens(text):
    """Join the ``\\w+`` tokens of ``text`` with single spaces."""
    # The joined result never has leading/trailing whitespace, so no strip is needed.
    return ' '.join(re.findall(r'\w+', text))


# ``assign`` and ``dropna`` return new frames, so no values are written onto a
# filtered slice (which would raise SettingWithCopyWarning).
n_data = n_data.assign(
    **{column: n_data[column].str.lower() for column in TEXT_COLUMNS}
).dropna(subset=TEXT_COLUMNS)

n_data = n_data.assign(
    **{column: n_data[column].apply(_keep_word_tokens) for column in TEXT_COLUMNS}
)

In [None]:
# Score every description with VADER and keep the three polarity shares as columns.
sid = SIA()
description_scores = n_data.description.apply(sid.polarity_scores)

n_data['DESC_Positive Sentiment'] = description_scores.apply(lambda scores: scores['pos'])
n_data['DESC_Neutral Sentiment'] = description_scores.apply(lambda scores: scores['neu'])
n_data['DESC_Negative Sentiment'] = description_scores.apply(lambda scores: scores['neg'])

<p style="text-align: center;"><span style='font-family: "Times New Roman", Times, serif; font-size: 24px;'>The sentiments of the description were extracted for us to determine if it has any correlation with the &quot;count&quot; features.</span></p>

<h1 style="background-color:orange;font-family:newtimeroman;font-size:200%;text-align:center;border-radius: 15px 50px;">Exploratory Data Analysis</h1>


In [None]:
# Restrict to numeric columns explicitly: skew/kurt/median raise on string
# columns in recent pandas versions instead of silently skipping them.
numeric_data = n_data.select_dtypes(include='number')

info = numeric_data.describe()
info.loc['Skewness'] = numeric_data.skew()
info.loc['Kurtosis'] = numeric_data.kurt()
info.loc['Median'] = numeric_data.median()
info

In [None]:
# One heatmap per engagement metric: mean value per (Month, Day_Of_Week) cell.
# Only the last panel draws a colour bar.
heatmap_panels = [
    ('Engagement Reaction Of Each Month At A Particular Day', 'engagement_reaction_count', False),
    ('Engagement Comment Of Each Month At A Particular Day', 'engagement_comment_count', False),
    ('Engagement Share Of Each Month At A Particular Day', 'engagement_share_count', True),
]

for panel_number, (panel_title, value_column, show_cbar) in enumerate(heatmap_panels, start=1):
    plt.subplot(1, 3, panel_number)
    plt.title(panel_title)
    month_by_day = n_data.pivot_table(columns='Day_Of_Week', index='Month', values=value_column)
    sns.heatmap(month_by_day, cbar=show_cbar, cmap='coolwarm')

plt.show()

<p style="text-align: center;"><span style='font-family: "Times New Roman", Times, serif; font-size: 24px;'>We can see that we have only 2 unique months in our dataset and that September Saturdays had a higher activity. </span></p>

In [None]:
# Share of rows per value of the top_article flag.
ex.pie(
    n_data,
    names='top_article',
    title='Proportion Of Atricles Marked As "TOP ARTICLE" ',
)

In [None]:
# Article count per source, most frequent first.
source_counts = n_data.source_name.value_counts()

plt.title("Distribution Of Different Sources In Our Dataset", fontsize=20)
sns.barplot(x=source_counts.values, y=source_counts.index)
plt.xlabel('Number Of Articles')
plt.show()

In [None]:
# Article count for the ten most frequent authors.
top_author_counts = n_data.author.value_counts().head(10)

plt.title("Top 10 Authors In Our Dataset", fontsize=20)
sns.barplot(x=top_author_counts.values, y=top_author_counts.index)
plt.xlabel('Number Of Articles')
plt.show()

In [None]:
# Word cloud built from all titles concatenated into one string.
all_titles = ' '.join(n_data.title)
wc = WordCloud(width=700, height=400, stopwords=STOPWORDS)
wc = wc.generate(all_titles)

plt.imshow(wc)
plt.axis('off')
plt.title("Most Used Words In Article Titles", fontsize=20)
plt.show()

In [None]:
# Word cloud built from all descriptions concatenated into one string.
all_descriptions = ' '.join(n_data.description)
wc = WordCloud(width=700, height=400, stopwords=STOPWORDS)
wc = wc.generate(all_descriptions)

plt.imshow(wc)
plt.axis('off')
plt.title("Most Used Words In Article Descriptions", fontsize=20)
plt.show()

In [None]:
# Per-source distribution of the negative and positive sentiment scores.
# Both panels live in one figure, so the figure is shown once at the end;
# showing it between panels would push the second panel onto a new figure.
sentiment_fig, (negative_ax, positive_ax) = plt.subplots(2, 1)

# x/y are passed by keyword: recent seaborn versions no longer accept them
# as positional arguments.
sns.boxplot(x=n_data['source_name'], y=n_data['DESC_Negative Sentiment'], ax=negative_ax)
negative_ax.set_title("Distirubtion Of Negative Sentiment Scores For Different Soruces")
negative_ax.tick_params(axis='x', rotation=90)

sns.boxplot(x=n_data['source_name'], y=n_data['DESC_Positive Sentiment'], ax=positive_ax)
positive_ax.set_title("Distirubtion Of Positive Sentiment Scores For Different Soruces")
positive_ax.tick_params(axis='x', rotation=90)

# Keep the rotated tick labels of the upper panel clear of the lower panel's title.
sentiment_fig.tight_layout()
plt.show()

<p style="text-align: center;"><span style='font-family: "Times New Roman", Times, serif; font-size: 24px;'>We can see that as far as positive and negative sentiment most of our sources have fairly similar distributions and mean values but at the same time we can observe that some of our sources have noticeable range differences !</span></p>
<p><br></p>

In [None]:
# Pearson (top) and Spearman (bottom) correlation matrices of the numeric features.
fig = make_subplots(rows=2, cols=1,shared_xaxes=True,subplot_titles=('Perason Correaltion',  'Spearman Correaltion'))
colorscale=[[0.0, "rgb(165,0,38)"],
                [0.1111111111111111, "rgb(215,48,39)"],
                [0.2222222222222222, "rgb(244,109,67)"],
                [0.3333333333333333, "rgb(253,174,97)"],
                [0.4444444444444444, "rgb(254,224,144)"],
                [0.5555555555555556, "rgb(224,243,248)"],
                [0.6666666666666666, "rgb(171,217,233)"],
                [0.7777777777777778, "rgb(116,173,209)"],
                [0.8888888888888888, "rgb(69,117,180)"],
                [1.0, "rgb(49,54,149)"]]

# Select numeric columns explicitly: recent pandas versions raise when corr()
# meets string columns instead of silently dropping them.
numeric_features = n_data.drop(columns=['Year']).select_dtypes(include='number')

# Per-method extra trace options; only the second heatmap shows a colour scale.
correlation_panels = [
    ('pearson', dict(name='pearson', showscale=False)),
    ('spearman', dict()),
]

for panel_row, (method, trace_options) in enumerate(correlation_panels, start=1):
    correlation = numeric_features.corr(method)
    fig.add_trace(
        go.Heatmap(x=correlation.columns,y=correlation.index,z=correlation.values,
                   xgap=1,ygap=1,colorscale=colorscale,**trace_options),
        row=panel_row, col=1
    )

fig.update_layout(height=700, width=900, title_text="Correlation Between Numeric Features")
fig.show()

<h1 style="background-color:orange;font-family:newtimeroman;font-size:200%;text-align:center;border-radius: 15px 50px;">Time Based Analysis</h1>


In [None]:
# Daily mean of each engagement metric, one subplot per metric.
fig = make_subplots(rows=3, cols=1, subplot_titles=('Engagement Reaction Count','Engagement Comment Count','Engagement Share Count'))

b_date_mean = n_data.copy()
# Truncate timestamps to midnight so that rows group by calendar day.
b_date_mean.published_at = pd.to_datetime(b_date_mean.published_at).dt.normalize()
# numeric_only=True: averaging the text columns is meaningless and raises in
# recent pandas versions.
b_date_mean = b_date_mean.groupby(by='published_at').mean(numeric_only=True).reset_index()

engagement_columns = ('engagement_reaction_count','engagement_comment_count','engagement_share_count')
for panel_row, engagement_column in enumerate(engagement_columns, start=1):
    fig.add_trace(
        go.Scatter(x=b_date_mean.published_at, y=b_date_mean[engagement_column],name=engagement_column),
        row=panel_row, col=1)

fig.update_layout(height=600, width=900, title_text="Behavior Of Different Engagement Attributes Over Time")
fig.show()

<h1 style="background-color:orange;font-family:newtimeroman;font-size:200%;text-align:center;border-radius: 15px 50px;">Text Based Modeling</h1>


In [None]:
# Count-vectorise the descriptions, reduce with truncated SVD, and plot the
# per-component and cumulative explained variance ratio.
N_COMPONENTS = 900

tf_model = CVTZ()
svd_model = TruncatedSVD(n_components=N_COMPONENTS)

desc_matrix = tf_model.fit_transform(n_data.description)
trunc_matrix = svd_model.fit_transform(desc_matrix)

evr = svd_model.explained_variance_ratio_
evr_cs = np.cumsum(evr)
component_axis = np.arange(0, len(evr_cs))

tr1 = go.Scatter(x=component_axis, y=evr_cs, name='Explained Variance Cumulative')
tr2 = go.Scatter(x=component_axis, y=evr, name='Explained Variance')

svd_layout = dict(
    title='Explained Variance Ratio Using {} Components'.format(N_COMPONENTS),
    xaxis_title='Number Of Components',
    yaxis_title='Explained Variance Ratio',
)
fig = go.Figure(data=[tr1, tr2], layout=svd_layout)

fig.show()


<p style="text-align: center;"><span style='font-family: "Times New Roman", Times, serif; font-size: 24px;'>The initial approach I hoped would fit this dataset was to vectorize the description and use the vectorized matrix as a predictor for the reaction count feature. Unfortunately, both count vectorization and tf-idf vectorization required a very large number of components in order to explain a relatively small amount of the variance in the data.</span></p>
<p style="text-align: center;"><span style="font-size: 24px;"><span style="font-family: 'Times New Roman', Times, serif;"><br></span></span></p>
<p style="text-align: center;"><span style='font-family: "Times New Roman", Times, serif; font-size: 24px;'>The alternative approach we will implement next uses a dense neural network with a text embedding layer.</span></p>
<p><br></p>

<a id="3.1"></a>
<h3 style="background-color:orange;font-family:newtimeroman;font-size:200%;text-align:center;border-radius: 15px 50px;">Vocabulary Extraction And Preprocessing</h3>


In [None]:
# Build the model inputs: stop-word-free descriptions, hashed to integer
# sequences, padded to a common length, and split into evaluation/training parts.
EVAL_SIZE = 1000  # the first EVAL_SIZE rows are held out for evaluation

n_data = n_data[~n_data.engagement_reaction_count.isna()]

# STOPWORDS is used directly for membership tests rather than being rebuilt
# as a list for every single word.
descs_clean = n_data['description'].apply(
    lambda x: ' '.join([word for word in x.split() if word not in STOPWORDS and len(word) > 1])
)
# split() without an argument ignores the empty tokens that appear when a
# cleaned description is empty.
vocab = collections.Counter(' '.join(descs_clean).split())

VOCAB_SIZE = len(vocab.keys())
VECTOR_SPACE = 100

# NOTE(review): one_hot presumably relies on Python's built-in hash, which may
# differ between interpreter sessions -- confirm if run-to-run stability matters.
encoded_docs = [tf.keras.preprocessing.text.one_hot(d,VOCAB_SIZE) for d in descs_clean]

# Sequence length is measured in tokens (the unit pad_sequences works in),
# not in characters, so no sequence is truncated and none is over-padded.
MAX_LENGTH = max(len(encoded_doc) for encoded_doc in encoded_docs)

padded_docs = tf.keras.preprocessing.sequence.pad_sequences(encoded_docs,maxlen=MAX_LENGTH,padding='post')

padded_docs_eval = padded_docs[:EVAL_SIZE]
padded_docs = padded_docs[EVAL_SIZE:]

# Targets are split positionally so that they line up with the padded rows.
Y_1 = n_data.engagement_reaction_count.iloc[EVAL_SIZE:]
Y_eval_1 = n_data.engagement_reaction_count.iloc[:EVAL_SIZE]

Y_2 = n_data.engagement_share_count.iloc[EVAL_SIZE:]
Y_eval_2 = n_data.engagement_share_count.iloc[:EVAL_SIZE]

Y_3 = n_data.engagement_comment_count.iloc[EVAL_SIZE:]
Y_eval_3 = n_data.engagement_comment_count.iloc[:EVAL_SIZE]

<a id="3.4"></a>
<h1 style="background-color:orange;font-family:newtimeroman;font-size:250%;text-align:center;border-radius: 15px 50px;">Engagement Reaction Model</h1>

<a id="3.2"></a>
<h3 style="background-color:orange;font-family:newtimeroman;font-size:200%;text-align:center;border-radius: 15px 50px;">Sequential Model Assembly</h3>


In [None]:
# Embedding -> Flatten -> Dropout -> two linear Dense layers, the last one
# producing a single regression output for the reaction count.
reaction_layers = [
    tf.keras.layers.Embedding(VOCAB_SIZE, VECTOR_SPACE, input_length=MAX_LENGTH),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dropout(0.5),
    Dense(units=13, activation='linear'),
    Dense(units=1, activation='linear'),
]
FCNN_MODEL = Sequential(reaction_layers)

FCNN_MODEL.compile(loss='mse', optimizer='adam', metrics=['mae'])

In [None]:
# Render the layer graph of the model, including tensor shapes.
tf.keras.utils.plot_model(
    FCNN_MODEL,
    show_shapes=True,
)


In [None]:
# Train on the reaction target; the held-out rows serve as validation data.
history = FCNN_MODEL.fit(
    padded_docs,
    Y_1,
    batch_size=150,
    epochs=10,
    validation_data=(padded_docs_eval, Y_eval_1),
    verbose=False,
)

In [None]:
# Predictions on the training rows, flattened to a 1-D array.
predictions = FCNN_MODEL.predict(padded_docs).reshape(-1)

<a id="3.4"></a>
<h3 style="background-color:orange;font-family:newtimeroman;font-size:200%;text-align:center;border-radius: 15px 50px;">Model Evaluation</h3>

In [None]:
# Training loss and MAE per epoch.
h = pd.DataFrame(history.history)[['loss','mae']]
epoch_axis = np.arange(0, len(h))

for metric_name in ('loss', 'mae'):
    sns.lineplot(x=epoch_axis, y=h[metric_name], label=metric_name)

plt.title('Model Preformence')
plt.xlabel('EPOCH #')
plt.xticks(epoch_axis)
plt.show()

In [None]:
# RMSE between the training-row predictions and the reaction target.
reaction_train_rmse = RMSE(predictions, Y_1)
print('ROOT MEAN SQUARED ERROR: %f' % reaction_train_rmse)

In [None]:
# First ten actual/predicted pairs side by side.
reaction_preview = pd.DataFrame({
    "Actual Engagement Reaction Count": Y_1.values,
    'Prediction': predictions,
})
reaction_preview.head(10)

In [None]:
# RMSE on the held-out evaluation rows.
reaction_eval_predictions = FCNN_MODEL.predict(padded_docs_eval)
reaction_eval_rmse = RMSE(reaction_eval_predictions, Y_eval_1)
print('ROOT MEAN SQUARED ERROR: %f' % reaction_eval_rmse)

In [None]:
# Persist actual vs. predicted values, then draw seaborn's residual plot of
# the prediction column against the actual column.
results = pd.DataFrame({
    "Actual Engagement Reaction Count": Y_1.values,
    'Prediction': predictions,
})
results.to_csv('Predicted_Engagement.csv', index=False)

actual_reactions = results['Actual Engagement Reaction Count']
predicted_reactions = results['Prediction']
sns.residplot(x=actual_reactions, y=predicted_reactions)

<a id="3.4"></a>
<h3 style="background-color:orange;font-family:newtimeroman;font-size:200%;text-align:center;border-radius: 15px 50px;">Conclusion</h3>

<p style="text-align: left;"><span style='font-family: "Times New Roman", Times, serif; font-size: 24px;'>Overall the model performance is moderate, we had a fairly low RMSE on the data itself (probably due to some degree of overfitting even though a dropout layer was used)&nbsp;</span></p>
<p style="text-align: left;"><span style="font-size: 24px;"><span style="font-family: 'Times New Roman', Times, serif;">but when evaluating on data the model had not seen before, the performance drastically decreased.</span></span></p>
<p style="text-align: left;"><span style="font-size: 24px;"><span style="font-family: 'Times New Roman', Times, serif;"><br></span></span></p>
<p style="text-align: left;"><span style="font-family: 'Times New Roman', Times, serif;"><span style="font-size: 24px;">Future work should include embedding of other text features</span>&nbsp;</span></p>

<a id="3.4"></a>
<h1 style="background-color:orange;font-family:newtimeroman;font-size:250%;text-align:center;border-radius: 15px 50px;">Engagement Share Model</h1>

In [None]:
# Embedding -> Flatten -> Dropout -> two linear Dense layers, the last one
# producing a single regression output for the share count.
share_layers = [
    tf.keras.layers.Embedding(VOCAB_SIZE, VECTOR_SPACE, input_length=MAX_LENGTH),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dropout(0.5),
    Dense(units=13, activation='linear'),
    Dense(units=1, activation='linear'),
]
FCNN_MODEL = Sequential(share_layers)

FCNN_MODEL.compile(loss='mse', optimizer='adam', metrics=['mae'])

In [None]:
# Train on the share target; the held-out rows serve as validation data.
history = FCNN_MODEL.fit(
    padded_docs,
    Y_2,
    batch_size=150,
    epochs=10,
    validation_data=(padded_docs_eval, Y_eval_2),
    verbose=False,
)

In [None]:
# Predictions on the training rows, flattened to a 1-D array.
predictions = FCNN_MODEL.predict(padded_docs).reshape(-1)

<a id="3.4"></a>
<h3 style="background-color:orange;font-family:newtimeroman;font-size:200%;text-align:center;border-radius: 15px 50px;">Model Evaluation</h3>

In [None]:
# Training loss and MAE per epoch.
h = pd.DataFrame(history.history)[['loss','mae']]
epoch_axis = np.arange(0, len(h))

for metric_name in ('loss', 'mae'):
    sns.lineplot(x=epoch_axis, y=h[metric_name], label=metric_name)

plt.title('Model Preformence')
plt.xlabel('EPOCH #')
plt.xticks(epoch_axis)
plt.show()

In [None]:
# RMSE between the training-row predictions and the share target.
share_train_rmse = RMSE(predictions, Y_2)
print('ROOT MEAN SQUARED ERROR: %f' % share_train_rmse)

In [None]:
# First ten actual/predicted pairs side by side.
share_preview = pd.DataFrame({
    "Actual Engagement Share Count": Y_2.values,
    'Prediction': predictions,
})
share_preview.head(10)

In [None]:
# Persist actual vs. predicted values, then draw seaborn's residual plot of
# the prediction column against the actual column.
results = pd.DataFrame({
    "Actual Engagement Share Count": Y_2.values,
    'Prediction': predictions,
})
results.to_csv('Predicted_Share.csv', index=False)

actual_shares = results['Actual Engagement Share Count']
predicted_shares = results['Prediction']
sns.residplot(x=actual_shares, y=predicted_shares)

<a id="3.4"></a>
<h1 style="background-color:orange;font-family:newtimeroman;font-size:250%;text-align:center;border-radius: 15px 50px;">Engagement Comment Model</h1>

In [None]:
# Embedding -> Flatten -> Dropout -> two linear Dense layers, the last one
# producing a single regression output for the comment count.
comment_layers = [
    tf.keras.layers.Embedding(VOCAB_SIZE, VECTOR_SPACE, input_length=MAX_LENGTH),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dropout(0.5),
    Dense(units=13, activation='linear'),
    Dense(units=1, activation='linear'),
]
FCNN_MODEL = Sequential(comment_layers)

FCNN_MODEL.compile(loss='mse', optimizer='adam', metrics=['mae'])

In [None]:
# Train on the comment target; the held-out rows serve as validation data.
history = FCNN_MODEL.fit(
    padded_docs,
    Y_3,
    batch_size=150,
    epochs=10,
    validation_data=(padded_docs_eval, Y_eval_3),
    verbose=False,
)

In [None]:
# Predictions on the training rows, flattened to a 1-D array.
predictions = FCNN_MODEL.predict(padded_docs).reshape(-1)

<a id="3.4"></a>
<h3 style="background-color:orange;font-family:newtimeroman;font-size:200%;text-align:center;border-radius: 15px 50px;">Model Evaluation</h3>

In [None]:
# Training loss and MAE per epoch.
h = pd.DataFrame(history.history)[['loss','mae']]
epoch_axis = np.arange(0, len(h))

for metric_name in ('loss', 'mae'):
    sns.lineplot(x=epoch_axis, y=h[metric_name], label=metric_name)

plt.title('Model Preformence')
plt.xlabel('EPOCH #')
plt.xticks(epoch_axis)
plt.show()

In [None]:
# RMSE between the training-row predictions and the comment target.
comment_train_rmse = RMSE(predictions, Y_3)
print('ROOT MEAN SQUARED ERROR: %f' % comment_train_rmse)

In [None]:
# First ten actual/predicted pairs side by side.
comment_preview = pd.DataFrame({
    "Actual Engagement Comment Count": Y_3.values,
    'Prediction': predictions,
})
comment_preview.head(10)

In [None]:
# Persist actual vs. predicted values, then draw seaborn's residual plot of
# the prediction column against the actual column.
results = pd.DataFrame({
    "Actual Engagement Comment Count": Y_3.values,
    'Prediction': predictions,
})
results.to_csv('Predicted_Comment.csv', index=False)

actual_comments = results['Actual Engagement Comment Count']
predicted_comments = results['Prediction']
sns.residplot(x=actual_comments, y=predicted_comments)