Here, I have predicted the sentiment score of the Title and the Headline of the news articles.

The target columns are:

- `SentimentTitle`, which is the sentiment score of the Title
- `SentimentHeadline`, which is the sentiment score of the Headline

I have used pre-trained `GloVe` word embeddings and built a `BiLSTM` network to predict the sentiment scores.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, precision_recall_curve

import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from spacy.lang.en import English
from spacy.lang.en.stop_words import STOP_WORDS
# WordNet lemmatizer instance (not referenced by any other cell visible in this notebook).
lemma = WordNetLemmatizer()

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Bidirectional, LSTM, Dropout, BatchNormalization
from keras.layers.embeddings import Embedding

In [None]:
# Fetch NLTK's 'stopwords' corpus, which the stopwords.words('english') calls below read.
nltk.download('stopwords')

In [None]:
# Load the pre-trained GloVe vectors into a {word: vector} lookup.
# Each line of the file is parsed as "<word> <v1> <v2> ..." separated by whitespace.
embeddings_index = {}
# The context manager closes the file even if a line fails to parse, and the
# explicit encoding avoids depending on the platform's default codec.
with open('../input/glove-global-vectors-for-word-representation/glove.6B.50d.txt',
          encoding='utf-8') as glove_file:
    for line in glove_file:
        values = line.split()
        word = values[0]
        embeddings_index[word] = np.asarray(values[1:], dtype='float32')

In [None]:
# Read the training CSV into a DataFrame; every later cell works on `train`.
train = pd.read_csv('../input/news-popularity-in-multiple-social-media-platforms/train_file.csv')

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# Count missing values per column and show them as a table
# (one row per column of `train`).
missing_val = train.isna().sum().to_frame().reset_index()
missing_val

In [None]:
# Show the rows whose 'Source' value is missing.
train.loc[train['Source'].isna()]

In [None]:
# Remove rows containing missing values, modifying `train` in place.
# The index is not reset here; later cells read the data through `.values`.
train.dropna(inplace=True)

In [None]:
# Print the column dtypes and non-null counts after dropping incomplete rows.
train.info()

In [None]:
# Summary statistics, transposed so that each column of `train` becomes a row.
train.describe().T

In [None]:
# Number of articles per topic.
train['Topic'].value_counts()

### EDA & Data Visualization

**NOTE:** I used the same EDA as in my other notebook where I used Custom Transformers in scikit-learn

In [None]:
import nltk

# Stop words for the word clouds: NLTK's English list plus the topic names
# themselves (presumably so they do not dominate each cloud).
# NOTE: this rebinds the name `stopwords` to a plain list of strings.
topic_words = ['Palestinian', 'Palestine', 'Microsoft', 'Economy', 'Obama', 'Barack']
stopwords = nltk.corpus.stopwords.words('english') + topic_words

In [None]:
from wordcloud import WordCloud

# Word cloud built from every article title in the 'economy' topic.
plt.figure(figsize=(12, 6))
economy_titles = train.loc[train['Topic'] == 'economy', 'Title']
text = ' '.join(economy_titles)
wc = WordCloud(background_color='white', stopwords=stopwords).generate(text)
plt.imshow(wc)

In [None]:
from wordcloud import WordCloud

# Word cloud built from every article title in the 'obama' topic.
plt.figure(figsize=(12, 6))
obama_titles = train.loc[train['Topic'] == 'obama', 'Title']
text = ' '.join(obama_titles)
wc = WordCloud(background_color='white', stopwords=stopwords).generate(text)
plt.imshow(wc)

In [None]:
from wordcloud import WordCloud

# Word cloud built from every article title in the 'microsoft' topic.
plt.figure(figsize=(12, 6))
microsoft_titles = train.loc[train['Topic'] == 'microsoft', 'Title']
text = ' '.join(microsoft_titles)
wc = WordCloud(background_color='white', stopwords=stopwords).generate(text)
plt.imshow(wc)

In [None]:
from wordcloud import WordCloud

# Word cloud built from every article title in the 'palestine' topic.
plt.figure(figsize=(12, 6))
palestine_titles = train.loc[train['Topic'] == 'palestine', 'Title']
text = ' '.join(palestine_titles)
wc = WordCloud(background_color='white', stopwords=stopwords).generate(text)
plt.imshow(wc)

In [None]:
# Joint distribution of the two sentiment targets with a fitted regression line.
sns.set(style='darkgrid', palette='Set1')
joint_grid = sns.jointplot(x='SentimentTitle', y='SentimentHeadline', data=train, kind='reg')

# JointGrid.annotate() was deprecated and later removed from seaborn, so the
# Pearson correlation is computed explicitly and written onto the joint axes.
pearson_r, p_value = stats.pearsonr(train['SentimentTitle'], train['SentimentHeadline'])
joint_grid.ax_joint.annotate(
    'pearsonr = {:.2f}; p = {:.2g}'.format(pearson_r, p_value),
    xy=(0.05, 0.95), xycoords='axes fraction', ha='left', va='top',
)
plt.show()

In [None]:
# Bar graph exploring total sentiment for the different topics

# Select the two sentiment columns *before* aggregating, so the other columns of
# `train` (e.g. the Title/Headline text) are never summed.
sentiment_totals = train.groupby('Topic')[['SentimentHeadline', 'SentimentTitle']].sum()
# Two stacked series are drawn, so two colours are enough.
sentiment_totals.plot(kind='bar', figsize=(25, 7), stacked=True, color=['b', 'r']);

In [None]:
# Pairwise correlations between the Facebook / GooglePlus / LinkedIn columns
# and the two sentiment targets, drawn as an annotated heatmap.
corr_columns = ['Facebook', 'GooglePlus', 'LinkedIn', 'SentimentTitle', 'SentimentHeadline']
correlations = train[corr_columns].corr()

plt.figure(figsize=(15, 15))
_ = sns.heatmap(
    correlations,
    square=True,
    cmap='Blues',
    linewidths=0.5,
    linecolor='w',
    annot=True,
)
plt.title('Correlation matrix ')

plt.show()

### Separating Title and headline, so that they can be trained separately

In [None]:
# Text inputs are taken as 1-D arrays; targets are selected with a column list,
# so they come out as 2-D arrays with a single column.
X_train_title = train['Title'].values
y_train_title = train[['SentimentTitle']].values

X_train_headline = train['Headline'].values
y_train_headline = train[['SentimentHeadline']].values

### Creating separate dataframes for Title and Headline

In [None]:
# Pair each title with its sentiment target in a dedicated frame.
# NOTE(review): y_train_title was built with a column-list selection, so it is a
# 2-D single-column array being assigned to one column -- confirm this is accepted
# by the pandas version in use.
title_df=pd.DataFrame()
title_df['X_train_title']=X_train_title
title_df['y_train_title']=y_train_title

# Same layout for the headlines and their sentiment target.
headline_df=pd.DataFrame()
headline_df['X_train_headline']=X_train_headline
headline_df['y_train_headline']=y_train_headline

### Text preprocessing function

In [None]:
from nltk.corpus import stopwords
_english_stopwords_cache = None


def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, reading the corpus only once."""
    global _english_stopwords_cache
    if _english_stopwords_cache is None:
        _english_stopwords_cache = frozenset(stopwords.words('english'))
    return _english_stopwords_cache


def preprocess_text(texts, stop_words=None):
    """Normalise one piece of text before tokenisation.

    Steps, in order: lower-case the text, replace every run of non-ASCII
    characters with a space, split on whitespace, drop tokens that start with
    '@', drop stop words, and re-join the remaining tokens with single spaces.

    Args:
        texts: The string to clean (a single title or headline).
        stop_words: Optional container of lower-case words to remove. When
            omitted, NLTK's English stop-word list is used; it is loaded once
            and cached instead of being rebuilt for every token.

    Returns:
        The cleaned text as one space-separated string ('' if nothing is left).
    """
    # Resolve the stop-word container once per call, not once per token.
    stop_set = _english_stopwords() if stop_words is None else stop_words
    texts = re.sub(r'[^\x00-\x7F]+', ' ', texts.lower())
    # str.split() never yields empty tokens, so startswith() is always safe here.
    kept_words = [
        word for word in texts.split()
        if not word.startswith('@') and word not in stop_set
    ]
    return " ".join(kept_words)

### Applying preprocessor function to the title and headline text

In [None]:
# Clean every title with preprocess_text, store the result back, then preview it.
cleaned_titles = title_df['X_train_title'].apply(preprocess_text)
title_df['X_train_title'] = cleaned_titles
display(title_df.head())

In [None]:
# Clean every headline with preprocess_text, store the result back, then preview it.
cleaned_headlines = headline_df['X_train_headline'].apply(preprocess_text)
headline_df['X_train_headline'] = cleaned_headlines
display(headline_df.head())

### Using GloVe to create word embeddings for our Title and Headline columns

In [None]:
#Creating Embeddings for the titles

# Map every word of the cleaned titles to an integer id. One extra slot is
# added to the vocabulary size because word ids start at 1 (0 is left for padding).
tok_title = Tokenizer()
tok_title.fit_on_texts(title_df.X_train_title)
vocab_size_title = len(tok_title.word_index) + 1
encoded_title = tok_title.texts_to_sequences(title_df.X_train_title)

# Measure the longest *encoded* sequence. The Tokenizer filters/splits on
# punctuation, so its token count can differ from a plain whitespace split;
# sizing from the whitespace count could make pad_sequences silently truncate
# the longest titles.
max_len_title = max(len(sequence) for sequence in encoded_title)
padded_title = pad_sequences(encoded_title, maxlen=max_len_title, padding='post')

# Row i holds the GloVe vector of the word with id i. Words that GloVe does not
# know (and row 0) stay all-zero.
title_embedding_matrix = np.zeros((vocab_size_title, 50))
for word, i in tok_title.word_index.items():
    t_embedding_vector = embeddings_index.get(word)
    if t_embedding_vector is not None:
        title_embedding_matrix[i] = t_embedding_vector

In [None]:
#Creating Embeddings for the Headlines

# Map every word of the cleaned headlines to an integer id. One extra slot is
# added to the vocabulary size because word ids start at 1 (0 is left for padding).
tok_headline = Tokenizer()
tok_headline.fit_on_texts(headline_df.X_train_headline)
vocab_size_headline = len(tok_headline.word_index) + 1
encoded_headline = tok_headline.texts_to_sequences(headline_df.X_train_headline)

# Measure the longest *encoded* sequence. The Tokenizer filters/splits on
# punctuation, so its token count can differ from a plain whitespace split;
# sizing from the whitespace count could make pad_sequences silently truncate
# the longest headlines.
max_len_headline = max(len(sequence) for sequence in encoded_headline)
padded_headline = pad_sequences(encoded_headline, maxlen=max_len_headline, padding='post')

# Row i holds the GloVe vector of the word with id i. Words that GloVe does not
# know (and row 0) stay all-zero.
headline_embedding_matrix = np.zeros((vocab_size_headline, 50))
for word, i in tok_headline.word_index.items():
    h_embedding_vector = embeddings_index.get(word)
    if h_embedding_vector is not None:
        headline_embedding_matrix[i] = h_embedding_vector

### Creating training and testing sets from our data for both title and headline respectively. I have used 15% of the data for testing

In [None]:
# Hold out 15% of each dataset for validation. A fixed random_state makes the
# split -- and therefore the reported validation score -- repeatable across runs.
x_train_title, x_valid_title, Y_train_title, y_valid_title = train_test_split(
    padded_title, y_train_title, shuffle=True, test_size=0.15, random_state=42)

x_train_headline, x_valid_headline, Y_train_headline, y_valid_headline = train_test_split(
    padded_headline, y_train_headline, shuffle=True, test_size=0.15, random_state=42)

In [None]:
import math
from math import exp
from keras import backend as K

### Defining a custom activation function: a `tanh` whose input is scaled by 0.6, i.e. `tanh(0.6x)`

In [None]:
def mod_tanh(x):
    """Custom activation: apply the backend tanh to the input scaled by 0.6."""
    scaled = 0.6 * x
    return K.tanh(scaled)

### Defining separate LSTM Networks for Title and Headline

**Some key novelties in the network:**
- The loss function used for the network is `mean squared error`, the reason being that the output was required to be continuous
- The activation function used in the last layer of the network was a custom `tanh` function defined above, because the outputs were required in the range of [-1, 1]

In [None]:
# Model for title
# Layers, in order: an Embedding initialised from the GloVe matrix (kept trainable),
# three bidirectional LSTM layers each followed by Dropout and BatchNormalization,
# two ReLU Dense layers, and a single-unit output using the custom mod_tanh activation.
title_model = Sequential([
    Embedding(vocab_size_title, 50, input_length=max_len_title,
              weights=[title_embedding_matrix], trainable=True),
    Bidirectional(LSTM(20, return_sequences=True)),
    Dropout(0.3),
    BatchNormalization(),
    Bidirectional(LSTM(20, return_sequences=True)),
    Dropout(0.3),
    BatchNormalization(),
    # The last recurrent layer returns only its final state for the Dense head.
    Bidirectional(LSTM(20)),
    Dropout(0.3),
    BatchNormalization(),
    Dense(64, activation='relu'),
    Dense(64, activation='relu'),
    Dense(1, activation=mod_tanh),
])
title_model.compile(loss='mse', optimizer='adam', metrics=['mse', 'mae'])

In [None]:
# Model for Headline
# Layers, in order: an Embedding initialised from the GloVe matrix (kept trainable),
# three bidirectional LSTM layers each followed by Dropout and BatchNormalization,
# two ReLU Dense layers, and a single-unit output using the custom mod_tanh activation.
headline_model = Sequential([
    Embedding(vocab_size_headline, 50, input_length=max_len_headline,
              weights=[headline_embedding_matrix], trainable=True),
    Bidirectional(LSTM(20, return_sequences=True)),
    Dropout(0.3),
    BatchNormalization(),
    Bidirectional(LSTM(20, return_sequences=True)),
    Dropout(0.3),
    BatchNormalization(),
    # The last recurrent layer returns only its final state for the Dense head.
    Bidirectional(LSTM(20)),
    Dropout(0.3),
    BatchNormalization(),
    Dense(64, activation='relu'),
    Dense(64, activation='relu'),
    Dense(1, activation=mod_tanh),
])
headline_model.compile(loss='mse', optimizer='adam', metrics=['mse', 'mae'])

### Title model training

In [None]:
# Train the title model for 10 epochs on the training split only
# (no validation data is passed, so no per-epoch validation metrics are reported).
title_model.fit(x_train_title, Y_train_title, epochs = 10)

### Headline model training

In [None]:
# Train the headline model for 10 epochs on the training split only
# (no validation data is passed, so no per-epoch validation metrics are reported).
headline_model.fit(x_train_headline, Y_train_headline, epochs = 10)

### Now we shall predict on the validation sets and then see what score we obtain

In [None]:
# Predict title sentiment for the held-out validation titles.
title_valid_pred = title_model.predict(x_valid_title)

In [None]:
# Predict headline sentiment for the held-out validation headlines.
headline_valid_pred = headline_model.predict(x_valid_headline)

### Calculating the Mean Absolute errors for both Title and Headline sentiments

In [None]:
from sklearn.metrics import mean_absolute_error

# Mean absolute error of each model's predictions on its validation split.
mae_title = mean_absolute_error(y_true=y_valid_title, y_pred=title_valid_pred)
mae_headline = mean_absolute_error(y_true=y_valid_headline, y_pred=headline_valid_pred)

### Here we calculate our final score. The score is calculated as

**max(0, 1 - (0.4 × (mean abs error of title) + 0.6 × (mean abs error of headline)))**

In [None]:
# Final score: 1 minus the weighted MAE (40% title, 60% headline), floored at 0
# as the stated formula requires, so large errors cannot produce a negative score.
score = max(0.0, 1 - ((0.4 * mae_title) + (0.6 * mae_headline)))

In [None]:
# Report the score both as a raw value and as a percentage rounded to 2 decimals.
score_percent = round(score*100, 2)
print(f"Score = {score} \nScore(out of 100%) = {score_percent}%")

### We achieved a score of 93.15%

### This score is an indication of how close our predicted values were to the target values. It cannot exactly be termed accuracy, because this is not a classification problem. Our sentiment score is a real number between -1 and 1.