# Computing Sentiment

In [2]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras import layers
from tensorflow.keras.layers import Input, Bidirectional, LSTM, Dense, TimeDistributed, Dropout

from plot_model import plot_model

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from transformers import TFBertModel, BertTokenizer

2023-06-08 10:25:27.716240: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
# Single source of truth for the pretrained checkpoint: the tokenizer vocabulary
# and the encoder weights are both loaded from this one name, so they cannot
# accidentally drift apart when the checkpoint is changed.
BERT_CHECKPOINT = 'bert-base-uncased'

tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)
bert_model = TFBertModel.from_pretrained(BERT_CHECKPOINT)

2023-06-08 10:25:38.035835: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from 

In [74]:
def custom_tokenizer(sentences, tokenizer, seq_len = 256):
    '''
    Convert a batch of sentences into fixed-length BERT inputs.

    Every sentence is tokenized, wrapped as [CLS] ... [SEP], converted to ids and
    then truncated or zero-padded to exactly seq_len ids. Texts that are too long
    are truncated (never split into several rows); the closing [SEP] id is kept
    as the last id of a truncated row.

    We used seq_len = 256 while training the FSAB classification model.

    Parameters
    ----------
    sentences : sized iterable
        One entry per sentence. Each entry is either a string tensor (anything
        exposing .numpy() that yields bytes or str) or a plain bytes / str value.
    tokenizer : object
        Must provide tokenize(text) and convert_tokens_to_ids(tokens).
    seq_len : int, default 256
        Number of ids per row; must be at least 2 so that [CLS] and [SEP] fit.

    Returns
    -------
    (input_ids, attention_mask)
        Two separate int32 tensors, each reshaped to [len(sentences), seq_len].
        attention_mask is 1 for real tokens and 0 for padding.

    Raises
    ------
    ValueError
        If seq_len is smaller than 2.
    '''
    if seq_len < 2:
        raise ValueError('seq_len must be at least 2 to hold [CLS] and [SEP]')

    batch_size = len(sentences)
    input_ids = []       # flat list of ids for the whole batch, reshaped at the end
    attention_mask = []  # flat list of mask values, parallel to input_ids

    for sentence in sentences:

        # Read the text: tensors are unwrapped with .numpy(), bytes are decoded.
        text = sentence.numpy() if hasattr(sentence, 'numpy') else sentence
        if isinstance(text, bytes):
            text = text.decode()

        # Prepend / append the classification and separator tokens.
        tokens = ['[CLS]'] + tokenizer.tokenize(text) + ['[SEP]']
        sentence_ids = tokenizer.convert_tokens_to_ids(tokens)

        # Truncation: keep the first seq_len - 1 ids and re-attach the [SEP] id.
        # This must act on the ids of the current sentence only, never on the
        # ids already accumulated for previous sentences.
        if len(sentence_ids) > seq_len:
            sentence_ids = sentence_ids[:seq_len - 1] + sentence_ids[-1:]

        # Padding: id 0 with mask 0 up to seq_len.
        n_pad = seq_len - len(sentence_ids)
        input_ids.extend(sentence_ids + [0] * n_pad)
        attention_mask.extend([1] * len(sentence_ids) + [0] * n_pad)

    # Explicit int32 keeps the dtype stable even for an empty batch.
    input_ids = tf.reshape(tf.convert_to_tensor(input_ids, dtype=tf.int32), [batch_size, seq_len])
    attention_mask = tf.reshape(tf.convert_to_tensor(attention_mask, dtype=tf.int32), [batch_size, seq_len])

    return input_ids, attention_mask # We are passing 2 separate tensors


In [59]:
# Load the news headlines table; the path is relative to the notebook's working directory
news_df = pd.read_csv('MSFT_news_2020_onwards.csv')

In [70]:
# Missing cells are read from the CSV as NaN; turn them into empty strings so that
# every headline cell holds text before the columns are converted to string tensors.
news_df = news_df.fillna('')

We will use only the first 5 headlines because, upon inspection, the later headlines are often not relevant to Microsoft. Keeping irrelevant headlines may negatively impact the performance of our model.

In [72]:
# Wrap each of the five headline columns in a TensorFlow constant (one tensor per column)

h1, h2, h3, h4, h5 = (
    tf.constant(news_df[column])
    for column in ('headline1', 'headline2', 'headline3', 'headline4', 'headline5')
)

In [75]:
# Tokenize every headline tensor into an (input_ids, attention_mask) pair for BERT

h1_bert, h2_bert, h3_bert, h4_bert, h5_bert = (
    custom_tokenizer(headlines, tokenizer)
    for headlines in (h1, h2, h3, h4, h5)
)

In [76]:
# Build a tf.data dataset over the first headline's BERT inputs. Each element is a
# dict with the keys 'input_ids' and 'attention_mask', sliced row by row.

h1_dataset = tf.data.Dataset.from_tensor_slices(
    dict(zip(('input_ids', 'attention_mask'), h1_bert))
)

In [77]:
# Cache the elements, group them into batches of 64 and let tf.data pick the
# prefetch buffer size automatically.
h1_dataset = (
    h1_dataset
    .cache()
    .batch(64)
    .prefetch(tf.data.AUTOTUNE)
)

# Loading the trained model

In [78]:
# Creating the model
# The classifier graph is rebuilt in code here; its weights are loaded from disk afterwards.
# NOTE(review): layer order and names presumably have to match the graph that produced
# the saved weights — confirm before reordering or renaming anything below.

# Fixed input length of the model; the Input layers below only accept rows of this length
seq_len = 256

# Two int32 inputs of length seq_len; the names equal the dict keys of the dataset elements
input_ids = Input(shape=(seq_len,), dtype=tf.int32, name='input_ids')
attention_mask = Input(shape=(seq_len,), dtype=tf.int32, name='attention_mask')

# Despite its name, this holds the whole output object returned by bert_model;
# index 1 is presumably the pooled sentence representation (pooler_output) — TODO confirm
pooled_output = bert_model(input_ids, attention_mask=attention_mask)
dropout_layer = Dropout(0.2)(pooled_output[1])

# Classification head: a 128-unit tanh layer followed by dropout
hidden_layer = Dense(128,activation='tanh',name = 'hidden_layer')(dropout_layer)
dropout_layer_2 = Dropout(0.2)(hidden_layer)

# 3-way softmax output; the class order is presumably (negative, neutral, positive) —
# verify against the training notebook
classification_layer=Dense(3,activation=tf.nn.softmax, name = 'output_layer')(dropout_layer_2)

# Functional model mapping the two inputs to the 3 class probabilities
FSAB_model = tf.keras.Model(inputs=[input_ids,attention_mask], outputs=classification_layer)

In [81]:
# Loading weights
#
# The checkpoint location can be overridden with the FSAB_WEIGHTS_PATH environment
# variable so the notebook is not tied to one machine's directory layout. When the
# variable is unset, the original local path is used.

FSAB_WEIGHTS_PATH = os.environ.get(
    'FSAB_WEIGHTS_PATH',
    '/Users/pushkal/Documents/work/Industry/preparation/Programming/Projects/Sentiment_analysis_BERT/notebooks/saved_weights/FSAB_05_25_2023_weights',
)

FSAB_model.load_weights(FSAB_WEIGHTS_PATH)

<tensorflow.python.checkpoint.checkpoint.CheckpointLoadStatus at 0x7fd8ead25750>

In [83]:
# Freeze all layers; the cells visible in this notebook only call predict on the model
FSAB_model.trainable = False
# Print the layer-by-layer architecture with parameter counts
FSAB_model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 256)]        0           []                               
                                                                                                  
 attention_mask (InputLayer)    [(None, 256)]        0           []                               
                                                                                                  
 tf_bert_model (TFBertModel)    TFBaseModelOutputWi  109482240   ['input_ids[0][0]',              
                                thPoolingAndCrossAt               'attention_mask[0][0]']         
                                tentions(last_hidde                                               
                                n_state=(None, 256,                                           

In [84]:
# Run the model over the batched first-headline dataset; each output row holds the
# 3 softmax values produced by 'output_layer' for one headline
sentiment1 = FSAB_model.predict(h1_dataset)



In [94]:
# Inspect the predictions for the last 10 rows
sentiment1[-10:]

array([[3.4670238e-04, 9.9414921e-01, 5.5041686e-03],
       [6.9052307e-04, 9.4470626e-01, 5.4603208e-02],
       [9.5169619e-03, 1.8551974e-02, 9.7193110e-01],
       [3.7030253e-04, 9.9285829e-01, 6.7713992e-03],
       [1.6079340e-02, 1.7321566e-02, 9.6659911e-01],
       [3.8657314e-04, 9.9735332e-01, 2.2601369e-03],
       [1.2095975e-03, 9.9602216e-01, 2.7682786e-03],
       [9.7170579e-01, 1.9373124e-02, 8.9210542e-03],
       [3.3465607e-04, 9.9836272e-01, 1.3025805e-03],
       [9.1149890e-01, 7.2986923e-02, 1.5514221e-02]], dtype=float32)

In [95]:
# Print the headline1 text at the last 10 positions of the prediction array
for row_pos in range(len(sentiment1) - 10, len(sentiment1)):
    print(news_df['headline1'].iloc[row_pos])

Brad Smith, Microsoft president, says he believes A.I. regulation will happen in the coming year
S. Koreas antitrust regulator approves Microsofts takeover of 
BHP unleashes the power of digital at worlds largest copper 
Microsoft Work Trend Index 2023: Singapore data unveils
Microsoft signs deal for A.I. computing power with Nvidia-backed CoreWeave that could be worth billions
Microsoft pens AI cloud computing deal with former Ethereum miner CoreWeave: CNBC
Goodbye, Cortana: Microsoft’s lively voice assistant will soon leave Windows
Microsoft faces uninsurable GDPR penalty
FTC Will Require Microsoft to Pay $20 million over Charges it
Microsoft to pay $20 million FTC fine over storage of Xbox information


## Comments:

Due to the limited processing capabilities of the local computer, we will run the code on Google Colab and save the sentiments to a file. We will use that file as data when training the final model for stock price prediction.

Below, we read the sentiment dataframe created on Google Colaboratory.

In [96]:
# The CSV location can be overridden with the SENTIMENT_CSV_PATH environment variable
# so the notebook is not tied to one machine's directory layout. When the variable is
# unset, the original local path is used.
SENTIMENT_CSV_PATH = os.environ.get(
    'SENTIMENT_CSV_PATH',
    '/Users/pushkal/Documents/GitHub/stock_price_prediction/news_data/msft_sentiment_values_colab/sentiment_data_frame.csv',
)
sentiment_df = pd.read_csv(SENTIMENT_CSV_PATH)

In [98]:
# Use the 'date' column as the index. The guard keeps the cell safe to re-run:
# once 'date' is already the index it is no longer a column, and calling
# set_index('date') again would raise a KeyError.
if sentiment_df.index.name != 'date':
    sentiment_df = sentiment_df.set_index('date')
sentiment_df

Unnamed: 0_level_0,negative,neutral,positive
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2020-1-1,0.184596,0.610093,0.205312
2020-1-2,0.011839,0.962005,0.026156
2020-1-3,0.078713,0.418107,0.503180
2020-1-4,0.021134,0.623070,0.355795
2020-1-5,0.182176,0.718622,0.099203
...,...,...,...
2023-6-2,0.003831,0.791215,0.204954
2023-6-3,0.143502,0.853986,0.002513
2023-6-4,0.195494,0.800462,0.004044
2023-6-5,0.207483,0.772059,0.020458
