In [None]:
#Scientific Computing Libraries
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
np.random.seed(42)


#Feature engineering, processing and splitting libraries
from sklearn.preprocessing import StandardScaler, MinMaxScaler, PolynomialFeatures, RobustScaler, FunctionTransformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.linear_model import LassoCV, RidgeCV, ElasticNetCV
from sklearn.feature_selection import RFE, VarianceThreshold
from sklearn.model_selection import GridSearchCV, KFold, RandomizedSearchCV, StratifiedKFold,cross_val_score
from sklearn.metrics import r2_score
sns.set_context("paper", font_scale = 1, rc={"grid.linewidth": 3})
pd.set_option('display.max_rows', 100, 'display.max_columns', 400)
from scipy.stats import skew,boxcox_normmax
from scipy.special import boxcox1p
from sklearn.pipeline import make_pipeline, Pipeline, FeatureUnion
from sklearn.metrics import mean_squared_error
from sklearn.svm import SVR
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor

#Gradient Boosting and Bagging Techniques
from xgboost import XGBRFRegressor,XGBRegressor
from lightgbm import LGBMRegressor
from mlxtend.regressor import StackingCVRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, BaggingRegressor, ExtraTreesRegressor
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB, ComplementNB

#Text to numerical features - ML algorithms
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

#Pipelines
from sklearn.pipeline import Pipeline, FeatureUnion

#Sklearn metrics for classification
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report


#Undersampling and oversampling
from imblearn.under_sampling import NearMiss
from imblearn.over_sampling import RandomOverSampler
from imblearn.combine import SMOTETomek

#Saving models to file
import joblib, pickle

In [None]:
#Importing keras models and layers
from keras.models import Sequential
from keras.layers import Dense, Embedding, GlobalAveragePooling1D, SimpleRNN, GRU, LSTM, Dropout, SpatialDropout1D, Bidirectional, BatchNormalization


#Text preprocessing
from keras.preprocessing.text import Tokenizer, hashing_trick, one_hot
from keras.preprocessing.sequence import pad_sequences

In [None]:
#Reading the processed train and test CSV files from the mounted Drive folder
train_csv_path = '/content/drive/MyDrive/Datasets/Indian Financial News Headlines/data/processed/turney_v1.csv'
test_csv_path = '/content/drive/MyDrive/Datasets/Indian Financial News Headlines/data/processed/test_v1.csv'

train_data = pd.read_csv(train_csv_path)
test_data = pd.read_csv(test_csv_path)

In [None]:
#Preview the first rows of the training frame to check which columns were loaded
train_data.head()

Unnamed: 0.1,Unnamed: 0,Date,Title,Description,Year,Clean_Title,Clean_Desc,Combined_Text,Combined_Tag_Tokens,Title_Noun_Count,Title_Adverb_Count,Title_Adjective_Count,Title_Verb_Count,Jaccard_Index,num_words,num_unique_words,num_chars,num_words_upper,num_words_title,mean_word_len,Phrases,Positive_Hits,Negative_Hits,Label,Label_Final
0,0,2020-05-26,"ATMs to become virtual bank branches, accept d...","Close to 14.6 per cent (or 35,000) of the 240,...",2020,ATMs become virtual bank branch accept deposit...,close 14 6 per cent 35 000 240 000 ATMs India ...,ATMs become virtual bank branch accept deposit...,"[('ATMs', 'NNP'), ('become', 'VBP'), ('virtual...",10,1,2,2,0.041667,25,23,143,0,1,4.76,"['virtual bank', 'instant creditclose', 'new a...","[1, 1, 3]","[1, 1, 4]",-0.2592,0
1,1,2020-05-26,IDFC First Bank seniors to forgo 65% of bonus ...,"V Vaidyanathan, managing director and chief ex...",2020,IDFC First Bank senior forgo 65 bonus amid Cov...,V Vaidyanathan managing director chief executi...,IDFC First Bank senior forgo 65 bonus amid Cov...,"[('IDFC', 'NNP'), ('First', 'NNP'), ('Bank', '...",13,0,2,4,0.0,24,24,156,1,4,5.541667,"['senior forgo', 'managing director', 'directo...","[1, 5, 2, 10, 1]","[1, 11, 4, 8, 1]",-0.483969,0
2,2,2020-05-25,"Huge scam in YES Bank for many years, says Enf...",Rana Kapoor's wife also charged with abetting ...,2020,huge scam YES Bank many year say Enforcement D...,Rana Kapoor wife also charged abetting crime,huge scam YES Bank many year say Enforcement D...,"[('huge', 'JJ'), ('scam', 'NNS'), ('YES', 'NNP...",9,1,2,3,0.0,15,15,100,1,3,5.733333,"['huge scam', 'many year', 'also charged']","[1, 1, 1]","[1, 2, 1]",-0.454188,0
3,3,2020-05-24,"Bank of Maharashtra sanctioned Rs 2,789 cr in ...",The bank said it was now gearing up to extend ...,2020,Bank Maharashtra sanctioned Rs 2 789 cr loan M...,the bank said gearing extend stimulus package ...,Bank Maharashtra sanctioned Rs 2 789 cr loan M...,"[('Bank', 'NNP'), ('Maharashtra', 'NNP'), ('sa...",16,0,1,3,0.0,23,23,157,0,7,5.869565,"['bank said', 'gearing extend', 'package annou...","[11, 1, 1]","[11, 1, 1]",-0.120854,0
4,4,2020-05-23,DCB Bank's profit before tax declines 37.6% to...,Net profit for the financial year ended March ...,2020,DCB Bank profit tax decline 37 6 Rs 93 84 cror...,net profit financial year ended March 31 2020 ...,DCB Bank profit tax decline 37 6 Rs 93 84 cror...,"[('DCB', 'NNP'), ('Bank', 'NNP'), ('profit', '...",16,1,1,2,0.16,30,24,149,2,5,4.0,"['profit financial', 'financial year']","[1, 22]","[0, 22]",0.0,1


Steps for word embedding:


1.   One-Hot Encode the document corpus
2.   Pad the sequences of words for a fixed length sequence
3.   Apply the Keras **Word Embedding** layer.



Different types of Word Embeddings are available. Some of them include:
1. Gensim Word2Vec (learns word vectors by predicting words from their context; cosine similarity is then used to compare the learned vectors)
2. GloVe word embeddings
3. fastText embeddings

In [None]:
#Hold out 20% of the rows for validation; the split is shuffled and stratified
#on Label_Final so both parts keep the same class proportions.
X_train, X_valid, y_train, y_valid = train_test_split(
    train_data['Combined_Text'].values,
    train_data['Label_Final'],
    test_size=0.2,
    shuffle=True,
    stratify=train_data['Label_Final'],
)

In [None]:
# #Storing the corpus for appropriate Word Embedding
# corpus = []
# for row in range(len(train_data.index)):
#   corpus.append(' '.join(str(text) for text in train_data.loc[row, ['Combined_Text']]))

**One-Hot Representation**

In [None]:
#Pre-defining the vocabulary size, the fixed sentence length and the embedding dimension
#vocab_size: size of the hashing space for word indices, also the input dimension of the Embedding layers
vocab_size = 10000
#sent_length: number of word indices kept per sentence after padding/truncation
sent_length = 25
#embedding_vector_features: length of the dense vector the Embedding layers produce for each word index
embedding_vector_features = 40

In [None]:
#Hash every word of each sentence into an integer index bounded by vocab_size
#(index 0 is never assigned to a word, so it stays free for padding).
#one_hot() hashes with Python's built-in hash(), which is salted per interpreter
#session: the word -> index mapping would change on every kernel restart and the
#saved models could not be fed consistently encoded text afterwards.
#hashing_trick(..., hash_function='md5') applies the same text filtering with a stable hash.
one_hot_encode_train = [hashing_trick(_sent, vocab_size, hash_function='md5') for _sent in X_train]
one_hot_encode_valid = [hashing_trick(_sent, vocab_size, hash_function='md5') for _sent in X_valid]

In [None]:
#Bring every encoded sentence to exactly sent_length indices: shorter ones are
#zero-padded at the end, longer ones are cut from the front (truncating='pre'
#is the pad_sequences default, only spelled out here).
pad_options = dict(maxlen=sent_length, padding='post', truncating='pre')
embedded_encode_train = pad_sequences(one_hot_encode_train, **pad_options)
embedded_encode_valid = pad_sequences(one_hot_encode_valid, **pad_options)

In [None]:
#Largest word index that occurs in the padded training sequences
#(this is the highest index in use, not the number of distinct words)
embedded_list_train = list(embedded_encode_train)
max_values = [max(row) for row in embedded_list_train]
print(max(max_values))

9999


In [None]:
#Display the padded training matrix (one row per sentence, sent_length columns)
embedded_encode_train

array([[5216, 7006, 9225, ..., 4665,  347, 8342],
       [7266, 1166, 2317, ...,  353, 8265,    0],
       [7266, 2876, 8952, ...,    0,    0,    0],
       ...,
       [4131, 9971, 4549, ...,    0,    0,    0],
       [2755, 4515, 9971, ...,    0,    0,    0],
       [5850, 6586, 5313, ...,    0,    0,    0]], dtype=int32)

# Defining some base-models

1. Create the base LSTM model; the final summary statistics are calculated and plotted later.


In [None]:
#Try more models and save them later
#Baseline recurrent classifier: Embedding -> LSTM -> single sigmoid unit
base_model1 = Sequential()

#Embedding layer: turns each word index (below vocab_size) into a dense vector
#of embedding_vector_features values, for sequences of sent_length indices
base_model1.add(Embedding(vocab_size, embedding_vector_features, input_length=sent_length))

#LSTM layer with 40 units; it returns one vector per sentence, not one per token
base_model1.add(LSTM(40))

#Output layer: a single sigmoid unit for the binary Label_Final target
base_model1.add(Dense(1, activation='sigmoid'))

#Compiling the model for binary classification
base_model1.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

#summary() prints the layer table itself and returns None, so wrapping it in
#print() only appended a stray "None" line to the output
base_model1.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 25, 40)            400000    
_________________________________________________________________
lstm (LSTM)                  (None, 40)                12960     
_________________________________________________________________
dense (Dense)                (None, 1)                 41        
Total params: 413,001
Trainable params: 413,001
Non-trainable params: 0
_________________________________________________________________
None


In [None]:
#Sanity check: the training texts and labels must have the same number of samples
print(X_train.shape, y_train.shape, sep='\n')

(8000,)
(8000,)


In [None]:
#Display the raw training texts (the Combined_Text values selected by the split)
X_train

array(['no pre IPO placement HDFC Mutual Fund distributorthe mutual fund house manages asset Rs 3 trillion made private placement share worth Rs 1 5 billion',
       'SBI Research peg Q3 GDP growth low 5 8 FY17 6 6from next year growth could move faster demand come back faster post remonetisation',
       'SBI merger create banking powerhouseEmployee integration branch rationalisation major challenge',
       ...,
       'IDBI Bank expects balance sheet stability March 2019with capital infusion government turnaround plan place IDBI Bank expects stability return balance',
       'South indian Bank raise Rs 500 crore augment business capitalthe bank scrip closed 0 71 Rs 21 25 BSE',
       'RBI begin corrective action UCO Bank high bad loanUCO reported narrowing quarterly loss bad loan ratio'],
      dtype=object)

In [None]:
#Performing the training operation. We will calculate the final summary statistics later
#The History object is stored under a model-specific name as well, because the
#fit cell of base_model2 rebinds the shared `history` variable and the LSTM
#training curves would otherwise be lost before they can be plotted.
history_base_model1 = base_model1.fit(embedded_encode_train, y_train, validation_data=(embedded_encode_valid, y_valid), epochs=10, batch_size=64)
history = history_base_model1

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
# Just for testing purposes
# text = "It rained heavily today"
# oh_encoded = [one_hot(text, vocab_size)]
# embedded_encoded = pad_sequences(oh_encoded, padding='post', maxlen=sent_length)
# embedded_encoded

array([[ 326, 9358, 3442, 7860,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0]], dtype=int32)

In [None]:
#Save the base_model1 to file
#The target path has no file extension; the log output of this cell reports an
#assets/ folder written under it. Note that the "RNN_basic" name refers to the base LSTM model.
base_model1.save('/content/drive/MyDrive/Datasets/Indian Financial News Headlines/src/models/saved models/RNN_basic')



INFO:tensorflow:Assets written to: /content/drive/MyDrive/Datasets/Indian Financial News Headlines/src/models/saved models/RNN_basic/assets


INFO:tensorflow:Assets written to: /content/drive/MyDrive/Datasets/Indian Financial News Headlines/src/models/saved models/RNN_basic/assets


Here we can see that the model has overfitted on the training set. To reduce the overfitting, we need to regularise the network, for example by adding pooling and (spatial) dropout layers, and then tune the hyperparameters.

2. Basic NN model

In [None]:
#Instantiate NN model
base_model2 = Sequential()

#Embedding layer: (batch, sent_length) word indices -> (batch, sent_length, embedding_vector_features)
base_model2.add(Embedding(vocab_size, embedding_vector_features, input_length=sent_length))

#Dropout layer
base_model2.add(Dropout(0.3))

#Average the token vectors into a single vector per sentence. Without this step
#the Dense layers below act on every token position separately and the network
#emits sent_length predictions per sentence instead of one, which does not
#match the one-label-per-sentence target.
base_model2.add(GlobalAveragePooling1D())

#Adding Dense layer with ReLU activation
base_model2.add(Dense(6, activation='relu'))

#Adding a final Dense layer for binary classification
base_model2.add(Dense(1, activation='sigmoid'))

#Compiling the model for binary classification
base_model2.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

#summary() prints the layer table itself and returns None, so it is not wrapped in print()
base_model2.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 25, 40)            400000    
_________________________________________________________________
dropout (Dropout)            (None, 25, 40)            0         
_________________________________________________________________
dense_1 (Dense)              (None, 25, 6)             246       
_________________________________________________________________
dense_2 (Dense)              (None, 25, 1)             7         
Total params: 400,253
Trainable params: 400,253
Non-trainable params: 0
_________________________________________________________________
None


In [None]:
#Performing the training operation. We will calculate the final summary statistics later
#The History object is stored under a model-specific name as well, because the
#fit cell of base_model1 binds the same shared `history` variable; a dedicated
#name keeps the training curves of each model available for later plots.
history_base_model2 = base_model2.fit(embedded_encode_train, y_train, validation_data=(embedded_encode_valid, y_valid), epochs=10, batch_size=64)
history = history_base_model2

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
#Save the base_model2 to file
#The target path has no file extension; the log output of this cell reports an
#assets/ folder written under it.
base_model2.save('/content/drive/MyDrive/Datasets/Indian Financial News Headlines/src/models/saved models/NN_basic')

INFO:tensorflow:Assets written to: /content/drive/MyDrive/Datasets/Indian Financial News Headlines/src/models/saved models/NN_basic/assets


INFO:tensorflow:Assets written to: /content/drive/MyDrive/Datasets/Indian Financial News Headlines/src/models/saved models/NN_basic/assets


Hence we observe that the base ANN model achieves a higher accuracy than the base LSTM model, just by adding a Dropout layer, which reduces overfitting.

<!-- 3.Base Bidirectional LSTM model -->

In [None]:
# # instantiate RNN model
# rnn_model = Sequential()

# # add embedding layer 
# rnn_model.add(Embedding(vocab_size, embedding_vector_features, input_length=sent_length))


# #set the dropout layer to drop out 50% of the nodes, works well with sequence data
# rnn_model.add(SpatialDropout1D(0.5))

# # add bidirectional layer and pass in an LSTM()
# rnn_model.add(Bidirectional(LSTM(25, return_sequences=True)))

# # add normalization layer
# rnn_model.add(BatchNormalization())

# # add pooling layer 
# rnn_model.add(GlobalAveragePooling1D())

# # set the dropout layer to drop out 50% of the nodes
# rnn_model.add(Dropout(0.5))

# # add dense layer to produce an output dimension of 50 and using relu activation
# rnn_model.add(Dense(50, activation='relu'))

# # finally add a dense layer
# rnn_model.add(Dense(6, activation='sigmoid'))

# #Building the model and printing its statistics
# rnn_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# print(rnn_model.summary())