# Using Embeddings for Financial Sentiment Classification
Trying out ELMo embeddings for classifying the sentiment of sentences in the Financial PhraseBank.

Recycling tutorial example from pymagnitude: https://colab.research.google.com/drive/1lOcAhIffLW8XC6QsKzt5T_ZqPP4Y9eS4. 
Note: the original example is very old (Python 2!) and its Keras model doesn't generalize well, so I averaged the word vectors of each sentence and used a good ol' Random Forest instead.

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os

In [None]:
!pip install -q pymagnitude #tensorflow keras
#glove: !curl -s http://magnitude.plasticity.ai/glove+subword/glove.6B.50d.magnitude --output vectors.magnitude
#word2vec: !curl -s http://magnitude.plasticity.ai/word2vec+subword/GoogleNews-vectors-negative300.magnitude --output vectors.magnitude
#fastText:  !curl -s http://magnitude.plasticity.ai/fasttext+subword/wiki-news-300d-1M.magnitude --output vectors.magnitude
#elmo light: !curl -s http://magnitude.plasticity.ai/elmo/light/elmo_2x1024_128_2048cnn_1xhighway_weights.magnitude --output vectors.magnitude
!curl -s http://magnitude.plasticity.ai/elmo/light/elmo_2x1024_128_2048cnn_1xhighway_weights.magnitude --output vectors.magnitude

In [None]:
from pymagnitude import *
from tqdm import notebook

In [None]:
# Length passed to Magnitude as `pad_to_length` below.
# NOTE(review): this name dates from the original sequence-model tutorial; the
# rest of this notebook averages token vectors per sentence instead.
MAX_WORDS = 30 # The maximum number of words the sequence model will consider
# Open the embedding file downloaded to ./vectors.magnitude by the curl cell.
# NOTE(review): avg_vec queries one flat token list per sentence; pad_to_length
# presumably only affects batched (2D) queries -- confirm in the pymagnitude docs.
vectors = Magnitude('./vectors.magnitude', pad_to_length = MAX_WORDS)

In [None]:
# Load the "Sentences_AllAgree" file of the Financial PhraseBank. Each line is
# split on '@' into two columns, named here `text` and `sentiment`.
PHRASEBANK_PATH = '/kaggle/input/sentiment-analysis-for-financial-news/FinancialPhraseBank/FinancialPhraseBank-v1.0/Sentences_AllAgree.txt'
df = pd.read_csv(
    PHRASEBANK_PATH,
    encoding="ISO-8859-1",
    names=['text', 'sentiment'],
    delimiter='@',
)
# Peek at the first two rows to check that the parse worked.
df.head(2)

In [None]:
def avg_vec(df):
    """Embed each sentence in ``df.text`` as the average of its token vectors.

    Every text is split on single spaces, the resulting token list is looked
    up in the module-level ``vectors`` object, and the returned vectors are
    averaged along axis 0. A tqdm notebook progress bar tracks the loop.

    Args:
        df: object exposing a ``text`` column of strings (e.g. a DataFrame).

    Returns:
        numpy.ndarray holding one averaged vector per input sentence
        (presumably shape ``(n_sentences, embedding_dim)``, assuming
        ``vectors.query`` returns one row per token -- TODO confirm).
    """
    sentences = notebook.tqdm(df.text.values)
    pooled = [
        np.average(vectors.query(sentence.split(' ')), axis=0)
        for sentence in sentences
    ]
    return np.array(pooled)

In [None]:
# Hold-out split: a seeded random 80% of the rows is used for training ...
train = df.sample(frac=0.8, random_state=42)
# ... and every row whose index label was not sampled becomes the test set.
test = df.drop(index=train.index)

In [None]:
# Turn both splits into feature matrices of averaged sentence vectors.
xTrn = avg_vec(train)
xTest = avg_vec(test)

### Modelling - Good Ol Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt

In [None]:
# Hyper-parameters for the random forest, kept in one place for easy tweaking.
forest_params = dict(
    n_estimators=100,
    random_state=0,        # fixed seed so repeated runs build the same forest
    max_features=0.5,
    max_depth=4,
    min_samples_split=5,
    oob_score=True,        # needed for the `forest.oob_score_` report after fitting
    n_jobs=-1,
    min_samples_leaf=50,
)
forest = RandomForestClassifier(**forest_params)

In [None]:
def oneHot(row):
    """Map a sentiment label to its integer class code.

    Despite the name, this is a plain label encoding rather than a one-hot
    vector: ``'negative'`` -> -1, ``'neutral'`` -> 0, ``'positive'`` -> +1.

    Args:
        row: a single sentiment label.

    Returns:
        int: -1, 0 or +1.

    Raises:
        ValueError: if ``row`` is not one of the three known labels. The
            previous version silently returned ``None`` here, which turned
            unexpected labels (or labels that had already been encoded by an
            earlier run of the cell) into missing targets.
    """
    codes = {'negative': -1, 'neutral': 0, 'positive': +1}
    try:
        return codes[row]
    except (KeyError, TypeError):
        # KeyError: unknown label; TypeError: unhashable value. Neither is a label.
        raise ValueError(
            f"Unknown sentiment label {row!r}; expected one of {sorted(codes)}"
        ) from None

In [None]:
# Encode the string labels of both splits as integers (see oneHot). Both
# encodings are computed first and only then written back over the columns.
train_codes = train.sentiment.apply(oneHot)
test_codes = test.sentiment.apply(oneHot)
train['sentiment'] = train_codes
test['sentiment'] = test_codes

In [None]:
# Train the forest on the averaged sentence vectors and their encoded labels.
forest.fit(X=xTrn, y=train.sentiment)

In [None]:
# Accuracy on the same data the forest was fitted on.
train_accuracy = forest.score(xTrn, train.sentiment)
print(f"Accuracy on training set: {train_accuracy:.3f}")

# Out-of-bag score (available because the forest was built with oob_score=True).
oldscore = forest.oob_score_
print(f'OOB score is {oldscore*100:.1f}%')

In [None]:
# Predict the held-out split and tabulate true vs. predicted labels.
y_predict = forest.predict(xTest)

# Pass the label order explicitly so the rows/columns of the matrix always
# line up with the tick labels below, even if a class is missing from this
# split. (The matrix used to be computed twice; the first result was discarded.)
class_labels = forest.classes_
cm = confusion_matrix(test.sentiment, y_predict, labels=class_labels)
print("Confusion matrix:\n{}".format(cm))


# Draw the matrix as a heat map: rows are true labels, columns are predictions.
plt.matshow(cm)
plt.title('Confusion matrix')
plt.colorbar()
# Label the axes with the actual class codes instead of bare 0..n-1 indices.
plt.xticks(range(len(class_labels)), class_labels)
plt.yticks(range(len(class_labels)), class_labels)
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.show()

In [None]:
# Accuracy on the held-out split.
test_accuracy = forest.score(xTest, test.sentiment)
print(f"Accuracy on test set: {test_accuracy:.3f}")

### Predicting with the Model
Now that the model has been trained, we can try it on individual test sentences: Magnitude converts each sentence into word vectors, which are averaged into a single vector (exactly as for the training data) and passed directly to the model for inference (prediction).

In [None]:
# Show the first held-out sentence; it is the one classified in the next cell.
test['text'].values[0]

In [None]:
# Embed the first held-out sentence the same way avg_vec does: split on
# single spaces, look the tokens up, and average the vectors along axis 0.
sample_tokens = test.text.values[0].split(' ')
sample_vector = np.average(vectors.query(sample_tokens), axis=0)
# predict() is given a one-element list, i.e. a batch containing this one sentence.
x = [sample_vector]
forest.predict(x)

In [None]:
# Encoded label of that same sentence, to compare with the prediction above.
test['sentiment'].values[0]