In [None]:
# https://appliedmachinelearning.blog/2019/10/31/event-vs-non-event-classification-in-financial-documents-text-classification/

In [1]:
# %load_ext nb_black

<IPython.core.display.Javascript object>

In [2]:
#!pip install --upgrade tensorflow keras

<IPython.core.display.Javascript object>

In [3]:
import pandas as pd
import numpy as np

<IPython.core.display.Javascript object>

In [4]:
import pickle
import numpy as np
from sklearn.svm import SVC
from sklearn import metrics
from itertools import product
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer

from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, LSTM
from keras.layers.embeddings import Embedding
from keras.preprocessing.text import Tokenizer
import keras.preprocessing.text as kpt

Using TensorFlow backend.


<IPython.core.display.Javascript object>

In [5]:
labelled_df = pd.read_excel(
    "30_companies_filings_event_nonevent_final_data.xlsx", index_col=False
)
labelled_df.head()

Unnamed: 0,cid,is_event,context,Date,line_id
0,354950,1,"Since April 19, 1984, our common stock has bee...","['April 19, 1984']",45
1,934549,1,"Acacia was incorporated on January 25, 1993 un...","['January 25, 1993']",2400
2,899629,1,Acadia Realty Trust (the Trust) was formed on ...,"['March 4, 1993']",4031
3,926282,1,"Pursuant to the program, on January 13, 1995, ...","['January 13, 1995']",2832
4,926282,1,"Pursuant to the program, on January 13, 1995, ...","['January 13, 1995']",2712


<IPython.core.display.Javascript object>

In [6]:
labelled_df["is_event"].value_counts()

0    1382
1    1046
2       5
3       1
Name: is_event, dtype: int64

<IPython.core.display.Javascript object>

In [7]:
labelled_df.columns

Index(['cid', 'is_event', 'context', 'Date', 'line_id'], dtype='object')

<IPython.core.display.Javascript object>

In [8]:
labelled_df["is_event"] = np.where(labelled_df["is_event"] >= 1.0, 1, 0)
labelled_df["is_event"].value_counts()

0    1382
1    1052
Name: is_event, dtype: int64

<IPython.core.display.Javascript object>

In [9]:
labelled_df = labelled_df.drop_duplicates(subset="context", keep="first")
labelled_df["is_event"].value_counts()

0    1319
1     891
Name: is_event, dtype: int64

<IPython.core.display.Javascript object>

In [10]:
text_lines = labelled_df["context"]
Y = labelled_df["is_event"]

text_lines[0]

'Since April 19, 1984, our common stock has been listed on the NYSE, trading under the symbol "HD".'

<IPython.core.display.Javascript object>

In [11]:
X_train, X_test, y_train, y_test = train_test_split(
    text_lines, Y, test_size=0.2, stratify=Y, random_state=42
)
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(X_train)
print("Shape of X = ", X.shape)
print("Size of labels = ", len(y_train))

Shape of X =  (1768, 4044)
Size of labels =  1768


<IPython.core.display.Javascript object>

In [12]:
# Initializing the model
clf = SVC(kernel="linear", class_weight="balanced", probability=True)

# Training the model
clf.fit(X, y_train)

SVC(C=1.0, cache_size=200, class_weight='balanced', coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='linear', max_iter=-1, probability=True, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

<IPython.core.display.Javascript object>

In [13]:
def convert_text_to_index_array(text):
    return [dictionary[word] for word in kpt.text_to_word_sequence(text)]


# Create Dictionary of words and their indices
max_words = 4000
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(text_lines)
dictionary = tokenizer.word_index

# Replace words of each text review to indices
allWordIndices = []
for num, text in enumerate(X_train):
    wordIndices = convert_text_to_index_array(text)
    if num == 0:
        print(text)
    allWordIndices.append(wordIndices)

# truncate and pad input sequences
max_sent_length = 100
train_X = sequence.pad_sequences(allWordIndices, maxlen=max_sent_length)

print("\n Word Index array \n")
print(allWordIndices[0])
print("\n Feature Matrix \n")
print(train_X[0, :])
print("\n Shape of feature matrix = ", train_X.shape)

At December 31, 2017, these investments included corporate bonds of $32.5 million, municipal fixed-rate bonds of $2.9 million, asset-backed bonds of $6.5 million, mortgage/agency-backed bonds of $5.5 million, U.S. government bonds of $14.3 million, and foreign government bonds of $0.7 million.

 Word Index array 

[18, 8, 10, 4, 202, 239, 126, 225, 462, 2, 931, 43, 12, 1641, 664, 81, 462, 2, 42, 86, 12, 384, 1642, 462, 2, 66, 43, 12, 1099, 1188, 1642, 462, 2, 43, 43, 12, 71, 60, 721, 462, 2, 210, 49, 12, 3, 211, 721, 462, 2, 24, 53, 12]

 Feature Matrix 

[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0   18    8   10    4  202  239  126  225
  462    2  931   43   12 1641  664   81  462    2   42   86   12  384
 1642  462    2   66   43   12 1099 1188 1642  462    2   43   43   12
   71   60  

<IPython.core.display.Javascript object>

In [14]:
embedding_length = 32
model = Sequential()
model.add(Embedding(max_words, embedding_length, input_length=max_sent_length))
model.add(LSTM(50))
model.add(Dense(1, activation="sigmoid"))
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
print(model.summary())
model.fit(train_X, y_train, epochs=10, batch_size=32)

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 100, 32)           128000    
_________________________________________________________________
lstm_1 (LSTM)                (None, 50)                16600     
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 51        
Total params: 144,651
Trainable params: 144,651
Non-trainable params: 0
_________________________________________________________________
None


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 1/10


InvalidArgumentError:  indices[12,78] = 4291 is not in [0, 4000)
	 [[node embedding_1/embedding_lookup (defined at /opt/conda/lib/python3.7/site-packages/tensorflow_core/python/framework/ops.py:1751) ]] [Op:__inference_keras_scratch_graph_1942]

Function call stack:
keras_scratch_graph


<IPython.core.display.Javascript object>

In [None]:
def plot_confusion_matrix(cm, cmap=plt.cm.coolwarm):
 
    """
    Description : This function prints and plots the confusion matrix.
    Arguments :   Confusion Matrix.
    """
 
    classes = [0, 1]
    title = 'Confusion matrix without normalization'
 
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)
 
    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")
 
    plt.tight_layout()    
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
 

In [None]:
def calculate_accuracy_metrics(y_test,y_pred):
 
    """
    Description: Calculates and print accuracy metrics like precision, recall, f1_score, accuracy
    Arguments: True and Predicted labels
    """
 
    precision = metrics.precision_score(y_test, y_pred)
    recall = metrics.recall_score(y_test, y_pred)
    fscore = metrics.f1_score(y_test, y_pred)
    print("Precision = %.2f" % precision, "Recall = %.2f" % recall)
    print("F1_score = %.2f" % fscore, "Accuracy = %.2f" % (np.sum(y_pred==y_test)/len(y_test)))

In [None]:
x_test = vectorizer.transform(X_test)
y_pred = clf.predict(x_test)
cm = confusion_matrix(y_test, y_pred)
calculate_accuracy_metrics(y_test, y_pred)
plot_confusion_matrix(cm)

In [None]:
# Replace words of each text review to indices
testWordIndices = []
for num,text in enumerate(X_test):
    wordIndices = convert_text_to_index_array(text)
    testWordIndices.append(wordIndices)
 
test_X = sequence.pad_sequences(testWordIndices, maxlen=max_sent_length)
 
y_pred = model.predict(test_X)
y_pred = np.where(y_pred &amp;gt; 0.5,1,0).ravel()
 
cm = confusion_matrix(y_test, y_pred)
calculate_accuracy_metrics(y_test, y_pred)
plot_confusion_matrix(cm)

In [None]:
Modelfilename = 'event_classification_model.pkl'
pickle.dump(clf, open(Modelfilename, 'wb'))
 
Vectorizerfilename = 'event_vectorizer_model.pkl'
pickle.dump(vectorizer, open(Vectorizerfilename, 'wb'))