## Prerequisites and presentation functions

## Source: https://medium.com/swlh/a-simple-guide-on-using-bert-for-text-classification-bbf041ac8d04 (adapted from the version used for binary classification in mlt)

In [19]:
# Python ≥3.5 is required
import re
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required
import sklearn
# Compare the numeric (major, minor) pair rather than the raw version string:
# a lexicographic string comparison would wrongly accept e.g. "0.3" or "0.9"
# because "0.3" > "0.20" as text.
_sklearn_version = re.match(r"(\d+)\.(\d+)", sklearn.__version__)
assert _sklearn_version and tuple(map(int, _sklearn_version.groups())) >= (0, 20)

# Common imports
import pandas as pd
import numpy as np
import os

# figure plotting
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Figures are written to <PROJECT_ROOT_DIR>/images/<CHAPTER_ID>
PROJECT_ROOT_DIR, CHAPTER_ID = ".", "figures"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
# exist_ok=True makes re-running this cell harmless when the folder already exists
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    """Save the current matplotlib figure into IMAGES_PATH.

    Parameters
    ----------
    fig_id : str
        Base file name (without extension); also echoed in the log line.
    tight_layout : bool, default True
        When true, ``plt.tight_layout()`` is applied before saving.
    fig_extension : str, default "png"
        File extension, also passed to ``plt.savefig`` as ``format``.
    resolution : int, default 300
        Passed to ``plt.savefig`` as ``dpi``.
    """
    file_name = ".".join((fig_id, fig_extension))
    target = os.path.join(IMAGES_PATH, file_name)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(target, format=fig_extension, dpi=resolution)

In [20]:
def plot_confusion_matrix(cm, classes, title, normalize=False, cmap=plt.cm.Blues):
    """
    Draw a confusion matrix as an annotated heat map on the current figure.

    Adapted from the scikit-learn example:
    http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html

    Parameters
    ----------
    cm : array-like, 2-D
        Confusion matrix. Rows are drawn on the y axis ("True label") and
        columns on the x axis ("Predicted label").
    classes : sequence
        Tick labels, one per class, used for both axes.
    title : str
        Figure title.
    normalize : bool, default False
        When true, each row is divided by its row sum before plotting, and
        cell values are shown with two decimals. Rows that sum to zero are
        shown as 0 rather than NaN.
    cmap : matplotlib colormap, default ``plt.cm.Blues``
        Colormap for the heat map.

    Notes
    -----
    Prints one line stating whether normalization was applied. Nothing is
    returned; the plot is drawn through the pyplot state machine.
    """
    cm = np.asarray(cm)

    # Normalize BEFORE drawing so that the heat-map colours, the colour bar
    # and the cell annotations all describe the same values.
    if normalize:
        row_totals = cm.sum(axis=1, keepdims=True)
        cm = np.divide(
            cm.astype('float'),
            row_totals,
            out=np.zeros(cm.shape, dtype=float),
            where=row_totals != 0,  # leave all-zero rows at 0 instead of NaN
        )
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Cells darker than the midpoint get white text so the number stays readable.
    thresh = cm.max() / 2.
    # np.ndindex walks (row, col) pairs in row-major order using only numpy,
    # which is already imported by the notebook (itertools was not).
    for i, j in np.ndindex(*cm.shape):
        value = cm[i, j]
        plt.text(j, i, format(value, '.2f') if normalize else value,
                 horizontalalignment="center",
                 color="white" if value > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    # Run the layout pass last so it accounts for the axis labels as well.
    plt.tight_layout()

In [21]:
# Load the phrase-bank text file: '@' is the field separator and the file has
# no header row, so the two column names are supplied explicitly.
df_fpb = pd.read_csv(
    "./data/financial-phrase-bank-v1.0/Sentences_66Agree.txt",
    sep='@',
    encoding='latin-1',
    names=['Text', 'Rating'],
)

In [22]:
df_fpb.head()

Unnamed: 0,Text,Rating
0,"According to Gran , the company has no plans t...",neutral
1,Technopolis plans to develop in stages an area...,neutral
2,With the new production plant the company woul...,positive
3,According to the company 's updated strategy f...,positive
4,"For the last quarter of 2010 , Componenta 's n...",positive


In [23]:
len(df_fpb)

4217

In [24]:
"""Changed the getlabel function in binaryprocessor class to have 3 labels, negative, neutral, positive"""
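# Disabled: manual integer encoding of the labels (negative=0, neutral=1, positive=2).
# Kept for reference only; 'Rating' stays as strings in the cells that follow.
#df_fpb['Rating'] = df_fpb['Rating'].replace('negative',0)
#df_fpb['Rating'] = df_fpb['Rating'].replace('neutral',1)
#df_fpb['Rating'] = df_fpb['Rating'].replace('positive',2)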

'Changed the getlabel function in binaryprocessor class to have 3 labels, negative, neutral, positive'

In [25]:
# Shuffle the rows with a fixed seed so the resulting order is the same on every run.
# NOTE(review): this rebinds df_fpb, so re-running the cell shuffles the already-shuffled frame.
df_fpb = sklearn.utils.shuffle(df_fpb, random_state=42)


In [26]:
df_fpb

Unnamed: 0,Text,Rating
463,Tielinja generated net sales of 7.5 mln euro $...,neutral
2426,"Cohen & Steers , Inc. : 5 534 626 shares repre...",neutral
2661,"SAN FRANCISCO ( MarketWatch ) -- Nokia Corp , ...",neutral
1483,Raute said it has won an order worth around 15...,positive
2860,"The power supplies , DC power systems and inve...",neutral
...,...,...
3444,To see a slide show of all the newest product ...,neutral
466,"Under the rental agreement , Stockmann was com...",neutral
3092,"Eero Katajavuori , currently Group Vice Presid...",neutral
3772,The floor area of the Yliopistonrinne project ...,neutral


In [27]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. random_state is fixed (same seed value as
# the earlier shuffle) so the train/test partition is identical on every run;
# without it each kernel restart produced a different split.
df_train_unprocessed, df_test_unprocessed = train_test_split(
    df_fpb, test_size=0.2, random_state=42
)

## BERT

In [28]:
# Build the training frame with columns (id, label, alpha, text):
# a running row number, the rating, a constant 'a' filler and the sentence.
# NOTE(review): presumably the column layout the BERT data processor expects - confirm.
n_train_rows = df_train_unprocessed.shape[0]
train_df_bert = pd.DataFrame(
    {
        'id': range(n_train_rows),
        'label': df_train_unprocessed['Rating'],
        'alpha': ['a'] * n_train_rows,
        'text': df_train_unprocessed['Text'],
    }
)

In [29]:
train_df_bert

Unnamed: 0,id,label,alpha,text
1029,0,neutral,a,Chief Financial Officer Jim Heindlmeyer said B...
1651,1,negative,a,Danish company FLSmidth has acknowledged that ...
4062,2,negative,a,In a separate announcement to the Helsinki sto...
1011,3,neutral,a,Also Chile is an important market area for for...
3538,4,negative,a,"In October , UPM reported a third-quarter net ..."
...,...,...,...,...
81,3368,positive,a,Finnish Aktia Group 's operating profit rose t...
2552,3369,neutral,a,Jon Risfelt has previously held operational ex...
763,3370,positive,a,Nokia controls more than 50 percent of phone s...
4156,3371,neutral,a,`` Low energy consumption and flexible loading...


In [30]:
# Build the test frame with columns (id, label, alpha, text):
# a running row number, the rating, a constant 'a' filler and the sentence.
# NOTE(review): presumably the column layout the BERT data processor expects - confirm.
n_test_rows = df_test_unprocessed.shape[0]
test_df_bert = pd.DataFrame(
    {
        'id': range(n_test_rows),
        'label': df_test_unprocessed['Rating'],
        'alpha': ['a'] * n_test_rows,
        'text': df_test_unprocessed['Text'],
    }
)

In [31]:
test_df_bert

Unnamed: 0,id,label,alpha,text
357,0,positive,a,The group 's 12-month operating profit grew 31...
2657,1,neutral,a,"s business sectors are building construction ,..."
2614,2,neutral,a,Okmetic has used the furnaces for the contract...
2303,3,neutral,a,JVC will stop producing DVD players in Brazil ...
895,4,neutral,a,All are welcome .
...,...,...,...,...
827,839,positive,a,"As a result , the distribution companies will ..."
1839,840,neutral,a,"The agreement , which will cover monitoring , ..."
3420,841,neutral,a,The unit 's clients are mainly in the field of...
2036,842,neutral,a,"Cramo , headquartered in Vantaa , Finland , re..."


In [32]:
# Report, one value per line: longest test sentence (in characters),
# number of test rows, number of training rows.
print(
    test_df_bert['text'].str.len().max(),
    len(test_df_bert),
    len(train_df_bert),
    sep="\n",
)

296
844
3373
