## Pre-requirements and presentation functions

## https://simpletransformers.ai/docs/usage/#loading-a-local-save

xlnet based cased - 50 - 78% on 2nd epoch

In [1]:
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required
import sklearn
assert sklearn.__version__ >= "0.20"

# Common imports
import pandas as pd
import numpy as np
import os

# figure plotting
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Where to save the figures
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "figures"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    path = os.path.join(IMAGES_PATH, fig_id + "." + fig_extension)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)

In [2]:
def plot_confusion_matrix(cm, classes, title, normalize=False, cmap=plt.cm.Blues):
    """
    See full source and example: 
    http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html
    
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, cm[i, j],
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label') 
    plt.title(title)

In [16]:
df_fpb = pd.read_csv("./data/financial-phrase-bank-v1.0/Sentences_66Agree.txt", sep='@',encoding='latin-1', names=['Text','Rating'])

In [17]:
df_fpb.head()

Unnamed: 0,Text,Rating
0,"According to Gran , the company has no plans t...",neutral
1,Technopolis plans to develop in stages an area...,neutral
2,With the new production plant the company woul...,positive
3,According to the company 's updated strategy f...,positive
4,"For the last quarter of 2010 , Componenta 's n...",positive


In [18]:
len(df_fpb)

4217

In [19]:
df_fpb = sklearn.utils.shuffle(df_fpb, random_state=42)


In [20]:
df_fpb.head()

Unnamed: 0,Text,Rating
463,Tielinja generated net sales of 7.5 mln euro $...,neutral
2426,"Cohen & Steers , Inc. : 5 534 626 shares repre...",neutral
2661,"SAN FRANCISCO ( MarketWatch ) -- Nokia Corp , ...",neutral
1483,Raute said it has won an order worth around 15...,positive
2860,"The power supplies , DC power systems and inve...",neutral


In [21]:
"""Changed the getlabel function in binaryprocessor class to have 3 labels, negative, neutral, positive"""
df_fpb['Rating'] = df_fpb['Rating'].replace('negative',0)
df_fpb['Rating'] = df_fpb['Rating'].replace('neutral',1)
df_fpb['Rating'] = df_fpb['Rating'].replace('positive',2)

In [22]:
df_fpb

Unnamed: 0,Text,Rating
463,Tielinja generated net sales of 7.5 mln euro $...,1
2426,"Cohen & Steers , Inc. : 5 534 626 shares repre...",1
2661,"SAN FRANCISCO ( MarketWatch ) -- Nokia Corp , ...",1
1483,Raute said it has won an order worth around 15...,2
2860,"The power supplies , DC power systems and inve...",1
...,...,...
3444,To see a slide show of all the newest product ...,1
466,"Under the rental agreement , Stockmann was com...",1
3092,"Eero Katajavuori , currently Group Vice Presid...",1
3772,The floor area of the Yliopistonrinne project ...,1


In [33]:
from sklearn.model_selection import train_test_split

df_train, df_test = train_test_split(df_fpb, test_size=0.2, random_state=42)

In [24]:
df_train

Unnamed: 0,Text,Rating
3979,"Operating loss totalled EUR 0.9 mn , down from...",0
1446,Under a memorandum of understanding MoU the pa...,2
2941,There will be return flights from Stuttgart ev...,1
3284,The commission found evidence of several meeti...,1
1493,We are glad that our long co-operation with SO...,2
...,...,...
3804,All the ferries had run into trouble just outs...,0
217,Neste Oil Corp. has signed long-term procureme...,2
531,"Bilfinger investors cheered the agreement , pu...",2
4,"For the last quarter of 2010 , Componenta 's n...",2


## BERT

In [25]:
from simpletransformers.classification import ClassificationModel

In [26]:
# Create a ClassificationModel
cuda_available = torch.cuda.is_available()

model = ClassificationModel(
    "roberta", "distilroberta-base", num_labels=3, args={"reprocess_input_data": True, "overwrite_output_dir": False, "num_train_epochs":4, "evaluate_during_training_verbose":True, "evaluate_during_training":True, "manual_seed":42}, use_cuda=cuda_available
)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

In [27]:
# Train the model
model.train_model(df_train)



HBox(children=(FloatProgress(value=0.0, max=7.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Epoch', max=1.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Running Epoch 0 of 1', max=422.0, style=ProgressStyle(des…









(422, 0.4697775439584354)

In [28]:
# Evaluate the model
result, model_outputs, wrong_predictions = model.eval_model(df_test)



HBox(children=(FloatProgress(value=0.0, description='Running Evaluation', max=106.0, style=ProgressStyle(descr…




In [29]:
result

{'mcc': 0.8160872702388109, 'eval_loss': 0.30303615147142476}

In [30]:
model_outputs

array([[-2.93231106,  1.4572773 ,  0.59554595],
       [-1.75883698, -1.97015369,  3.08119631],
       [-2.82724929,  3.23411942, -0.96780473],
       ...,
       [-3.31532764,  1.7296412 ,  0.99276274],
       [-1.89835274, -1.85678518,  3.12484837],
       [-3.48191452,  3.12305427, -0.3068603 ]])

In [1]:
array = [[1.1,2.2,3.3],[2.2,3.3,4.4]]

In [6]:
import pandas as pd

df = pd.DataFrame(array, columns=['negtive','neutral','positive'])

In [7]:
df

Unnamed: 0,negtive,neutral,positive
0,1.1,2.2,3.3
1,2.2,3.3,4.4


In [32]:
len(model_outputs)

844

In [31]:
len(wrong_predictions)

86

In [None]:
df_train = df_train.reset_index()
df_test = df_test.reset_index()

In [None]:
df_test

In [None]:
# Uses the array model outputs to pick the location of the max one and thus the prediction
predictions = []
for i in model_outputs:
    predictions.append(np.argmax(i)) 

In [None]:
df_test['predictions']=predictions

In [None]:
df_test[['Rating','predictions']][0:8]

In [None]:
correct=[]
for index, row in df_test.iterrows():
    if(row['Rating'] == (row['predictions'])):
        correct.append('True')
    else:
        correct.append('False')

In [None]:
df_test['correct']=correct

In [None]:
from collections import Counter 

Counter(df_test['correct'])

In [None]:
df_test.loc[6].Text

In [None]:
pd.set_option('display.max_colwidth', None)

In [None]:
df_test[0:7]

In [None]:
df_test['correct']