## Pre-requirements and presentation functions

## https://simpletransformers.ai/docs/usage/#loading-a-local-save

xlnet based cased - 50 - 78% on 2nd epoch

In [1]:
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required
import sklearn
assert sklearn.__version__ >= "0.20"

# Common imports
import pandas as pd
import numpy as np
import os

# figure plotting
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Where to save the figures
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "figures"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    path = os.path.join(IMAGES_PATH, fig_id + "." + fig_extension)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)

In [2]:
df_fpb = pd.read_csv("./data/train_set_df_9453.csv")

In [3]:
df_fpb

Unnamed: 0,text,label
0,There Are Some Itchy Selling Triggers Out Ther...,0
1,Sentiment Is Shifting In The Markets: Thornton...,0
2,Stocks crumble as more nations shut for busine...,0
3,Howard Marks: It's Hard to Find Good Risk/Retu...,1
4,"People too negative about odds of recession, s...",0
...,...,...
9448,Operating result for the 12-month period decre...,0
9449,HELSINKI Thomson Financial - Shares in Cargote...,0
9450,LONDON MarketWatch -- Share prices ended lower...,0
9451,Operating profit fell to EUR 35.4 mn from EUR ...,0


In [4]:
df_fpb.columns = ['text', 'labels']

In [5]:
len(df_fpb)

9453

In [6]:
df_fpb = sklearn.utils.shuffle(df_fpb, random_state=42)


In [7]:
df_fpb.head()

Unnamed: 0,text,labels
3470,"RockCreek's Beschloss Says Fed, Better-Than-Ex...",2
1608,Market recap Thursday Sept 24: Dow swings 500 ...,2
8223,The exercise price of the option will be based...,1
7411,Sales are expected to increase in the end of t...,2
7307,Return on investment was 16.6 % compared to 15...,2


In [8]:
#"""Changed the getlabel function in binaryprocessor class to have 3 labels, negative, neutral, positive"""
#df_fpb['Rating'] = df_fpb['Rating'].replace('negative',0)
#df_fpb['Rating'] = df_fpb['Rating'].replace('neutral',1)
#df_fpb['Rating'] = df_fpb['Rating'].replace('positive',2)

In [9]:
df_fpb

Unnamed: 0,text,labels
3470,"RockCreek's Beschloss Says Fed, Better-Than-Ex...",2
1608,Market recap Thursday Sept 24: Dow swings 500 ...,2
8223,The exercise price of the option will be based...,1
7411,Sales are expected to increase in the end of t...,2
7307,Return on investment was 16.6 % compared to 15...,2
...,...,...
5734,Facebook's alleged use of APIs to crush compet...,0
5191,G20 agrees to wrap up digital tax rules on tec...,0
5390,Facebook lawsuits: the biggest tech battle yet...,0
860,Is a Trade Deal Key to Market Continuation? - ...,1


In [10]:
from sklearn.model_selection import train_test_split

# 72% train, 8% eval, 20% test
df_train_temp, df_test = train_test_split(df_fpb, test_size=0.2, random_state=42)

df_train, df_eval = train_test_split(df_fpb, test_size=0.1, random_state=42)


In [11]:
df_train

Unnamed: 0,text,labels
5919,Facebook's (FB) CEO Mark Zuckerberg on Q3 2020...,1
6381,"The contract covers the manufacturing , surfac...",1
6230,"Profit for the period was EUR 5.9 mn , up from...",2
798,Bitcoin nosedives 22% this week to its lowest ...,0
4106,Facebook shares jump 5% after strong results -...,2
...,...,...
9304,Finnish financial software solutions developer...,0
4769,"Europe's Old Media Unites in Google, Facebook ...",1
2451,US is concerned about Huawei's relationship wi...,0
5863,7 Hot Cloud Stocks to Buy Now For Long-Term Pr...,2


## BERT

In [12]:
import torch

In [14]:
from simpletransformers.classification import ClassificationModel

In [15]:
# Create a ClassificationModel
cuda_available = torch.cuda.is_available()

model = ClassificationModel(
    "roberta", "roberta-base", num_labels=3, args={"reprocess_input_data": True, "learning_rate":2e-5, "overwrite_output_dir": False, "num_train_epochs":1, "evaluate_during_training_verbose":True, "evaluate_during_training":True, "manual_seed":42, "max_seq_length":256}, use_cuda=cuda_available
)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=501200538.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.out

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=898823.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=456318.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1355863.0, style=ProgressStyle(descript…




In [None]:
# Train the model
model.train_model(df_train, eval_df=df_eval)

HBox(children=(FloatProgress(value=0.0, max=18.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Epoch', max=1.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Running Epoch 0 of 1', max=1064.0, style=ProgressStyle(de…

In [None]:
# Evaluate the model
result, model_outputs, wrong_predictions = model.eval_model(df_test)

In [None]:
result

In [None]:
model_outputs

In [None]:
array = [[1.1,2.2,3.3],[2.2,3.3,4.4]]

In [None]:
import pandas as pd

df = pd.DataFrame(array, columns=['negtive','neutral','positive'])

In [None]:
df

In [None]:
len(model_outputs)

In [None]:
len(wrong_predictions)

In [None]:
df_train = df_train.reset_index()
df_test = df_test.reset_index()

In [None]:
df_test

In [None]:
# Uses the array model outputs to pick the location of the max one and thus the prediction
predictions = []
for i in model_outputs:
    predictions.append(np.argmax(i)) 

In [None]:
df_test['predictions']=predictions

In [None]:
df_test[['Rating','predictions']][0:8]

In [None]:
correct=[]
for index, row in df_test.iterrows():
    if(row['Rating'] == (row['predictions'])):
        correct.append('True')
    else:
        correct.append('False')

In [None]:
df_test['correct']=correct

In [None]:
from collections import Counter 

Counter(df_test['correct'])

In [None]:
df_test.loc[6].Text

In [None]:
pd.set_option('display.max_colwidth', None)

In [None]:
df_test[0:7]

In [None]:
df_test['correct']