In [None]:
!pip install transformers

In [None]:
import nltk
# Fetch NLTK's 'punkt' resource once per environment.
# NOTE(review): presumably required by the fincat_utils helpers imported
# further down — confirm against that module.
nltk.download('punkt')

In [39]:
from sklearn.linear_model import LogisticRegression

In [38]:
from sklearn.metrics import f1_score, classification_report, confusion_matrix

In [46]:
import pickle

In [3]:
import pandas as pd
import numpy as np

# Embedding creation

In [22]:
from fincat_utils import extract_context_words

In [23]:
from fincat_utils import bert_embedding_extract

In [8]:
# These datasets have been acquired from https://sites.google.com/nlg.csie.ntu.edu.tw/finnum3/data
# Both CSV files are expected in the notebook's working directory.
train = pd.read_csv('FinNum-3_ConCall_train.csv')
valid = pd.read_csv('FinNum-3_ConCall_dev.csv')
# Preview the training split: each row carries a paragraph, a target number,
# its category, the character offsets of the number and the binary `claim` label.
train.head()

Unnamed: 0,paragraph,target_num,category,offset_start,offset_end,claim
0,Good day and welcome to the Apple Inc. Third Q...,2018.0,date,65,69,0
1,Please note that some of the information you'l...,10.0,other,504,506,0
2,Please note that some of the information you'l...,8.0,other,536,537,0
3,Thank you Nancy and thanks to everyone for joi...,53.3,money,212,216,0
4,Thank you Nancy and thanks to everyone for joi...,3.0,date,243,244,0


In [9]:
# Row-wise: store the context string returned by extract_context_words
# (imported from fincat_utils) in a new `context_text` column of each split.
train['context_text'] = train.apply(extract_context_words, axis=1)
valid['context_text'] = valid.apply(extract_context_words, axis=1)

In [10]:
# Sanity check: show the first few values of the new context_text column.
train['context_text'].head()

0    Apple Inc. Third Quarter Fiscal Year 2018 Earn...
1    recently filed periodic reports on Form 10-K a...
2    and Form 10-Q and the Form 8-K filed with the ...
3    Services and Wearables . We generated $53.3 bi...
4    53.3 billion in revenue a new Q3 record . That...
Name: context_text, dtype: object

In [25]:
def full_word_extract(x):
  """Return the complete space-delimited token that contains the target span.

  Args:
    x: Row-like mapping with the keys 'paragraph' (str), 'offset_start' and
      'offset_end' (character offsets of the target inside the paragraph).

  Returns:
    The substring of the paragraph running from just after the last space
    before `offset_start` up to (not including) the first space at or after
    `offset_end`.

  Raises:
    ValueError: if no delimiting space can be found on either side, e.g. when
      an offset lies beyond the end of the paragraph.
  """
  # Pad with one space on each side so that a token at the very start or end
  # of the paragraph still has a delimiter to search for.
  padded = ' ' + x['paragraph'] + ' '
  # Shift both offsets by one to account for the leading pad character.
  span_start = x['offset_start'] + 1
  span_end = x['offset_end'] + 1
  # First character after the nearest space to the left of the span.
  word_start = padded.rindex(' ', 0, span_start) + 1
  # Nearest space at or to the right of the span's end.
  word_end = padded.index(' ', span_end)
  return padded[word_start:word_end]

# Row-wise: store the whole token surrounding each target number in a new
# `full_word` column of each split.
train['full_word'] = train.apply(full_word_extract, axis=1)
valid['full_word'] = valid.apply(full_word_extract, axis=1)

In [26]:
# Row-wise: pass each row's context text and full token to
# bert_embedding_extract (imported from fincat_utils) and collect the results.
X_train = train.apply(
    lambda row: bert_embedding_extract(row['context_text'], row['full_word']),
    axis=1,
)
X_valid = valid.apply(
    lambda row: bert_embedding_extract(row['context_text'], row['full_word']),
    axis=1,
)

In [33]:
# Expand the Series of per-row embedding vectors into a table with one column
# per embedding dimension, then preview it.
X_train_df = pd.DataFrame(list(X_train.values))
X_train_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,728,729,730,731,732,733,734,735,736,737,738,739,740,741,742,743,744,745,746,747,748,749,750,751,752,753,754,755,756,757,758,759,760,761,762,763,764,765,766,767
0,0.368809,-0.183757,0.303547,-0.11702,0.06514,-0.029341,0.259314,0.290747,0.207858,-0.115907,0.264833,-0.160152,-0.231906,0.164245,-0.113761,-0.344708,0.103797,0.059213,0.155677,0.01019,0.03988,-0.420344,0.34715,0.203805,0.180735,0.036988,-0.27655,-0.629351,0.042352,-0.038613,-0.032722,0.279078,-0.510508,-0.272058,0.483989,-0.135982,0.039079,0.011199,-0.382494,0.409543,...,0.150129,-0.332466,0.146949,0.298941,-0.356629,0.256043,-0.346404,0.27661,0.474828,-0.504323,-0.341784,0.158858,0.093696,-0.367811,-0.19027,-0.131648,0.111341,0.355276,-0.327872,-0.465141,0.127224,-0.08562,0.326738,0.000606,-1.16802,-0.267588,-0.059047,-0.27891,-0.20477,-0.569015,-0.151098,0.356569,-0.289968,-0.577219,0.060462,-0.476315,-0.305444,-0.269589,0.029032,-0.141106
1,-0.229693,-0.089069,0.277564,0.208404,0.275113,-0.16126,0.140657,0.239605,0.087076,-0.271928,0.06007,-0.613772,-0.11344,0.15422,-0.19673,0.122445,0.657043,0.011915,0.35851,-0.003775,0.052378,-0.209894,0.099908,0.063633,-0.006309,-0.513957,-0.377309,0.337079,-0.198215,-0.149513,0.252597,0.311857,-0.386991,-0.164911,0.401473,-0.286149,0.142062,-0.369986,0.112666,-0.09829,...,-0.140012,-0.376107,0.218393,0.304788,-0.131884,0.064087,0.018192,0.251127,0.320116,-0.460189,0.238322,0.195535,-0.076593,-0.090142,0.074114,-0.321207,-0.535894,0.218806,0.062582,-0.059976,-0.154634,-0.327682,0.435243,0.334271,-0.5933,-0.369575,-0.465317,-0.305976,-0.351953,0.085012,0.161071,-0.098725,-0.099021,-0.451767,0.161913,-0.360084,-0.131953,0.31159,-0.00399,-0.047092
2,-0.312533,0.074236,0.340413,0.059623,0.269301,-0.430308,-0.01817,-0.077967,-0.054411,-0.024177,-0.086897,-0.399377,-0.172927,0.264951,-0.136291,-0.111428,0.279131,-0.062598,0.124166,-0.025297,0.142324,-0.318341,-0.009125,0.314666,0.186952,-0.195127,-0.259486,0.265007,-0.333785,-0.186305,0.335625,0.365368,-0.41394,-0.105751,0.368936,-0.234994,-0.061504,-0.369206,0.043243,-0.019501,...,-0.03877,-0.293589,0.059653,0.063383,-0.197656,0.295203,0.118613,0.369562,0.195081,-0.124045,0.556299,0.220525,0.036612,0.02655,0.213589,-0.629621,-0.506624,0.353618,-0.273439,-0.008737,-0.052315,-0.367065,0.36643,0.240781,-1.072272,-0.327259,-0.432634,-0.227883,-0.173199,-0.131844,0.364682,0.049291,-0.352903,-0.506028,-0.06277,-0.400013,0.027792,0.17891,-0.014725,-0.11212
3,-0.077108,-0.161483,0.31286,0.393249,-0.241845,-0.155996,0.190089,0.012113,-0.139313,-0.067047,-0.187884,-0.279766,0.068685,0.652245,0.034435,0.201013,0.196322,-0.138563,-0.108405,0.030554,-0.442437,-0.424446,0.225827,0.234261,0.55861,0.172792,-0.155304,-0.380085,-0.380848,-0.048957,0.250248,0.077043,-0.271357,0.021427,0.35351,-0.01362,-0.089485,-0.349548,-0.306434,-0.152709,...,-0.213981,-0.265999,-0.09584,-0.101439,-0.517161,-0.237204,-0.166519,0.128982,-0.059171,-0.128331,-0.268504,0.00098,0.172639,-0.039273,-0.138985,0.1092,-0.653843,-0.223973,-0.307335,0.033183,-0.250163,-0.113612,0.296702,-0.303115,-0.867403,-0.334606,-0.114193,-0.399088,0.285001,-0.124496,0.096827,0.510738,0.158721,-0.261529,0.392189,-0.147268,-0.073741,-0.110605,0.315011,0.099651
4,-0.063771,-0.050664,0.679189,-0.028069,-0.108447,-0.218079,0.338436,0.445809,-0.050631,-0.194607,-0.034786,0.094162,0.287489,0.418246,0.126156,-0.168809,-0.104691,-0.216247,-0.020723,-0.1996,0.034056,-0.365783,0.255971,0.269185,0.627106,-0.061137,-0.015113,-0.013866,-0.011102,-0.078374,0.079824,0.055255,-0.491643,-0.309602,0.432001,-0.096631,-0.125716,0.054423,-0.386304,-0.113567,...,-0.154981,-0.515644,0.231666,0.034381,-0.497508,-0.145881,0.13637,0.248001,0.540396,-0.11463,0.09068,-0.136256,0.055671,0.164463,0.173252,-0.164186,-0.246136,-0.1444,-0.610616,-0.029822,0.003193,0.102735,-0.091496,-0.453307,-1.5145,-0.336089,-0.158246,-0.463973,-0.264564,0.159924,0.389236,0.325973,0.20401,-0.464381,0.705045,0.125383,-0.202484,-0.364667,0.242617,-0.194775




In [34]:
# Cache the training features on disk (row index omitted) so they can be
# reloaded later without recomputing the embeddings.
X_train_df.to_csv('X_train_df.csv', index = False)

In [35]:
# Same expansion for the validation split: one column per embedding dimension,
# written to disk without the row index.
X_valid_df = pd.DataFrame(list(X_valid.values))
X_valid_df.to_csv('X_valid_df.csv', index=False)

In [36]:
# Labels: the `claim` column of each split. The double brackets keep them as
# single-column DataFrames rather than Series.
Y_train = train[['claim']]
Y_valid = valid[['claim']]

In [37]:
# Write the labels alongside the cached features (row index omitted).
Y_train.to_csv('Y_train.csv', index = False)
Y_valid.to_csv('Y_valid.csv', index = False)

# Data Loading

In [None]:
# Reload the feature and label files written in the "Embedding creation"
# section, so modelling can start here without recomputing the embeddings.
# Extract X_train_df.zip first to obtain X_train_df.csv.
X_train_df = pd.read_csv('X_train_df.csv')
Y_train = pd.read_csv('Y_train.csv')
X_valid_df = pd.read_csv('X_valid_df.csv')
Y_valid = pd.read_csv('Y_valid.csv')

# Model Development

In [None]:
# Logistic-regression classifier trained on the embedding features.
lr_clf = LogisticRegression(solver='lbfgs')
# Y_train is a single-column DataFrame; flatten it to a 1-D array so that
# scikit-learn receives y with shape (n_samples,) and does not emit a
# DataConversionWarning about a column-vector y.
lr_clf.fit(X_train_df, Y_train.values.ravel())

In [41]:
# Predicted labels from the fitted classifier for both splits; reused by the
# F1 cells below.
pred_train = lr_clf.predict(X_train_df)
pred_valid = lr_clf.predict(X_valid_df)

In [42]:
# Training-set F1, micro-averaged (computed globally over all samples).
f1_score(Y_train, pred_train, average='micro')

0.9697732997481109

In [43]:
# Training-set F1, macro-averaged (unweighted mean of the per-class scores).
f1_score(Y_train, pred_train, average='macro')

0.9282894717080981

In [44]:
# Validation-set F1, micro-averaged (computed globally over all samples).
f1_score(Y_valid, pred_valid, average='micro')

0.929471032745592

In [45]:
# Validation-set F1, macro-averaged (unweighted mean of the per-class scores).
f1_score(Y_valid, pred_valid, average='macro')

0.8222564918913541

In [47]:
# Persist the fitted classifier. The context manager ensures the file handle is
# flushed and closed even if pickling raises; a bare open() call would leave
# it open until garbage collection.
with open("lr_clf_FiNCAT.pickle", "wb") as model_file:
    pickle.dump(lr_clf, model_file)