In [62]:
import os
from os import listdir
from os.path import isfile, join
import copy
import numpy as np
import pandas as pd
import time
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import lr_scheduler
from torch.utils.data import Dataset, DataLoader
from pytorch_pretrained_bert.modeling import BertForSequenceClassification
from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM, BertConfig
from bertModel import BertClassification, dense_opt
#from datasets import text_dataset, financialPhraseBankDataset
import argparse
from sklearn.metrics import f1_score
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/claire/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [55]:
labels = {0:'neutral', 1:'positive',2:'negative'}
num_labels= len(labels)
vocab = "finance-uncased"
vocab_path = 'analyst_tone/vocab'
pretrained_weights_path = "analyst_tone/pretrained_weights" # this is pre-trained FinBERT weights
fine_tuned_weight_path = "analyst_tone/fine_tuned.pth"      # this is fine-tuned FinBERT weights
max_seq_length=256
device='gpu'

In [56]:
model = BertClassification(weight_path= pretrained_weights_path, num_labels=num_labels, vocab=vocab)
tokenizer = BertTokenizer(vocab_file = vocab_path, do_lower_case = True, do_basic_tokenize = True)

  nn.init.xavier_normal(self.classifier.weight)


In [57]:
model.load_state_dict(torch.load(fine_tuned_weight_path, map_location=torch.device(device)))

RuntimeError: Expected one of cpu, cuda, mkldnn, opengl, opencl, ideep, hip, msnpu, xla device type at start of device string: gpu

In [None]:
tokenizer = BertTokenizer(vocab_file = vocab_path, do_lower_case = True, do_basic_tokenize = True)

In [6]:
def finbert(sent):
    tokenized_sent = tokenizer.tokenize(sent)
    if len(tokenized_sent) > max_seq_length:
        tokenized_sent = tokenized_sent[:max_seq_length]
    
    ids_review  = tokenizer.convert_tokens_to_ids(tokenized_sent)
    mask_input = [1]*len(ids_review)        
    padding = [0] * (max_seq_length - len(ids_review))
    ids_review += padding
    mask_input += padding
    input_type = [0]*max_seq_length
    
    input_ids = torch.tensor(ids_review).to(device).reshape(-1, 256)
    attention_mask =  torch.tensor(mask_input).to(device).reshape(-1, 256)
    token_type_ids = torch.tensor(input_type).to(device).reshape(-1, 256)
    
    with torch.set_grad_enabled(False):
        outputs = model(input_ids, token_type_ids, attention_mask)
        outputs = F.softmax(outputs,dim=1)
        return labels[torch.argmax(outputs).item()]

## Ticker and cik table

In [7]:
ticker_library = pd.read_csv('SHARADAR_TICKERS_6cc728d11002ab9cb99aa8654a6b9f4e.csv')
ticker_selected = pd.read_csv('SP500_component_stocks.csv',header = None)
ticker_selected.columns = ['name','ticker']

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [8]:
def ticker_cik(file):
    return str(file)[-10:]

In [9]:
ticker_library['cik'] = ticker_library['secfilings'].apply(ticker_cik)

In [10]:
ticker_cik_df = ticker_library[['ticker', 'cik']]

In [11]:
ticker_cik_df.head()

Unnamed: 0,ticker,cik
0,A,1090872
1,AA,1675149
2,AAAB,1066808
3,AAAGY,1182802
4,AAAP,1611787


## Read 10-K and 10-Q files

In [63]:
#store data using dictionary
all_data = {}
tenK = []
tenQ = []
tenKdate = []
tenQdate = []
tenKCik = []
tenQCik = []
tenKsector = []
tenQsector = []

In [64]:
base_path = 'Correct Scrapping Data'

In [65]:
industries = os.listdir(base_path)[1:]

In [66]:
industries

['Financials', 'Health', 'IT', 'Communication', 'ConsumerDiscretionary']

In [67]:
for industry in industries:
    files = os.listdir(base_path + "/" + industry + '/10_k')
    if '.DS_Store' in files:
        files.remove('.DS_Store')
    
    for file in files:
        path_10k = os.listdir(base_path + '/' + industry + '/10_k' + "/" + file + '/grabbed_text')
        #print(path_10k)
        if '.DS_Store' in path_10k:
            path_10k.remove('.DS_Store')
        for report in path_10k:
            f = open(base_path + '/' + industry + '/10_k' + "/" + file + '/grabbed_text/' + str(report), "r")
            text = f.read().lower()
            date = report[-14:-4]
            cik = report[:10]
            tenK.append(text)
            tenKdate.append(date)
            tenKCik.append(cik)
            tenKsector.append(industry)
        
        path_10q = os.listdir(base_path + '/' + industry + '/10_q' + "/" + file + '/grabbed_text')
        if '.DS_Store' in path_10q:
            path_10q.remove('.DS_Store')
        for report in path_10q:
            f = open(base_path + '/' + industry + '/10_q' + "/" + file + '/grabbed_text/' + str(report), "r")
            text = f.read().lower()
            date = report[-14:-4]
            cik = report[:10]
            tenQ.append(text)
            tenQdate.append(date)
            tenQCik.append(cik)
            tenQsector.append(industry)

In [69]:
l1 = len(tenK)
l2 = len(tenQ)

In [71]:
l1 + l2

1892

In [72]:
all_sentences_10k = pd.DataFrame(columns = ['sentence', 'Date', 'Cik'])
for i in range(l1):
    sentences = pd.DataFrame(nltk.tokenize.sent_tokenize(tenK[i]), columns = ['sentence'])
    sentences['Date'] = tenKdate[i]
    sentences['Cik'] = tenKCik[i]
    sentences['Sector'] = tenKsector[i]
    #print(i)
    sentences['Company'] = list(ticker_cik_df[ticker_cik_df['cik'] == tenKCik[i]].ticker)[0]
    all_sentences_10k = pd.concat([all_sentences_10k, sentences])

In [73]:
all_sentences_10k['File'] = '10-K'

In [74]:
all_sentences_10k

Unnamed: 0,sentence,Date,Cik,Sector,Company,File
0,item 7. management's discussion and analysis o...,2018-02-09,0001126328,Financials,PFG,10-K
1,the discussion should be read in conjunction w...,2018-02-09,0001126328,Financials,PFG,10-K
2,"forward-looking statements include, but are no...",2018-02-09,0001126328,Financials,PFG,10-K
3,forward-looking statements are made based upon...,2018-02-09,0001126328,Financials,PFG,10-K
4,such forward-looking statements are not guaran...,2018-02-09,0001126328,Financials,PFG,10-K
...,...,...,...,...,...,...
22,see effect of exchange rates for additional ...,2008-02-11,0001018724,ConsumerDiscretionary,AMZN,10-K
23,"investment risk as of december 31, 2007, our r...",2008-02-11,0001018724,ConsumerDiscretionary,AMZN,10-K
24,the fair values of our investments are subject...,2008-02-11,0001018724,ConsumerDiscretionary,AMZN,10-K
25,based on the fair value of the publicly-traded...,2008-02-11,0001018724,ConsumerDiscretionary,AMZN,10-K


In [75]:
all_sentences_10q = pd.DataFrame(columns = ['sentence', 'Date', 'Cik'])
for i in range(l2):
    sentences = pd.DataFrame(nltk.tokenize.sent_tokenize(tenQ[i]), columns = ['sentence'])
    sentences['Date'] = tenQdate[i]
    sentences['Cik'] = tenQCik[i]
    sentences['Sector'] = tenQsector[i]
    sentences['Company'] = list(ticker_cik_df[ticker_cik_df['cik'] == tenQCik[i]].ticker)[0]
    all_sentences_10q = pd.concat([all_sentences_10q, sentences])

In [76]:
all_sentences_10q['File'] = '10-Q'
all_sentences_10q

Unnamed: 0,sentence,Date,Cik,Sector,Company,File
0,discussion and analysis of financial condition...,2017-08-02,0001126328,Financials,PFG,10-Q
1,this estimate excludes the impact of any poten...,2017-08-02,0001126328,Financials,PFG,10-Q
2,the selection of a 10% unfavorable change in t...,2017-08-02,0001126328,Financials,PFG,10-Q
3,our exposure will change as a result of change...,2017-08-02,0001126328,Financials,PFG,10-Q
4,use of derivatives to manage equity risk.,2017-08-02,0001126328,Financials,PFG,10-Q
...,...,...,...,...,...,...
17,based on the outstanding 6.875% peacs princip...,2004-07-23,0001018724,ConsumerDiscretionary,AMZN,10-Q
18,assuming the u.s. dollar weakens against the e...,2004-07-23,0001018724,ConsumerDiscretionary,AMZN,10-Q
19,"investment risk as of june 30, 2004, our recor...",2004-07-23,0001018724,ConsumerDiscretionary,AMZN,10-Q
20,the fair values of our investments are subject...,2004-07-23,0001018724,ConsumerDiscretionary,AMZN,10-Q


In [77]:
all_sentences = pd.concat([all_sentences_10k, all_sentences_10q])

In [25]:
all_sentences

Unnamed: 0,sentence,Date,Cik,Sector,Company,File
0,item 7. management's discussion and analysis o...,2018-02-09,0001126328,Financials,PFG,10-K
1,the discussion should be read in conjunction w...,2018-02-09,0001126328,Financials,PFG,10-K
2,"forward-looking statements include, but are no...",2018-02-09,0001126328,Financials,PFG,10-K
3,forward-looking statements are made based upon...,2018-02-09,0001126328,Financials,PFG,10-K
4,such forward-looking statements are not guaran...,2018-02-09,0001126328,Financials,PFG,10-K
...,...,...,...,...,...,...
17,based on the outstanding 6.875% peacs princip...,2004-07-23,0001018724,ConsumerDiscretionary,AMZN,10-Q
18,assuming the u.s. dollar weakens against the e...,2004-07-23,0001018724,ConsumerDiscretionary,AMZN,10-Q
19,"investment risk as of june 30, 2004, our recor...",2004-07-23,0001018724,ConsumerDiscretionary,AMZN,10-Q
20,the fair values of our investments are subject...,2004-07-23,0001018724,ConsumerDiscretionary,AMZN,10-Q


In [79]:
all_sentences['Sector'].unique()

array(['Financials', 'Health', 'IT', 'Communication',
       'ConsumerDiscretionary'], dtype=object)

In [81]:
all_sentences['Company'].nunique()

48

In [89]:
all_sentences[all_sentences['Sector'] == 'IT']['Company'].unique()

array(['AMD', 'LRCX', 'NLOK', 'PAYC', 'CTSH', 'ADS', 'AAPL', 'INTC',
       'MSFT', 'WU'], dtype=object)

In [58]:
all_sentences.to_csv("Sentences_all.csv", index = False)

In [47]:
TWTR_pivot.to_csv('TWTR.csv')

In [53]:
TWTR.to_csv('TWTR_sentence.csv')