<a href="https://colab.research.google.com/github/pratik-poudel/financial_news_stock/blob/master/sentiment%20analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras import Model
from keras.models import Sequential, Input
from keras.layers import Dense, Dropout, Embedding, Bidirectional, LSTM, GRU


from sklearn.feature_extraction import text

import warnings 
warnings.filterwarnings('ignore')

# Trying RNN

In [2]:
df = pd.read_csv('years_combined_df.csv')

In [3]:
df.head()

Unnamed: 0,title,content,author,date
0,New gadgets lined up for New Year 2014,"With the year 2013 making shift to 2014, sever...",SUMAN BASHYAL,2014-01-01
1,Chinese tourists take great leap forward,Chinese tourists made up 50 percent of the thr...,Post Report,2014-01-01
2,Pokhara tourism expects 2014 to herald new er...,Pokhara tourism is getting ready for 2014 with...,"SANGAM PRASAIN, Shiva Sharma",2014-01-01
3,"CAs, auditors condemn Agrawal duo's arrest",The Association of Chartered Accountants of Ne...,Post Report,2014-01-01
4,Don’t count your chickens,The benefits to Nepal from the WTO Bali Summit...,BIJENDRA MAN SHAKYA,2014-01-01


In [4]:
sent_list = []
for sent in df['content']:
    sent_list.append(sent)

In [5]:
all_words = ' '.join(sent_list).split(' ')
len(all_words)

286085

In [6]:
unique_words = list(set(all_words))
len(unique_words)

24785

In [7]:
unique_words.append('<UKN/>')

In [8]:
index_to_word = {i:wd for i, wd in enumerate(sorted(unique_words))}

In [9]:
word_to_index = {wd:i for i, wd in enumerate(sorted(unique_words))}

In [10]:
word_to_index.get('<UKN/>')

3018

In [11]:
sent_str = ' '.join(sent_list)

In [12]:
sentences =[] # training data
next_chars = [] # training labels

step = 2 # step to take while reading eg With the yea --> with the y: netxt char:r and after step 2 will begin form th (wi th)
chars_window =10 # Least no of characters to use for prediction

for i in range(0, len(sent_str) - chars_window, step):
    sentences.append(sent_str[i: i+chars_window])
    next_chars.append(sent_str[i + chars_window])

In [13]:
for sent, char in zip(sentences[:10], next_chars[:10]):
    print("Sentence : {} --> Next Character: {}".format(sent, char))

Sentence : With the y --> Next Character: e
Sentence : th the yea --> Next Character: r
Sentence :  the year  --> Next Character: 2
Sentence : he year 20 --> Next Character: 1
Sentence :  year 2013 --> Next Character:  
Sentence : ear 2013 m --> Next Character: a
Sentence : r 2013 mak --> Next Character: i
Sentence : 2013 makin --> Next Character: g
Sentence : 13 making  --> Next Character: s
Sentence :  making sh --> Next Character: i


In [16]:
new_text = ['A man either lives life as it happens to him meets it head-on and licks it or he turns his back on it and starts to wither away',
 'To the brave crew and passengers of the Kobayshi Maru sucks to be you',
 'Beware of more powerful weapons They often inflict as much damage to your soul as they do to you enemies',
 'They are merely scars not mortal wounds and you must use them to propel you forward',
 'You cannot explain away a wantonly immoral act because you think that it is connected to some higher purpose']

In [None]:
new_text_split = []
for sentence in new_text:
    sent_split = []
    for word in sentence.split(' '):
        index = word_to_index.get(word, word_to_index.get('<UKN/>'))
        sent_split.append(index)
        print(sent_split)
    new_text_split.append(sent_split)
    print(new_text_split)

In [18]:
print(' '.join([index_to_word[index] for index in new_text_split[2]]))

<UKN/> of more powerful <UKN/> They often <UKN/> as much damage to your <UKN/> as they do to you <UKN/>


In [19]:
# Instantiate the class
model = Sequential(name='sequential_model')

# One LSTM layer (defining the input shape because it is the 
# initial layer)
model.add(LSTM(128, input_shape=(None, 10), name="LSTM"))

# Add a dense layer with one unit
model.add(Dense(1, activation="sigmoid", name="output"))

# The summary shows the layers and the number of parameters 
# that will be trained
model.summary()

Model: "sequential_model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
LSTM (LSTM)                  (None, 128)               71168     
_________________________________________________________________
output (Dense)               (None, 1)                 129       
Total params: 71,297
Trainable params: 71,297
Non-trainable params: 0
_________________________________________________________________


In [20]:
# Define the input layer
main_input = Input(shape=(None, 10), name="input")

# One LSTM layer (input shape is already defined)
lstm_layer = LSTM(128, name="LSTM")(main_input)

# Add a dense layer with one unit
main_output = Dense(1, activation="sigmoid", name="output")(lstm_layer)

# Instantiate the class at the end
model = Model(inputs=main_input, outputs=main_output, name="modelclass_model")

# Same amount of parameters to train as before (71,297)
model.summary()

Model: "modelclass_model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input (InputLayer)           [(None, None, 10)]        0         
_________________________________________________________________
LSTM (LSTM)                  (None, 128)               71168     
_________________________________________________________________
output (Dense)               (None, 1)                 129       
Total params: 71,297
Trainable params: 71,297
Non-trainable params: 0
_________________________________________________________________


In [21]:
texts=  ['So if a photon is directed through a plane with two slits in it and either slit is observed it will not go through both slits. If it’s unobserved it will, however, if it’s observed after it’s left the plane but before it hits its target, it will not have gone through both slits.',
       'Hello, female children. Allow me to inspire you with a story about a great female scientist. Polish-born, French-educated Madame Curie. Co-discoverer of radioactivity, she was a hero of science, until her hair fell out, her vomit and stool became filled with blood, and she was poisoned to death by her own discovery. With a little hard work, I see no reason why that can’t happen to any of you. Are we done? Can we go?']

In [22]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sent_list)

# Change texts into sequence of indexes
texts_numeric = tokenizer.texts_to_sequences(sent_list)
print("Number of words in the sample texts: ({0}, {1})".format(len(texts_numeric[0]), len(texts_numeric[1])))

# Pad the sequences
texts_pad = pad_sequences(texts_numeric, 60)
print("Now the texts have fixed length: 60. Let's see the first one: \n{0}".format(texts_pad[7357]))

Number of words in the sample texts: (26, 21)
Now the texts have fixed length: 60. Let's see the first one: 
[   10   411  3234    17 13444     3  1001   102    56   166  3319    32
    11  3490    15  2374   393  5615  5112  2744     5     6   421  5615
  1706   502  4402     1  2851 13445 13446    21  1449    15   104  1662
   787   678    11  1108     5  2087     1  4389     2   379     1  1662
   421    14     6  7909     2  3428    31  7406   299    10  1229 13447]


In [23]:
max_len = []
for i in range(len(texts_numeric)):
    max_len.append(len(texts_numeric[i]))

In [24]:
np.argmax(max_len), np.max(max_len)

(7357, 125)

In [25]:
print(texts_numeric[7357][:70])

[1, 316, 1448, 2551, 16, 858, 11, 97, 2, 2334, 4, 8, 144, 64, 176, 2, 1, 1010, 858, 356, 4, 8, 829, 4237, 2772, 3370, 1, 1662, 421, 5405, 6, 1809, 1120, 13441, 14, 1, 5578, 5, 5578, 6114, 1, 13442, 8530, 1, 1662, 421, 7, 6, 286, 243, 4796, 5038, 21, 11, 13443, 7940, 38, 2087, 1, 795, 2, 27, 512, 127, 234, 10, 411, 3234, 17, 13444]


# BAG OF WORDS

In [28]:
vectorizer = text.CountVectorizer()
X = vectorizer.fit_transform(sent_list)

In [38]:
X_df = pd.DataFrame(data=X.toarray(), columns=sorted(vectorizer.vocabulary_.keys()))

In [45]:
X_df[X_df['zone'] > 1]

Unnamed: 0,000,000kg,000mw,003,006,01,014,016,017,02,027,028,029,03,030,034,036,04,040,043,044,045,05,050,053,06,068,069,07,070,072,074,079,08,09,090,092,094,10,100,...,youngest,your,youth,youthful,youths,youthsave,youtube,ysef,ysesef,yu,yuan,yuba,yubaraj,yubin,yubraj,yun,yunnan,yzf,z2,zambia,zealand,zenith,zero,zervos,zest,zhao,zhejiang,zhejiyang,zinc,zoe,zonal,zone,zones,zonta,zoomed,zooming,zs,zsolt,zte,škoda
1948,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0
2479,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0
4857,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0
8370,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0
9240,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0
9872,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0


# TF IDF

In [46]:
vectorizer = text.TfidfVectorizer()
X= vectorizer.fit_transform(sent_list)
X_df = pd.DataFrame(X.toarray(), columns=sorted(vectorizer.vocabulary_.keys()))

In [48]:
X_df[X_df['youth'] > 0]

Unnamed: 0,000,000kg,000mw,003,006,01,014,016,017,02,027,028,029,03,030,034,036,04,040,043,044,045,05,050,053,06,068,069,07,070,072,074,079,08,09,090,092,094,10,100,...,youngest,your,youth,youthful,youths,youthsave,youtube,ysef,ysesef,yu,yuan,yuba,yubaraj,yubin,yubraj,yun,yunnan,yzf,z2,zambia,zealand,zenith,zero,zervos,zest,zhao,zhejiang,zhejiyang,zinc,zoe,zonal,zone,zones,zonta,zoomed,zooming,zs,zsolt,zte,škoda
396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.251002,0.0,0.0,0.0,0.0,0.0,0.296697,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
450,0.193507,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.286778,0.0,0.0,0.0,0.0,0.0,0.338986,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
606,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.274935,0.0,0.0,0.0,0.0,0.0,0.324987,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
987,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.235358,0.0,0.0,0.0,0.0,0.0,0.278205,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1068,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.241745,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1117,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.229067,0.0,0.0,0.0,0.0,0.0,0.270769,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1652,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.32599,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1684,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.23571,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1811,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.247758,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1991,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.294647,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# BERT



In [13]:
import torch
torch.__version__

'1.8.0+cu101'

In [5]:
!pip install torch==1.3.1
!pip install transformers==3.5.1



In [6]:
import transformers
transformers.__version__

'3.5.1'

In [7]:
from transformers import BertModel, BertTokenizer
import torch

In [8]:
model = BertModel.from_pretrained('bert-base-uncased')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…




In [9]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




In [10]:
sent_list[2]

'Pokhara tourism is getting ready for 2014 with a positive outlook.'

In [11]:
sentence = sent_list[2]
tokens = tokenizer.tokenize(sentence)
print(tokens)

['po', '##khar', '##a', 'tourism', 'is', 'getting', 'ready', 'for', '2014', 'with', 'a', 'positive', 'outlook', '.']


In [12]:
tokens =['[CLS]'] + tokens + ['[SEP]']
print(tokens)

['[CLS]', 'po', '##khar', '##a', 'tourism', 'is', 'getting', 'ready', 'for', '2014', 'with', 'a', 'positive', 'outlook', '.', '[SEP]']


In [13]:
tokens = tokens + ['[PAD]'] + ['[PAD]']
print(tokens)

['[CLS]', 'po', '##khar', '##a', 'tourism', 'is', 'getting', 'ready', 'for', '2014', 'with', 'a', 'positive', 'outlook', '.', '[SEP]', '[PAD]', '[PAD]']


In [14]:
attention_mask = [1 if i!= '[PAD]' else 0 for i in tokens]
print(attention_mask)

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]


In [15]:
token_ids = tokenizer.convert_tokens_to_ids(tokens)
print(token_ids)

[101, 13433, 22510, 2050, 6813, 2003, 2893, 3201, 2005, 2297, 2007, 1037, 3893, 17680, 1012, 102, 0, 0]


In [16]:
token_ids = torch.tensor(token_ids).unsqueeze(0)
attention_mask = torch.tensor(attention_mask).unsqueeze(0)
print(token_ids)
print(attention_mask)

tensor([[  101, 13433, 22510,  2050,  6813,  2003,  2893,  3201,  2005,  2297,
          2007,  1037,  3893, 17680,  1012,   102,     0,     0]])
tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]])


In [17]:
hidden_rep, cls_head = model(token_ids, attention_mask)

In [18]:
hidden_rep.shape # [batch_size, sequence_length, hidden_size]

torch.Size([1, 18, 768])

In [21]:
hidden_rep

tensor([[-0.1040, -0.1907, -0.0131,  ..., -0.7095,  0.7275, -0.0724],
        [-0.1738, -0.8408, -1.1933,  ..., -0.8724,  1.3197,  0.5905],
        [ 0.8324, -1.1250, -1.3569,  ..., -1.0257,  0.2419, -0.9045],
        ...,
        [ 0.4273,  0.0827,  0.1712,  ..., -0.2997, -0.2481, -0.4829],
        [-0.0073, -0.1230,  0.2241,  ..., -0.2157,  0.3169, -0.2907],
        [-0.0890, -0.3296,  0.1677,  ..., -0.2695,  0.3403, -0.2666]],
       grad_fn=<SelectBackward>)

In [23]:
cls_head.shape #[batch_size, hidden_size]

torch.Size([1, 768])

## Extracting the embeddings

In [40]:
del token_ids, attention_mask

In [41]:
model = BertModel.from_pretrained('bert-base-uncased', output_hidden_states=True)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [42]:
sentence = sent_list[3]
print(sentence)

The Association of Chartered Accountants of Nepal (ACAN) and the Nepal Auditors' Association (AUDAN) have condemned the arrest of chartered accountants


In [43]:
tokens = tokenizer.tokenize(sentence)
print(tokens)
tokens = ["[CLS]"] + tokens + ["[SEP]" ]
print(tokens)

tokens = tokens + ['[PAD]'] + ['[PAD]']
print(tokens)

['the', 'association', 'of', 'chartered', 'accountants', 'of', 'nepal', '(', 'ac', '##an', ')', 'and', 'the', 'nepal', 'auditor', '##s', "'", 'association', '(', 'au', '##dan', ')', 'have', 'condemned', 'the', 'arrest', 'of', 'chartered', 'accountants']
['[CLS]', 'the', 'association', 'of', 'chartered', 'accountants', 'of', 'nepal', '(', 'ac', '##an', ')', 'and', 'the', 'nepal', 'auditor', '##s', "'", 'association', '(', 'au', '##dan', ')', 'have', 'condemned', 'the', 'arrest', 'of', 'chartered', 'accountants', '[SEP]']
['[CLS]', 'the', 'association', 'of', 'chartered', 'accountants', 'of', 'nepal', '(', 'ac', '##an', ')', 'and', 'the', 'nepal', 'auditor', '##s', "'", 'association', '(', 'au', '##dan', ')', 'have', 'condemned', 'the', 'arrest', 'of', 'chartered', 'accountants', '[SEP]', '[PAD]', '[PAD]']


In [47]:
token_ids = tokenizer.convert_tokens_to_ids(tokens)

In [48]:
attention_mask = [1 if i != "[PAD]" else 0 for i in tokens]
print(attention_mask)

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]


In [49]:
token_ids = torch.tensor(token_ids).unsqueeze(0)
attention_mask = torch.tensor(attention_mask).unsqueeze(0)

In [50]:
print(token_ids)

tensor([[  101,  1996,  2523,  1997, 12443, 29114,  1997,  8222,  1006,  9353,
          2319,  1007,  1998,  1996,  8222, 20964,  2015,  1005,  2523,  1006,
          8740,  7847,  1007,  2031, 10033,  1996,  6545,  1997, 12443, 29114,
           102,     0,     0]])


In [51]:
last_hidden_state, pooler_output, hidden_states = model(token_ids, attention_mask = attention_mask)

In [53]:
last_hidden_state.shape, pooler_output.shape

(torch.Size([1, 33, 768]), torch.Size([1, 768]))

In [55]:
len(hidden_states)

13

In [None]:
!pip install nlp==0.4.0

In [57]:
from transformers import BertForSequenceClassification, BertTokenizerFast, Trainer , TrainingArguments
from nlp import load_dataset
import torch 
import numpy as np

In [None]:
!gdown https://drive.google.com/uc?id=11_M4ootuT7I1G0RlihcC0cA3Elqotlc-
dataset = load_dataset('csv', data_files='./imdbs.csv', split='train')


In [60]:
dataset = dataset.train_test_split(test_size=0.3)

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




In [62]:
train_set = dataset['train']
test_set = dataset['test']


In [69]:
type(train_set) # train_set['text] --> list, train_set['labels] --> list

nlp.arrow_dataset.Dataset

In [70]:
model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=466062.0, style=ProgressStyle(descripti…




In [71]:
def preprocess(data):
    return tokenizer(data['text'], padding=True, truncation=True)


In [74]:
train_set = train_set.map(preprocess, batched=True, batch_size=len(train_set))
test_set = test_set.map(preprocess, batched=True, batch_size=len(test_set))


HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




In [89]:
train_set['text'][0]

"Average (and surprisingly tame) Fulci giallo which means it's still quite bad by normal standards, but redeemed by its solid build-up and some nice touches such as a neat time twist on the issues of visions and clairvoyance.<br /><br />The genre's well-known weaknesses are in full gear: banal dialogue, wooden acting, illogical plot points. And the finale goes on much too long, while the denouement proves to be a rather lame or shall I say: limp affair.<br /><br />Fulci's ironic handling of giallo norms is amusing, though. Yellow clues wherever you look.<br /><br />3 out of 10 limping killers"

In [76]:
train_set.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])
test_set.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])


In [91]:
train_set['input_ids']

array([array([  101,  2779,  1006,  1998, 10889, 24763,  1007, 11865, 15472,
        2072, 27699,  7174,  2029,  2965,  2009,  1005,  1055,  2145,
        3243,  2919,  2011,  3671,  4781,  1010,  2021,  2417, 21564,
        2098,  2011,  2049,  5024,  3857,  1011,  2039,  1998,  2070,
        3835, 12817,  2107,  2004,  1037, 15708,  2051,  9792,  2006,
        1996,  3314,  1997, 12018,  1998, 17936,  6767,  7054,  3401,
        1012,  1026,  7987,  1013,  1028,  1026,  7987,  1013,  1028,
        1996,  6907,  1005,  1055,  2092,  1011,  2124, 21775,  2024,
        1999,  2440,  6718,  1024,  7221,  2389,  7982,  1010,  4799,
        3772,  1010,  5665, 20734,  5436,  2685,  1012,  1998,  1996,
        9599,  3632,  2006,  2172,  2205,  2146,  1010,  2096,  1996,
        7939, 27872,  3672, 16481,  2000,  2022,  1037,  2738, 20342,
        2030,  4618,  1045,  2360,  1024, 14401,  6771,  1012,  1026,
        7987,  1013,  1028,  1026,  7987,  1013,  1028, 11865, 15472,
        2072,

In [78]:
batch_size = 8
epochs = 5
warmup_steps = 500
weight_decay = 0.01

In [82]:
training_args = TrainingArguments(
    output_dir = './results', 
    num_train_epochs = epochs,
    per_device_train_batch_size=batch_size, 
    per_device_eval_batch_size=batch_size,
    warmup_steps = warmup_steps, 
    weight_decay= weight_decay, 
    evaluate_during_training = True,
    logging_dir = './logs'
)

In [83]:
trainer = Trainer(model=model, args = training_args, train_dataset=train_set, eval_dataset=test_set)

In [86]:
trainer.train()

Step,Training Loss,Validation Loss


TrainOutput(global_step=45, training_loss=0.6659500969780816)

In [87]:
trainer.evaluate()

{'epoch': 5.0, 'eval_loss': 0.6765727400779724}

In [None]:
trainer.

## Token Embeddings

## Segment Embedding

## Positional Embeddings