In [1]:
import pandas as pd
import numpy as np
import re
import nltk
nltk.download("stopwords")
from nltk.corpus import stopwords
from nltk.stem.porter import *
from sklearn.model_selection import train_test_split
import tensorflow as tf
!pip install transformers


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Existing news dataset test

In [2]:
finance_news = pd.read_csv('/content/all-data.csv', encoding = "ISO-8859-1")
finance_news['category'] = finance_news['category'].map({'negative': 2, 'neutral': 0, 'positive':1})
finance_news = finance_news[['category','text']]
finance_news.head()

Unnamed: 0,category,text
0,0,"According to Gran , the company has no plans t..."
1,0,Technopolis plans to develop in stages an area...
2,2,The international electronic industry company ...
3,1,With the new production plant the company woul...
4,1,According to the company 's updated strategy f...


In [3]:
X = finance_news['text'].to_list()
y = finance_news['category'].to_list()
labels = {0:0, 1:1, 2:2}

In [4]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")

model = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert")

In [5]:
sent_val = list()
for x in X:
    inputs = tokenizer(x, return_tensors="pt", padding=True)
    outputs = model(**inputs)[0]
   
    val = labels[np.argmax(outputs.detach().numpy())]
    sent_val.append(val)

In [6]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [7]:
from sklearn.metrics import accuracy_score
print(accuracy_score(y, sent_val))

0.0660338423442014


In [8]:
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import pipeline

finbert = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-tone',num_labels=3)
tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-tone')
nlp = pipeline("sentiment-analysis", model=finbert, tokenizer=tokenizer)

In [9]:
sent_val = list()
for x in X:
    inputs = tokenizer(x, return_tensors="pt", padding=True)
    outputs = finbert(**inputs)[0]
   
    val = labels[np.argmax(outputs.detach().numpy())]
    sent_val.append(val)


In [10]:
from sklearn.metrics import accuracy_score
print(accuracy_score(y, sent_val))

0.7926124638877424


# sentiment via LSTM

In [11]:
df = pd.read_csv('/content/all-data.csv', encoding = "ISO-8859-1")
df['category'] = df['category'].map({'negative': -1.0, 'neutral': 0.0, 'positive':1.0})
df = df[['category','text']]
df.head()

Unnamed: 0,category,text
0,0.0,"According to Gran , the company has no plans t..."
1,0.0,Technopolis plans to develop in stages an area...
2,-1.0,The international electronic industry company ...
3,1.0,With the new production plant the company woul...
4,1.0,According to the company 's updated strategy f...


0 Neutral Sentiment
1 Postive Sentiment
-1 Negative Sentiment

In [12]:
def tweet_to_words(tweet):
    text = tweet.lower()
    text = re.sub(r"[^a-zA-Z0-9]", " ", text)
    words = text.split()
    words = [w for w in words if w not in stopwords.words("english")]
    words = [PorterStemmer().stem(w) for w in words]
    return words

In [13]:
X = list(map(tweet_to_words, df['text']))
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
Y = le.fit_transform(df['category'])

In [14]:
y = pd.get_dummies(df['category'])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=1)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.1, random_state=1)
#print('Train Set ->', X_train.shape, y_train.shape)
#print('Validation Set ->', X_val.shape, y_val.shape)
#print('Test Set ->', X_test.shape, y_test.shape)

In [15]:
from sklearn.feature_extraction.text import CountVectorizer

vocabulary_size = 5000

count_vector = CountVectorizer(max_features=vocabulary_size,
                                preprocessor=lambda x: x,
                               tokenizer=lambda x: x) 

X_train = count_vector.fit_transform(X_train).toarray()

X_test = count_vector.transform(X_test).toarray()


In [16]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

max_words = 5000
max_len=50

def tokenize_pad_sequences(text):
    '''
    This function tokenize the input text into sequnences of intergers and then
    pad each sequence to the same length
    '''
    # Text tokenization
    tokenizer = Tokenizer(num_words=max_words, lower=True, split=' ')
    tokenizer.fit_on_texts(text)
    # Transforms text to a sequence of integers
    X = tokenizer.texts_to_sequences(text)
    # Pad sequences to the same length
    X = pad_sequences(X, padding='post', maxlen=max_len)
    # return sequences
    return X, tokenizer

print('Before Tokenization & Padding \n', df['text'][0])
X, tokenizer = tokenize_pad_sequences(df['text'])
print('After Tokenization & Padding \n', X[0])


Before Tokenization & Padding 
 According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .
After Tokenization & Padding 
 [  94    5 3498    1   11   16  250  336    5  655  124   88    5  150
 2796   29   10  424    1   11   10  747    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0]


In [17]:
y = pd.get_dummies(df['category'])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.05, random_state=1)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.05, random_state=1)
print('Train Set ->', X_train.shape, y_train.shape)
print('Validation Set ->', X_val.shape, y_val.shape)
print('Test Set ->', X_test.shape, y_test.shape)

Train Set -> (4372, 50) (4372, 3)
Validation Set -> (231, 50) (231, 3)
Test Set -> (243, 50) (243, 3)


In [18]:
import keras.backend as K

def f1_score(precision, recall):
    ''' Function to calculate f1 score '''
    
    f1_val = 2*(precision*recall)/(precision+recall+K.epsilon())
    return f1_val

In [19]:
from keras.models import Sequential
from keras.layers import Embedding, Conv1D, MaxPooling1D, Bidirectional, LSTM, Dense, Dropout
from keras.metrics import Precision, Recall
from tensorflow.keras.optimizers import SGD
from tensorflow.keras.optimizers import RMSprop
from keras import datasets

from keras.callbacks import LearningRateScheduler
from keras.callbacks import History

from keras import losses

vocab_size = 5000
embedding_size = 32
epochs=20
learning_rate = 0.1
decay_rate = learning_rate / epochs
momentum = 0.8

sgd = SGD(lr=learning_rate, momentum=momentum, decay=decay_rate, nesterov=False)
# Build model
model= Sequential()
model.add(Embedding(vocab_size, embedding_size, input_length=max_len))
model.add(Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Bidirectional(LSTM(32)))
model.add(Dropout(0.4))
model.add(Dense(3, activation='softmax'))

  super(SGD, self).__init__(name, **kwargs)


In [20]:
print(model.summary())

# Compile model
model.compile(loss='categorical_crossentropy', optimizer=sgd, 
               metrics=['accuracy', Precision(), Recall()])

# Train model

batch_size = 64
history = model.fit(X_train, y_train,
                      validation_data=(X_val, y_val),
                      batch_size=batch_size, epochs=epochs, verbose=1)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 50, 32)            160000    
                                                                 
 conv1d (Conv1D)             (None, 50, 32)            3104      
                                                                 
 max_pooling1d (MaxPooling1D  (None, 25, 32)           0         
 )                                                               
                                                                 
 bidirectional (Bidirectiona  (None, 64)               16640     
 l)                                                              
                                                                 
 dropout (Dropout)           (None, 64)                0         
                                                                 
 dense (Dense)               (None, 3)                 1

In [21]:
loss, accuracy, precision, recall = model.evaluate(X_test, y_test, verbose=0)
# Print metrics
print('')
print('Accuracy  : {:.4f}'.format(accuracy))
print('Precision : {:.4f}'.format(precision))
print('Recall    : {:.4f}'.format(recall))
print('F1 Score  : {:.4f}'.format(f1_score(precision, recall)))


Accuracy  : 0.7119
Precision : 0.7348
Recall    : 0.6955
F1 Score  : 0.7146


In [22]:
model.save('best_model.h5')
print('Best model saved')

Best model saved


In [23]:
from keras.models import load_model

# Load model
model = load_model('best_model.h5')

def predict_class(text):
    '''Function to predict sentiment class of the passed text'''
    
    sentiment_classes = ['Negative', 'Neutral', 'Positive']
    max_len=50
    
    # Transforms text to a sequence of integers using a tokenizer object
    xt = tokenizer.texts_to_sequences(text)
    # Pad sequences to the same length
    xt = pad_sequences(xt, padding='post', maxlen=max_len)
    # Do the prediction using the loaded model
    yt = model.predict(xt).argmax(axis=1)
    # Print the predicted sentiment
    print('The predicted sentiment is', sentiment_classes[yt[0]])


# current news evaluation via best performing finbert-tone

In [24]:
# import requests as rq

# base_uri = "https://cryptonews-api.com/api/v1/category?section=general&date=12092021-02072022&items=50&page=1&extra-fields=rankscore&token=k2tapopvxmvzfhddebjmgjalrahfapyvz60qwtii"

# generated_uris = [f"{base_uri}{pnum}" for pnum in range(1, 11)]
# generated_uris

In [25]:
# import urllib.request, json 
# with urllib.request.urlopen("https://cryptonews-api.com/api/v1/category?section=general&date=12092021-02072022&items=50&page=1&extra-fields=rankscore&token=k2tapopvxmvzfhddebjmgjalrahfapyvz60qwtii") as url:
#     data = json.loads(url.read().decode())
#     print(data)

# for element in data:
#   if 'total_pages' in element:
#     del data['total_pages']
#     break

# rows = []
  
# # appending rows
# for d in data:
#     data_row = d['news_url']
#     time = data['image_url']
      
#     for row in data_row:
#         row['Name']= time
#         rows.append(row)
  
# # using data frame
# df = pd.DataFrame(rows)

# #df = pd.DataFrame.from_dict(data, orient="index")
# df

In [26]:
data = pd.read_json("https://cryptonews-api.com/api/v1/category?section=general&date=12092021-02072022&items=50&extra-fields=rankscore&token=k2tapopvxmvzfhddebjmgjalrahfapyvz60qwtii", orient="split")

In [27]:

# k=1
# for i in range(2,80):
#   link = link.replace("&page=k, &page=i)
#   addit = pd.read_json("https://cryptonews-api.com/api/v1/category?section=general&date=12092021-02072022&items=50&extra-fields=rankscore&token=k2tapopvxmvzfhddebjmgjalrahfapyvz60qwtii", orient="split")
#   data.append(addit)

In [28]:
data['sentiment'] = data['sentiment'].map({'Negative': 2, 'Neutral': 0, 'Positive':1})

In [30]:
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import pipeline

finbert = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-tone',num_labels=3)
tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-tone')
nlp = pipeline("sentiment-analysis", model=finbert, tokenizer=tokenizer)

In [31]:
X = data['title'].to_list()

sent_val = list()
for x in X:
    inputs = tokenizer(x, return_tensors="pt", padding=True)
    outputs = finbert(**inputs)[0]
   
    val = labels[np.argmax(outputs.detach().numpy())]
    sent_val.append(val)
data['sentimenttitle'] = sent_val

In [32]:
X = data['text'].to_list()

sent_val = list()
for x in X:
    inputs = tokenizer(x, return_tensors="pt", padding=True)
    outputs = finbert(**inputs)[0]
   
    val = labels[np.argmax(outputs.detach().numpy())]
    sent_val.append(val)

data['sentimenttext'] = sent_val

In [33]:
data

Unnamed: 0,news_url,image_url,title,text,source_name,date,topics,sentiment,type,rank_score,sentimenttitle,sentimenttext
0,https://blockonomi.com/koinly-review/,https://crypto.snapi.dev/images/v1/w/v/koinly-...,Koinly Review: Cryptocurrency Tax Software for...,Koinly is an online crypto tax platform that a...,Blockonomi,2022-02-07 23:51:18-05:00,[taxes],0,Article,3.83,0,0
1,https://dailyhodl.com/2022/02/07/long-time-blo...,https://crypto.snapi.dev/images/v1/7/m/imf-cbd...,Long-Time Blockchain Advocate Congressman Tom ...,"Minnesota Congressman Tom Emmer, former co-cha...",The Daily Hodl,2022-02-07 23:40:38-05:00,[regulations],0,Article,4.45,0,0
2,https://cointelegraph.com/news/kazakhstan-prop...,https://crypto.snapi.dev/images/v1/8/4/840-ahr...,Kazakhstan proposes power price hikes and taxe...,The central Asian country's government has pro...,Cointelegraph,2022-02-07 23:26:40-05:00,"[mining, taxes]",2,Article,4.71,0,0
3,https://news.bitcoin.com/us-senator-chinas-dig...,https://crypto.snapi.dev/images/v1/u/2/us-chin...,US Senator: China's Digital Currency Could Sub...,A U.S. senator has warned about China's centra...,Bitcoin,2022-02-07 22:00:30-05:00,[regulations],0,Article,4.54,1,2
4,https://cointelegraph.com/news/latest-defi-bri...,https://crypto.snapi.dev/images/v1/8/4/840-ahr...,Latest DeFi bridge exploit results in $4.4M lo...,Another token bridge suffered a malicious atta...,Cointelegraph,2022-02-07 21:55:30-05:00,[],2,Article,4.44,2,2
5,https://cryptonews.com/news/chinas-winter-olym...,https://crypto.snapi.dev/images/v1/a/c/webp-ne...,China's Winter Olympic Digital Yuan Gets Frost...,China is continuing in its efforts to wow the ...,Cryptonews,2022-02-07 20:00:00-05:00,[digital yuan],0,Article,3.85,0,2
6,https://cryptoslate.com/no-amount-of-regulatio...,https://crypto.snapi.dev/images/v1/r/e/regulat...,No amount of regulation can make up for crypto...,As the crypto crowd ponders on what the coming...,CryptoSlate,2022-02-07 19:01:18-05:00,[regulations],0,Article,4.28,0,0
7,https://bitcoinist.com/jpmorgan-strategist-cry...,https://crypto.snapi.dev/images/v1/c/r/cryptoc...,JPMorgan Strategist: Crypto Is Like Hydrogen,"JPMorgan's chairman of investment strategy, Mi...",Bitcoinist,2022-02-07 19:00:18-05:00,[],2,Article,3.84,0,0
8,https://bitcoinist.com/4-exciting-defi-project...,https://crypto.snapi.dev/images/v1/p/i/picture...,4 Exciting DeFi Projects Worth Watching In 2022,With more than $221 billion in total value loc...,Bitcoinist,2022-02-07 18:58:55-05:00,[],1,Article,3.84,1,1
9,https://cryptopotato.com/mitsubishi-ufj-trust-...,https://crypto.snapi.dev/images/v1/o/b/mitsubi...,Mitsubishi UFJ Trust to Issue a Stablecoin to ...,The Tokyo-based Mitsubishi UFJ Trust wants to ...,CryptoPotato,2022-02-07 18:48:34-05:00,[stablecoins],0,Article,3.33,0,1
