In [6]:
import pandas as pd
import numpy as np
import re
import nltk
nltk.download("stopwords")
from nltk.corpus import stopwords
from nltk.stem.porter import *
from sklearn.model_selection import train_test_split
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Embedding, Conv1D, MaxPooling1D, Bidirectional, LSTM, Dense, Dropout
from keras.metrics import Precision, Recall
from tensorflow.keras.optimizers import SGD
from tensorflow.keras.optimizers import RMSprop
from keras import datasets
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.metrics import accuracy_score
import warnings
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import pipeline
from keras.callbacks import LearningRateScheduler
from keras.callbacks import History
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras import losses
import keras.backend as K
import requests as rq
import urllib.request, json 
from sklearn.preprocessing import LabelEncoder

!pip install transformers


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Existing news dataset test

In [7]:
finance_news = pd.read_csv('/content/all-data.csv', encoding = "ISO-8859-1")
finance_news['category'] = finance_news['category'].map({'negative': 2, 'neutral': 0, 'positive':1})
finance_news = finance_news[['category','text']]
finance_news.head()

Unnamed: 0,category,text
0,0,"According to Gran , the company has no plans t..."
1,0,Technopolis plans to develop in stages an area...
2,2,The international electronic industry company ...
3,1,With the new production plant the company woul...
4,1,According to the company 's updated strategy f...


In [8]:
X = finance_news['text'].to_list()
y = finance_news['category'].to_list()
labels = {0:0, 1:1, 2:2}

In [9]:
tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")

model = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert")

In [10]:
sent_val = list()
for x in X:
    inputs = tokenizer(x, return_tensors="pt", padding=True)
    outputs = model(**inputs)[0]
   
    val = labels[np.argmax(outputs.detach().numpy())]
    sent_val.append(val)

In [11]:
warnings.simplefilter(action='ignore', category=FutureWarning)

In [12]:
print(accuracy_score(y, sent_val))

0.0660338423442014


In [13]:
finbert = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-tone',num_labels=3)
tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-tone')
nlp = pipeline("sentiment-analysis", model=finbert, tokenizer=tokenizer)

Downloading:   0%|          | 0.00/533 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/419M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/221k [00:00<?, ?B/s]

In [14]:
sent_val = list()
for x in X:
    inputs = tokenizer(x, return_tensors="pt", padding=True)
    outputs = finbert(**inputs)[0]
   
    val = labels[np.argmax(outputs.detach().numpy())]
    sent_val.append(val)


In [15]:
print(accuracy_score(y, sent_val))

0.7926124638877424


# sentiment via LSTM

In [16]:
df = pd.read_csv('/content/all-data.csv', encoding = "ISO-8859-1")
df['category'] = df['category'].map({'negative': 2, 'neutral': 0, 'positive':1})
df = df[['category','text']]
df.head()

Unnamed: 0,category,text
0,0,"According to Gran , the company has no plans t..."
1,0,Technopolis plans to develop in stages an area...
2,2,The international electronic industry company ...
3,1,With the new production plant the company woul...
4,1,According to the company 's updated strategy f...


0 Neutral Sentiment
1 Postive Sentiment
-1 Negative Sentiment

In [17]:
def tweet_to_words(tweet):
    text = tweet.lower()
    text = re.sub(r"[^a-zA-Z0-9]", " ", text)
    words = text.split()
    words = [w for w in words if w not in stopwords.words("english")]
    words = [PorterStemmer().stem(w) for w in words]
    return words

In [18]:
X = list(map(tweet_to_words, df['text']))
le = LabelEncoder()
Y = le.fit_transform(df['category'])

In [19]:
y = pd.get_dummies(df['category'])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=1)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.1, random_state=1)

In [20]:
vocabulary_size = 5000

count_vector = CountVectorizer(max_features=vocabulary_size,
                                preprocessor=lambda x: x,
                               tokenizer=lambda x: x) 

X_train = count_vector.fit_transform(X_train).toarray()
X_test = count_vector.transform(X_test).toarray()

In [21]:
max_words = 5000
max_len=50

def tokenize_pad_sequences(text):

    tokenizer = Tokenizer(num_words=max_words, lower=True, split=' ')
    tokenizer.fit_on_texts(text)

    X = tokenizer.texts_to_sequences(text)
    X = pad_sequences(X, padding='post', maxlen=max_len)

    return X, tokenizer

X, tokenizer = tokenize_pad_sequences(df['text'])

In [22]:
y = pd.get_dummies(df['category'])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.05, random_state=1)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.05, random_state=1)

In [23]:
def f1_score(precision, recall):
    
    f1_val = 2*(precision*recall)/(precision+recall+K.epsilon())
    return f1_val

In [24]:
vocab_size = 5000
embedding_size = 32
epochs=20
learning_rate = 0.1
decay_rate = learning_rate / epochs
momentum = 0.8

sgd = SGD(lr=learning_rate, momentum=momentum, decay=decay_rate, nesterov=False)

model= Sequential()
model.add(Embedding(vocab_size, embedding_size, input_length=max_len))
model.add(Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Bidirectional(LSTM(32)))
model.add(Dropout(0.4))
model.add(Dense(3, activation='softmax'))

  super(SGD, self).__init__(name, **kwargs)


In [25]:
print(model.summary())

model.compile(loss='categorical_crossentropy', optimizer=sgd, 
               metrics=['accuracy', Precision(), Recall()])

batch_size = 64
history = model.fit(X_train, y_train,
                    validation_data=(X_val, y_val),
                    batch_size=batch_size, epochs=epochs, verbose=1)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 50, 32)            160000    
                                                                 
 conv1d (Conv1D)             (None, 50, 32)            3104      
                                                                 
 max_pooling1d (MaxPooling1D  (None, 25, 32)           0         
 )                                                               
                                                                 
 bidirectional (Bidirectiona  (None, 64)               16640     
 l)                                                              
                                                                 
 dropout (Dropout)           (None, 64)                0         
                                                                 
 dense (Dense)               (None, 3)                 1

In [26]:
loss, accuracy, precision, recall = model.evaluate(X_test, y_test, verbose=0)

print('Accuracy: {:.6f}'.format(accuracy))
print('Precision: {:.6f}'.format(precision))
print('F1 Score: {:.6f}'.format(f1_score(precision, recall)))
print('Recall: {:.6f}'.format(recall))

Accuracy: 0.720165
Precision: 0.737069
F1 Score: 0.720000
Recall: 0.703704


# current news evaluation via best performing finbert-tone

In [None]:
base_uri = "https://cryptonews-api.com/api/v1/category?section=general&date=01012020-04102021&items=50&page=1&extra-fields=rankscore&token=#"

In [None]:
with urllib.request.urlopen("https://cryptonews-api.com/api/v1/category?section=general&date=01012020-04102021&items=50&page=1&extra-fields=rankscore&token=#") as url:
    data = json.loads(url.read().decode())
pages = data["total_pages"]
df = pd.DataFrame(data["data"])

for a in range(2, pages):
  with urllib.request.urlopen(f"https://cryptonews-api.com/api/v1/category?section=general&date=01012020-04102021&items=50&page={a}&extra-fields=rankscore&token=#") as url:
    data = json.loads(url.read().decode())
    dftemp = pd.DataFrame(data["data"])
    df = pd.concat([df, dftemp])
df

In [None]:
data = pd.read_csv("/content/17092021-02072022", sep ='\t')
data1 = pd.read_csv("/content/04102021-09172021", sep ='\t')
data2 = pd.read_csv("/content/12032020-04102021", sep ='\t')

In [None]:
df = pd.concat([data, data1])
df = pd.concat([df, data2])
data = df
data

In [None]:
data['sentiment'] = data['sentiment'].map({'Negative': 2, 'Neutral': 0, 'Positive':1}) 

In [None]:
finbert = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-tone',num_labels=3)
tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-tone')
nlp = pipeline("sentiment-analysis", model=finbert, tokenizer=tokenizer)

In [None]:
X = data['title'].to_list()
labels = {0:0, 1:1, 2:2}
sent_val = list()
for x in X:
    inputs = tokenizer(x, return_tensors="pt", padding=True)
    outputs = finbert(**inputs)[0]
   
    val = labels[np.argmax(outputs.detach().numpy())]
    sent_val.append(val)
data['sentimenttitle'] = sent_val

In [None]:
data.to_csv("title_sentiment")

In [None]:
X = data['text'].to_list()

sent_val = list()
for x in X:
    inputs = tokenizer(x, return_tensors="pt", padding=True)
    outputs = finbert(**inputs)[0]
   
    val = labels[np.argmax(outputs.detach().numpy())]
    sent_val.append(val)

data['sentimenttext'] = sent_val

In [None]:
data.to_csv("title_text_sentiment")

In [29]:
df = pd.read_csv("/content/title_text_sentiment.csv")
df.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,news_url,image_url,title,text,source_name,date,topics,sentiment,type,rank_score,sentimenttitle,sentimenttext
0,0,0,https://dailyhodl.com/2022/02/07/long-time-blo...,https://crypto.snapi.dev/images/v1/7/m/imf-cbd...,Long-Time Blockchain Advocate Congressman Tom ...,"Minnesota Congressman Tom Emmer, former co-cha...",The Daily Hodl,"Mon, 07 Feb 2022 23:40:38 -0500",['regulations'],0,Article,4.45,0,0
1,1,1,https://cointelegraph.com/news/kazakhstan-prop...,https://crypto.snapi.dev/images/v1/8/4/840-ahr...,Kazakhstan proposes power price hikes and taxe...,The central Asian country's government has pro...,Cointelegraph,"Mon, 07 Feb 2022 23:26:40 -0500","['mining', 'taxes']",2,Article,4.71,0,0
2,2,2,https://news.bitcoin.com/us-senator-chinas-dig...,https://crypto.snapi.dev/images/v1/u/2/us-chin...,US Senator: China's Digital Currency Could Sub...,A U.S. senator has warned about China's centra...,Bitcoin,"Mon, 07 Feb 2022 22:00:30 -0500",['regulations'],0,Article,4.54,1,2
3,3,3,https://cointelegraph.com/news/latest-defi-bri...,https://crypto.snapi.dev/images/v1/8/4/840-ahr...,Latest DeFi bridge exploit results in $4.4M lo...,Another token bridge suffered a malicious atta...,Cointelegraph,"Mon, 07 Feb 2022 21:55:30 -0500",[],2,Article,4.44,2,2
4,4,4,https://cryptonews.com/news/chinas-winter-olym...,https://crypto.snapi.dev/images/v1/a/c/webp-ne...,China's Winter Olympic Digital Yuan Gets Frost...,China is continuing in its efforts to wow the ...,Cryptonews,"Mon, 07 Feb 2022 20:00:00 -0500",['digital yuan'],0,Article,3.85,0,2


In [42]:
gb = df.groupby(['sentiment', 'sentimenttitle', 'sentimenttext'])
size = gb.size()
size

sentiment  sentimenttitle  sentimenttext
0          0               0                5097
                           1                 782
                           2                 636
           1               0                 270
                           1                 156
                           2                  62
           2               0                 258
                           1                  53
                           2                 191
1          0               0                5956
                           1                1241
                           2                 297
           1               0                 763
                           1                 779
                           2                 158
           2               0                 114
                           1                  67
                           2                  81
2          0               0                2601
                           1

In [45]:
gb = df.groupby(['source_name', 'sentiment', 'sentimenttitle', 'sentimenttext'])
size = gb.size()
size = pd.DataFrame(size)

In [46]:
size

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,0
source_name,sentiment,sentimenttitle,sentimenttext,Unnamed: 4_level_1
8BTC,0,0,0,20
8BTC,0,0,1,3
8BTC,0,0,2,2
8BTC,0,1,0,4
8BTC,0,1,1,2
...,...,...,...,...
Yahoo Finance,1,1,0,6
Yahoo Finance,2,0,0,18
Yahoo Finance,2,1,1,1
Yahoo Finance,2,2,0,12
