In [2]:
from transformers import BertTokenizer, TFBertForSequenceClassification
from transformers import InputExample, InputFeatures

model = TFBertForSequenceClassification.from_pretrained("bert-base-uncased")
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

model.summary()

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model: "tf_bert_for_sequence_classification"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bert (TFBertMainLayer)       multiple                  109482240 
_________________________________________________________________
dropout_37 (Dropout)         multiple                  0         
_________________________________________________________________
classifier (Dense)           multiple                  1538      
Total params: 109,483,778
Trainable params: 109,483,778
Non-trainable params: 0
_________________________________________________________________


In [3]:
import tensorflow as tf
import pandas as pd

URL = "https://query.data.world/s/maib4zjodcmqlvzqsk46sxfmqsbavt"

dataset = tf.keras.utils.get_file(fname="BTC_tweets_daily_example.csv", 
                                  origin=URL,
                                  untar=True,
                                  cache_dir='.',
                                  cache_subdir='')

Downloading data from https://query.data.world/s/maib4zjodcmqlvzqsk46sxfmqsbavt
15646720/Unknown - 3s 0us/step

In [4]:
import os
import shutil

main_dir = os.path.join(os.getcwd(), 'dataset')
train_dir = os.path.join(main_dir, 'input')

print(os.listdir(train_dir))
print(os.getcwd())

['BTC_tweets_daily_example.csv', '.ipynb_checkpoints']
/Users/nicholasdimartino/Desktop/CryptoStonks


In [5]:
import numpy as np
import pandas as pd

import os
for main_dir, _, filenames in os.walk(train_dir):
    for filename in filenames:
        print(os.path.join(main_dir, filename))

/Users/nicholasdimartino/Desktop/CryptoStonks/dataset/input/BTC_tweets_daily_example.csv


In [6]:
df_raw = pd.read_csv('/Users/nicholasdimartino/Desktop/CryptoStonks/dataset/input/BTC_tweets_daily_example.csv', delimiter=',', skiprows=0, lineterminator='\n' )
df_raw.head()

Unnamed: 0.1,Unnamed: 0,Date,Tweet,Screen_name,Source,Link,Sentiment,sent_score,New_Sentiment_Score,New_Sentiment_State
0,0,Fri Mar 23 00:40:32 +0000 2018,"RT @ALXTOKEN: Paul Krugman, Nobel Luddite. I h...",myresumerocket,[],"<a href=""http://twitter.com"" rel=""nofollow"">Tw...",['neutral'],0,0.0,0
1,1,Fri Mar 23 00:40:34 +0000 2018,@lopp @_Kevin_Pham @psycho_sage @naval But @Pr...,BitMocro,[u'Bitcoin'],"<a href=""http://twitter.com/download/android"" ...",['neutral'],0,0.0,0
2,2,Fri Mar 23 00:40:35 +0000 2018,RT @tippereconomy: Another use case for #block...,hojachotopur,"[u'blockchain', u'Tipper', u'TipperEconomy']","<a href=""http://twitter.com"" rel=""nofollow"">Tw...",['positive'],1,0.136364,1
3,3,Fri Mar 23 00:40:36 +0000 2018,free coins https://t.co/DiuoePJdap,denies_distro,[],"<a href=""http://twitter.com"" rel=""nofollow"">Tw...",['positive'],1,0.4,1
4,4,Fri Mar 23 00:40:36 +0000 2018,RT @payvxofficial: WE are happy to announce th...,aditzgraha,[],"<a href=""http://twitter.com/download/android"" ...",['positive'],1,0.468182,1


In [7]:
df_raw.columns = ["id", "timestamp", "text", "user", "source", "link", "sentiment", "score", "new_score", "state"]
df_raw.head()

Unnamed: 0,id,timestamp,text,user,source,link,sentiment,score,new_score,state
0,0,Fri Mar 23 00:40:32 +0000 2018,"RT @ALXTOKEN: Paul Krugman, Nobel Luddite. I h...",myresumerocket,[],"<a href=""http://twitter.com"" rel=""nofollow"">Tw...",['neutral'],0,0.0,0
1,1,Fri Mar 23 00:40:34 +0000 2018,@lopp @_Kevin_Pham @psycho_sage @naval But @Pr...,BitMocro,[u'Bitcoin'],"<a href=""http://twitter.com/download/android"" ...",['neutral'],0,0.0,0
2,2,Fri Mar 23 00:40:35 +0000 2018,RT @tippereconomy: Another use case for #block...,hojachotopur,"[u'blockchain', u'Tipper', u'TipperEconomy']","<a href=""http://twitter.com"" rel=""nofollow"">Tw...",['positive'],1,0.136364,1
3,3,Fri Mar 23 00:40:36 +0000 2018,free coins https://t.co/DiuoePJdap,denies_distro,[],"<a href=""http://twitter.com"" rel=""nofollow"">Tw...",['positive'],1,0.4,1
4,4,Fri Mar 23 00:40:36 +0000 2018,RT @payvxofficial: WE are happy to announce th...,aditzgraha,[],"<a href=""http://twitter.com/download/android"" ...",['positive'],1,0.468182,1


In [8]:
df = df_raw[['id', 'timestamp', 'text', 'sentiment', 'score']]
df.sample(5)

Unnamed: 0,id,timestamp,text,sentiment,score
26895,26895,Fri Mar 23 05:12:36 +0000 2018,RT @coindesk: Coinbase Is In Talks to Buy One ...,['positive'],1
42721,42721,Fri Mar 23 07:39:28 +0000 2018,What's actually crazy is that I tweeted/posted...,['negative'],-1
31851,31851,Fri Mar 23 06:02:44 +0000 2018,Bitcoin falls after Japan warns largest operat...,['neutral'],0
14183,14183,Fri Mar 23 03:03:40 +0000 2018,"RT @Seigi44: Doing 10,000 $XVG giveaway in 3 d...",['neutral'],0
2490,2490,Fri Mar 23 01:07:16 +0000 2018,@_crypto_iceman @Coinneo1 @andreastzav @TheRea...,['positive'],1


In [9]:
df['date'] = pd.to_datetime(df['timestamp'],format= '%a %b %d %H:%M:%S +0000 %Y').dt.date
df['date']
df.sample(10, random_state = 5)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['date'] = pd.to_datetime(df['timestamp'],format= '%a %b %d %H:%M:%S +0000 %Y').dt.date


Unnamed: 0,id,timestamp,text,sentiment,score,date
27410,27410,Fri Mar 23 05:17:51 +0000 2018,Long bitcoin? https://t.co/0KGtHxsWPD,['negative'],-1,2018-03-23
45029,45029,Fri Mar 23 08:00:02 +0000 2018,"Current Bitcoin Rate in USD : 8,395.5225 Check...",['negative'],-1,2018-03-23
17041,17041,Fri Mar 23 03:31:56 +0000 2018,Name: COSS\nSymbol: COSS\n24 hour change: -10....,['negative'],-1,2018-03-23
2353,2353,Fri Mar 23 01:05:45 +0000 2018,RT @bethereumteam: We're revealing our surpris...,['positive'],1,2018-03-23
23139,23139,Fri Mar 23 04:34:12 +0000 2018,RT @bethereumteam: This month we're celebratin...,['positive'],1,2018-03-23
18110,18110,Fri Mar 23 03:42:45 +0000 2018,RT @RandolphMlny: #Bitcoin #Satoshi #cryptocur...,['positive'],1,2018-03-23
45925,45925,Fri Mar 23 08:07:31 +0000 2018,RT @Excellion: In a few months we’ll see more ...,['positive'],1,2018-03-23
28241,28241,Fri Mar 23 05:27:24 +0000 2018,RT @DrDenaGrayson: 🛑BREAKING🛑\n\nRockin' Rod R...,['positive'],1,2018-03-23
18564,18564,Fri Mar 23 03:47:28 +0000 2018,RT @BitschoolAI: We are very pleased to announ...,['positive'],1,2018-03-23
40697,40697,Fri Mar 23 07:22:47 +0000 2018,RT @bethereumteam: Create custom group bets an...,['positive'],1,2018-03-23


In [10]:
pip install whatthelang

Note: you may need to restart the kernel to use updated packages.


In [11]:
from whatthelang import WhatTheLang
wtl = WhatTheLang()
result = [wtl.predict_lang(row) for row in df['text']]
df['lang'] = result
df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['lang'] = result


Unnamed: 0,id,timestamp,text,sentiment,score,date,lang
0,0,Fri Mar 23 00:40:32 +0000 2018,"RT @ALXTOKEN: Paul Krugman, Nobel Luddite. I h...",['neutral'],0,2018-03-23,en
1,1,Fri Mar 23 00:40:34 +0000 2018,@lopp @_Kevin_Pham @psycho_sage @naval But @Pr...,['neutral'],0,2018-03-23,en
2,2,Fri Mar 23 00:40:35 +0000 2018,RT @tippereconomy: Another use case for #block...,['positive'],1,2018-03-23,en
3,3,Fri Mar 23 00:40:36 +0000 2018,free coins https://t.co/DiuoePJdap,['positive'],1,2018-03-23,en
4,4,Fri Mar 23 00:40:36 +0000 2018,RT @payvxofficial: WE are happy to announce th...,['positive'],1,2018-03-23,en


In [12]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(df, train_size=0.80)
print(train)
print(test)

          id                       timestamp  \
15942  15942  Fri Mar 23 03:21:19 +0000 2018   
44399  44399  Fri Mar 23 07:54:25 +0000 2018   
11613  11613  Fri Mar 23 02:38:49 +0000 2018   
27347  27347  Fri Mar 23 05:17:13 +0000 2018   
22293  22293  Fri Mar 23 04:25:46 +0000 2018   
...      ...                             ...   
2440    2440  Fri Mar 23 01:06:39 +0000 2018   
8980    8980  Fri Mar 23 02:12:03 +0000 2018   
26482  26482  Fri Mar 23 05:07:24 +0000 2018   
14792  14792  Fri Mar 23 03:09:35 +0000 2018   
1392    1392  Fri Mar 23 00:55:40 +0000 2018   

                                                    text     sentiment  score  \
15942  In a few months we’ll see more functional #LAp...  ['positive']      1   
44399  RT @GymRewards: https://t.co/Bm9sIxiiwU  Check...   ['neutral']      0   
11613  RT @starflowcom: What are the advantages of us...  ['positive']      1   
27347  RT @MetatronInc: Thanks again https://t.co/DIT...  ['positive']      1   
22293  RT @Aruwba:

In [13]:
InputExample(guid=None,
             text_a = "Hello, world",
             text_b = None,
             label = 1)

InputExample(guid=None, text_a='Hello, world', text_b=None, label=1)

In [14]:
def convert_data_to_examples(train, test, text, score): 
  train_InputExamples = train.apply(lambda x: InputExample(guid=None, # Globally unique ID for bookkeeping, unused in this case
                                                          text_a = x[text], 
                                                          text_b = None,
                                                          label = x[score]), axis = 1)

  validation_InputExamples = test.apply(lambda x: InputExample(guid=None, # Globally unique ID for bookkeeping, unused in this case
                                                          text_a = x[text], 
                                                          text_b = None,
                                                          label = x[score]), axis = 1)
  
  return train_InputExamples, validation_InputExamples

  train_InputExamples, validation_InputExamples = convert_data_to_examples(train, 
                                                                           test, 
                                                                           'text', 
                                                                           'score')
def convert_examples_to_tf_dataset(examples, tokenizer, max_length=128):
    features = [] # -> will hold InputFeatures to be converted later

    for e in examples:
        # Documentation is really strong for this method, so please take a look at it
        input_dict = tokenizer.encode_plus(
            e.text_a,
            add_special_tokens=True,
            max_length=max_length, # truncates if len(s) > max_length
            return_token_type_ids=True,
            return_attention_mask=True,
            pad_to_max_length=True, # pads to the right by default # CHECK THIS for pad_to_max_length
            truncation=True
        )

        input_ids, token_type_ids, attention_mask = (input_dict["input_ids"],
            input_dict["token_type_ids"], input_dict['attention_mask'])

        features.append(
            InputFeatures(
                input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, label=e.label
            )
        )

    def gen():
        for f in features:
            yield (
                {
                    "input_ids": f.input_ids,
                    "attention_mask": f.attention_mask,
                    "token_type_ids": f.token_type_ids,
                },
                f.label,
            )

    return tf.data.Dataset.from_generator(
        gen,
        ({"input_ids": tf.int32, "attention_mask": tf.int32, "token_type_ids": tf.int32}, tf.int64),
        (
            {
                "input_ids": tf.TensorShape([None]),
                "attention_mask": tf.TensorShape([None]),
                "token_type_ids": tf.TensorShape([None]),
            },
            tf.TensorShape([]),
        ),
    )

text  =  "text"
score  =  "score"


In [15]:
train_InputExamples , validation_InputExamples  =  convert_data_to_examples ( train , test , text , score )
train_data = convert_examples_to_tf_dataset(list(train_InputExamples), tokenizer)
train_data = train_data.shuffle(100).batch(32).repeat(2)

validation_data = convert_examples_to_tf_dataset(list(validation_InputExamples), tokenizer)
validation_data = validation_data.batch(32)



In [16]:
print(train_InputExamples)

15942    InputExample(guid=None, text_a='In a few month...
44399    InputExample(guid=None, text_a='RT @GymRewards...
11613    InputExample(guid=None, text_a='RT @starflowco...
27347    InputExample(guid=None, text_a='RT @MetatronIn...
22293    InputExample(guid=None, text_a="RT @Aruwba: Hu...
                               ...                        
2440     InputExample(guid=None, text_a='Cryptocurrency...
8980     InputExample(guid=None, text_a='#Crypto update...
26482    InputExample(guid=None, text_a='mine #Bitcoin ...
14792    InputExample(guid=None, text_a='RT @RamenCoin2...
1392     InputExample(guid=None, text_a='RT @DrDenaGray...
Length: 40687, dtype: object


In [21]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.sentiment.util import *
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import pandas as pd
import numpy as np
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/nicholasdimartino/nltk_data...


True

In [36]:
sid = SentimentIntensityAnalyzer()
text = 'I am happy'
score = sid.polarity_scores(text)
print(score["compound"])

0.5719
