In [1]:
from transformers import BertTokenizer, TFBertForSequenceClassification
from transformers import InputExample, InputFeatures

model = TFBertForSequenceClassification.from_pretrained("bert-base-uncased")
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

model.summary()

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model: "tf_bert_for_sequence_classification"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bert (TFBertMainLayer)       multiple                  109482240 
_________________________________________________________________
dropout_37 (Dropout)         multiple                  0         
_________________________________________________________________
classifier (Dense)           multiple                  1538      
Total params: 109,483,778
Trainable params: 109,483,778
Non-trainable params: 0
_________________________________________________________________


In [2]:
import tensorflow as tf
import pandas as pd

URL = "https://query.data.world/s/maib4zjodcmqlvzqsk46sxfmqsbavt"

dataset = tf.keras.utils.get_file(fname="BTC_tweets_daily_example.csv", 
                                  origin=URL,
                                  untar=True,
                                  cache_dir='.',
                                  cache_subdir='')

Downloading data from https://query.data.world/s/maib4zjodcmqlvzqsk46sxfmqsbavt
15646720/Unknown - 5s 0us/step

In [3]:
import os
import shutil

main_dir = os.path.join(os.getcwd(), 'dataset')
train_dir = os.path.join(main_dir, 'input')

print(os.listdir(train_dir))
print(os.getcwd())

['BTC_tweets_daily_example.csv', '.ipynb_checkpoints']
/Users/nicholasdimartino/Desktop/CryptoStonks


In [4]:
import numpy as np
import pandas as pd

import os
for main_dir, _, filenames in os.walk(train_dir):
    for filename in filenames:
        print(os.path.join(main_dir, filename))

/Users/nicholasdimartino/Desktop/CryptoStonks/dataset/input/BTC_tweets_daily_example.csv


In [5]:
df_raw = pd.read_csv('/Users/nicholasdimartino/Desktop/CryptoStonks/dataset/input/BTC_tweets_daily_example.csv', delimiter=',', skiprows=0, lineterminator='\n' )
df_raw.head()

Unnamed: 0.1,Unnamed: 0,Date,Tweet,Screen_name,Source,Link,Sentiment,sent_score,New_Sentiment_Score,New_Sentiment_State
0,0,Fri Mar 23 00:40:32 +0000 2018,"RT @ALXTOKEN: Paul Krugman, Nobel Luddite. I h...",myresumerocket,[],"<a href=""http://twitter.com"" rel=""nofollow"">Tw...",['neutral'],0,0.0,0
1,1,Fri Mar 23 00:40:34 +0000 2018,@lopp @_Kevin_Pham @psycho_sage @naval But @Pr...,BitMocro,[u'Bitcoin'],"<a href=""http://twitter.com/download/android"" ...",['neutral'],0,0.0,0
2,2,Fri Mar 23 00:40:35 +0000 2018,RT @tippereconomy: Another use case for #block...,hojachotopur,"[u'blockchain', u'Tipper', u'TipperEconomy']","<a href=""http://twitter.com"" rel=""nofollow"">Tw...",['positive'],1,0.136364,1
3,3,Fri Mar 23 00:40:36 +0000 2018,free coins https://t.co/DiuoePJdap,denies_distro,[],"<a href=""http://twitter.com"" rel=""nofollow"">Tw...",['positive'],1,0.4,1
4,4,Fri Mar 23 00:40:36 +0000 2018,RT @payvxofficial: WE are happy to announce th...,aditzgraha,[],"<a href=""http://twitter.com/download/android"" ...",['positive'],1,0.468182,1


In [6]:
df_raw.columns = ["id", "timestamp", "text", "user", "source", "link", "sentiment", "score", "new_score", "state"]
df_raw.head()

Unnamed: 0,id,timestamp,text,user,source,link,sentiment,score,new_score,state
0,0,Fri Mar 23 00:40:32 +0000 2018,"RT @ALXTOKEN: Paul Krugman, Nobel Luddite. I h...",myresumerocket,[],"<a href=""http://twitter.com"" rel=""nofollow"">Tw...",['neutral'],0,0.0,0
1,1,Fri Mar 23 00:40:34 +0000 2018,@lopp @_Kevin_Pham @psycho_sage @naval But @Pr...,BitMocro,[u'Bitcoin'],"<a href=""http://twitter.com/download/android"" ...",['neutral'],0,0.0,0
2,2,Fri Mar 23 00:40:35 +0000 2018,RT @tippereconomy: Another use case for #block...,hojachotopur,"[u'blockchain', u'Tipper', u'TipperEconomy']","<a href=""http://twitter.com"" rel=""nofollow"">Tw...",['positive'],1,0.136364,1
3,3,Fri Mar 23 00:40:36 +0000 2018,free coins https://t.co/DiuoePJdap,denies_distro,[],"<a href=""http://twitter.com"" rel=""nofollow"">Tw...",['positive'],1,0.4,1
4,4,Fri Mar 23 00:40:36 +0000 2018,RT @payvxofficial: WE are happy to announce th...,aditzgraha,[],"<a href=""http://twitter.com/download/android"" ...",['positive'],1,0.468182,1


In [7]:
df = df_raw[['id', 'timestamp', 'text', 'sentiment', 'score']]
df.sample(5)

Unnamed: 0,id,timestamp,text,sentiment,score
19744,19744,Fri Mar 23 03:59:49 +0000 2018,RT @LitePalOfficial: #Bitcoin &amp; #Litecoin\...,['negative'],-1
11295,11295,Fri Mar 23 02:34:55 +0000 2018,RT @CoinsAirdrops: #Bitcoin #Satoshi #crypto #...,['positive'],1
10135,10135,Fri Mar 23 02:23:18 +0000 2018,RT @WealthE_Coin: As the crypto community grow...,['neutral'],0
25896,25896,Fri Mar 23 05:01:35 +0000 2018,Name: SunContract\nSymbol: SNC\n24 hour change...,['negative'],-1
31276,31276,Fri Mar 23 05:57:26 +0000 2018,@JD0x0 Bitcoin BTG(OFFICIAL) (Bitcoin Gold) do...,['positive'],1


In [12]:
df['date'] = pd.to_datetime(df['timestamp'],format= '%a %b %d %H:%M:%S +0000 %Y').dt.date
df['date']
df.sample(10, random_state = 5)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['date'] = pd.to_datetime(df['timestamp'],format= '%a %b %d %H:%M:%S +0000 %Y').dt.date


Unnamed: 0,id,timestamp,text,sentiment,score,lang,date
27410,27410,Fri Mar 23 05:17:51 +0000 2018,Long bitcoin? https://t.co/0KGtHxsWPD,['negative'],-1,en,2018-03-23
45029,45029,Fri Mar 23 08:00:02 +0000 2018,"Current Bitcoin Rate in USD : 8,395.5225 Check...",['negative'],-1,en,2018-03-23
17041,17041,Fri Mar 23 03:31:56 +0000 2018,Name: COSS\nSymbol: COSS\n24 hour change: -10....,['negative'],-1,en,2018-03-23
2353,2353,Fri Mar 23 01:05:45 +0000 2018,RT @bethereumteam: We're revealing our surpris...,['positive'],1,en,2018-03-23
23139,23139,Fri Mar 23 04:34:12 +0000 2018,RT @bethereumteam: This month we're celebratin...,['positive'],1,en,2018-03-23
18110,18110,Fri Mar 23 03:42:45 +0000 2018,RT @RandolphMlny: #Bitcoin #Satoshi #cryptocur...,['positive'],1,en,2018-03-23
45925,45925,Fri Mar 23 08:07:31 +0000 2018,RT @Excellion: In a few months we’ll see more ...,['positive'],1,en,2018-03-23
28241,28241,Fri Mar 23 05:27:24 +0000 2018,RT @DrDenaGrayson: 🛑BREAKING🛑\n\nRockin' Rod R...,['positive'],1,en,2018-03-23
18564,18564,Fri Mar 23 03:47:28 +0000 2018,RT @BitschoolAI: We are very pleased to announ...,['positive'],1,en,2018-03-23
40697,40697,Fri Mar 23 07:22:47 +0000 2018,RT @bethereumteam: Create custom group bets an...,['positive'],1,en,2018-03-23


In [3]:
pip install whatthelang

Processing /Users/nicholasdimartino/Library/Caches/pip/wheels/26/11/37/b27d8f98b142afacc5fcd476197fc8aea2e438efe98f25dbf9/whatthelang-1.0.1-py3-none-any.whl
Installing collected packages: whatthelang
Successfully installed whatthelang-1.0.1
Note: you may need to restart the kernel to use updated packages.


In [13]:
from whatthelang import WhatTheLang
wtl = WhatTheLang()
result = [wtl.predict_lang(row) for row in df['text']]
df['lang'] = result
df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['lang'] = result


Unnamed: 0,id,timestamp,text,sentiment,score,lang,date
0,0,Fri Mar 23 00:40:32 +0000 2018,"RT @ALXTOKEN: Paul Krugman, Nobel Luddite. I h...",['neutral'],0,en,2018-03-23
1,1,Fri Mar 23 00:40:34 +0000 2018,@lopp @_Kevin_Pham @psycho_sage @naval But @Pr...,['neutral'],0,en,2018-03-23
2,2,Fri Mar 23 00:40:35 +0000 2018,RT @tippereconomy: Another use case for #block...,['positive'],1,en,2018-03-23
3,3,Fri Mar 23 00:40:36 +0000 2018,free coins https://t.co/DiuoePJdap,['positive'],1,en,2018-03-23
4,4,Fri Mar 23 00:40:36 +0000 2018,RT @payvxofficial: WE are happy to announce th...,['positive'],1,en,2018-03-23
