In [8]:
import os
import pandas as pd
%matplotlib inline

In [9]:
import nltk
#nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer()

# News Headlines Sentiment

Use the news api to pull the latest news articles for bitcoin and ethereum and create a DataFrame of sentiment scores for each coin. 

Use descriptive statistics to answer the following questions:
1. Which coin had the highest mean positive score?
2. Which coin had the highest negative score?
3. Which coin had the highest positive score?

In [10]:
#from newsapi import NewsApiClient
from newsapi.newsapi_client import NewsApiClient

In [11]:
# Read your api key environment variable
api_key = os.getenv('NEWS_API_KEY')

In [12]:
# Create a newsapi client
newsapi = NewsApiClient(api_key=api_key)

In [13]:
# Fetch the Bitcoin news articles
btc_news = newsapi.get_everything(q="Bitcoin AND btc", language="en", sort_by="relevancy")

In [14]:
# Fetch the Ethereum news articles
eth_news = newsapi.get_everything(q="Ethereum AND eth", language="en", sort_by="relevancy")

In [28]:
btc_sentiments = []

for article in btc_news["articles"]:
    try:
        sentiment = analyzer.polarity_scores(article["content"])
      
        btc_sentiments.append({
            "raw_article": article["content"],
            "compound": sentiment["compound"],
            "positive": sentiment["pos"],
            "negative": sentiment["neg"],
            "neutral": sentiment["neu"]
            
        })
        
    except AttributeError:
        pass
    
# Create DataFrame
btc_df = pd.DataFrame(btc_sentiments)

# Reorder DataFrame columns
cols = ["raw_article", "compound", "positive", "negative", "neutral"]
btc_df = btc_df[cols]

btc_df.head()

Unnamed: 0,raw_article,compound,positive,negative,neutral
0,Our robot colleague Satoshi Nakaboto writes ab...,0.507,0.067,0.0,0.933
1,"Interest in Bitcoin, the censorship resistant ...",-0.8519,0.062,0.255,0.683
2,The trustee in charge of recovering funds for ...,-0.4215,0.043,0.103,0.855
3,Our robot colleague Satoshi Nakaboto writes ab...,0.2911,0.065,0.042,0.893
4,Our robot colleague Satoshi Nakaboto writes ab...,0.507,0.067,0.0,0.933


In [29]:
# Create the Bitcoin sentiment scores DataFrame

In [30]:
eth_sentiments = []

for article in eth_news["articles"]:
    try:
        sentiment = analyzer.polarity_scores(article["content"])
      
        eth_sentiments.append({
            "raw_article": article["content"],
            "compound": sentiment["compound"],
            "positive": sentiment["pos"],
            "negative": sentiment["neg"],
            "neutral": sentiment["neu"]
            
        })
        
    except AttributeError:
        pass
    
# Create DataFrame
eth_df = pd.DataFrame(eth_sentiments)

# Reorder DataFrame columns
cols = [ "raw_article", "compound", "positive", "negative", "neutral"]
eth_df = eth_df[cols]

eth_df.head()

Unnamed: 0,raw_article,compound,positive,negative,neutral
0,"he plaBy now, you should almost certainly know...",0.4601,0.085,0.0,0.915
1,"Further to its previous blockchain exploits, t...",0.1027,0.063,0.054,0.882
2,"MakerDAO, the decentralized organization that ...",-0.8232,0.055,0.21,0.735
3,"Australias Perth Mint, one of the worlds large...",0.25,0.056,0.033,0.911
4,The total transactional volume of blockchain a...,-0.5095,0.032,0.082,0.886


In [23]:
# Create the ethereum sentiment scores DataFrame

In [24]:
btc_df.describe()

Unnamed: 0,compound,positive,negative,neutral
count,20.0,20.0,20.0,20.0
mean,0.28538,0.07445,0.0394,0.88615
std,0.462738,0.035707,0.069545,0.076406
min,-0.8519,0.0,0.0,0.683
25%,0.17655,0.06575,0.0,0.88275
50%,0.507,0.0675,0.0,0.9285
75%,0.507,0.07025,0.054,0.93225
max,0.7783,0.179,0.255,0.956


In [25]:
# Describe the Bitcoin Sentiment

In [26]:
eth_df.describe()

Unnamed: 0,compound,positive,negative,neutral
count,20.0,20.0,20.0,20.0
mean,0.13816,0.04575,0.02055,0.9337
std,0.354318,0.042626,0.049951,0.069687
min,-0.8232,0.0,0.0,0.735
25%,0.0,0.0,0.0,0.90475
50%,0.1645,0.052,0.0,0.9385
75%,0.3818,0.064,0.008,1.0
max,0.6597,0.144,0.21,1.0


In [71]:
# Describe the Ethereum Sentiment

### Questions:

Q: Which coin had the highest mean positive score?

A: Bitcoin

Q: Which coin had the highest compound score?

A: Bitcoin 

Q. Which coin had the highest positive score?

A: Bitcoin

---

# Tokenizer

In this section, you will use NLTK and Python to tokenize the text for each coin. Be sure to:
1. Lowercase each word
2. Remove Punctuation
3. Remove Stopwords

In [43]:
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from string import punctuation
import re
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\rolop\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [44]:
# Expand the default stop words list if necessary
#list_stopwords = ['',',','`','bitcoin', 'cryptocurrency'] 
# do not necessarily need this for a BITCOIN word cloud.. others showed up in the word cloud...

In [45]:
def tokenizer(raw_article):
    
    #create list
    text = word_tokenize(raw_article)
    
    #covert to lowercase
    text = [word.lower() for word in raw_article]
    
    #remove punctuation
    regex = re.compile("[^a-zA-Z ]")
    text = [regex.sub('', word) for word in text]
    
    # Remove the stop words
    sw = set(stopwords.words('english'))
    
    # Lemmatize Words into root words
    lem = WordNetLemmatizer() #root words
    text = [lem.lemmatize(word) for word in text]
    text = [word for word in text if word not in sw]
    
    return text

In [46]:
# Complete the tokenizer function

    # Create a list of the words

    # Convert the words to lowercase
    
    # Remove the punctuation
    
    # Remove the stop words
    
    # Lemmatize Words into root words
    

In [47]:
btc_df.raw_article

0     Our robot colleague Satoshi Nakaboto writes ab...
1     Interest in Bitcoin, the censorship resistant ...
2     The trustee in charge of recovering funds for ...
3     Our robot colleague Satoshi Nakaboto writes ab...
4     Our robot colleague Satoshi Nakaboto writes ab...
5     Welcome to Hard Fork Basics, a collection of t...
6     Bakkt, the Bitcoin Futures Trading, and Digita...
7     Ohio, the first US state to accept Bitcoin BTC...
8     Our robot colleague Satoshi Nakaboto writes ab...
9     A hacker is holding computer systems belonging...
10    Our robot colleague Satoshi Nakaboto writes ab...
11    Our robot colleague Satoshi Nakaboto writes ab...
12    Our robot colleague Satoshi Nakaboto writes ab...
13    Our robot colleague Satoshi Nakaboto writes ab...
14    Our robot colleague Satoshi Nakaboto writes ab...
15    Our robot colleague Satoshi Nakaboto writes ab...
16    Our robot colleague Satoshi Nakaboto writes ab...
17    Our robot colleague Satoshi Nakaboto write

In [48]:
btc_token = []
[btc_token.append(tokenizer(text)) for text in btc_df.raw_article]   
btc_df['Token'] = btc_token
btc_df.head()

Unnamed: 0,raw_article,compound,positive,negative,neutral,Token
0,Our robot colleague Satoshi Nakaboto writes ab...,0.507,0.067,0.0,0.933,"[u, r, , r, b, , c, l, l, e, g, u, e, , h, ..."
1,"Interest in Bitcoin, the censorship resistant ...",-0.8519,0.062,0.255,0.683,"[n, e, r, e, , n, , b, c, n, , , h, e, , c..."
2,The trustee in charge of recovering funds for ...,-0.4215,0.043,0.103,0.855,"[h, e, , r, u, e, e, , n, , c, h, r, g, e, ..."
3,Our robot colleague Satoshi Nakaboto writes ab...,0.2911,0.065,0.042,0.893,"[u, r, , r, b, , c, l, l, e, g, u, e, , h, ..."
4,Our robot colleague Satoshi Nakaboto writes ab...,0.507,0.067,0.0,0.933,"[u, r, , r, b, , c, l, l, e, g, u, e, , h, ..."


In [44]:
# Create a new tokens column for bitcoin

In [50]:
eth_token = []
[eth_token.append(tokenizer(text)) for text in eth_df.raw_article]   
eth_df['Token'] = eth_token
eth_df.head()

Unnamed: 0,raw_article,compound,positive,negative,neutral,Token
0,"he plaBy now, you should almost certainly know...",0.4601,0.085,0.0,0.915,"[h, e, , p, l, b, , n, w, , , u, , h, u, l..."
1,"Further to its previous blockchain exploits, t...",0.1027,0.063,0.054,0.882,"[f, u, r, h, e, r, , , , p, r, e, v, u, , ..."
2,"MakerDAO, the decentralized organization that ...",-0.8232,0.055,0.21,0.735,"[k, e, r, , , h, e, , e, c, e, n, r, l, z, e..."
3,"Australias Perth Mint, one of the worlds large...",0.25,0.056,0.033,0.911,"[u, r, l, , p, e, r, h, , n, , , n, e, , f..."
4,The total transactional volume of blockchain a...,-0.5095,0.032,0.082,0.886,"[h, e, , l, , r, n, c, n, l, , v, l, u, e, ..."


In [None]:
# Create a new tokens column for ethereum

---

# NGrams and Frequency Analysis

In this section you will look at the ngrams and word frequency for each coin. 

1. Use NLTK to produce the n-grams for N = 2. 
2. List the top 10 words for each coin. 

In [46]:
from collections import Counter
from nltk.corpus import reuters, stopwords
from nltk.util import ngrams
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re

In [49]:
btc_bigram_counts = [Counter(ngrams(tokens, n=2)) for tokens in bitcoin_df.Token]
dict(btc_bigram_counts[0].most_common(10))


AttributeError: 'DataFrame' object has no attribute 'Token'

In [50]:
# Generate the Bitcoin N-grams where N=2

In [51]:
eth_bigram_counts = [Counter(ngrams(tokens, n=2)) for tokens in eth_df.Token]
dict(eth_bigram_counts[0].most_common(20))

NameError: name 'eth_df' is not defined

In [52]:
# Generate the Ethereum N-grams where N=2

In [53]:
# Use the token_count function to generate the top 10 words from each coin
def token_count(tokens, N=10):
    """Returns the top N tokens from the frequency count"""
    return Counter(tokens).most_common(N)

In [55]:
for tokens in bitcoin_df.Tokens:
    bigstr += tokens
btc_token_count = token_count(bigstr)
btc_token_count

AttributeError: 'DataFrame' object has no attribute 'Tokens'

In [56]:
# Get the top 10 words for Bitcoin

In [73]:
bigstr = []
for tokens in  ethereum_df.Token:
    bigstr += tokens
eth_token_count = token_count(bigstr)
eth_token_count

AttributeError: 'DataFrame' object has no attribute 'Token'

In [74]:
# Get the top 10 words for Ethereum

# Word Clouds

In this section, you will generate word clouds for each coin to summarize the news for each coin

In [75]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt
plt.style.use('seaborn-whitegrid')
import matplotlib as mpl
mpl.rcParams['figure.figsize'] = [20.0, 10.0]

In [77]:
btc_corpus = ''
for token in bitcoin_df.Text:
    btc_corpus += token
str1 = " ".join(str(x) for x in tokenizer(btc_corpus))
wc = WordCloud(collocations=False, background_color='white').generate(str1) #not getting rid of common words called to this function tokenizer...
plt.title('Bitcoin Word Cloud', fontsize=50, fontweight='bold')
plt.style.use('seaborn-whitegrid')
plt.imshow(wc, interpolation='bilinear')
plt.axis("off")
plt.show()

TypeError: unsupported operand type(s) for +: 'set' and 'list'

In [None]:
# Generate the Bitcoin word cloud

In [78]:
btc_corpus = ''
for token in bitcoin_df.Text:
    btc_corpus += token
str1 = " ".join(str(x) for x in tokenizer(btc_corpus))
wc = WordCloud(collocations=False, background_color='white').generate(str1) #not getting rid of common words called to this function tokenizer...
plt.title('Bitcoin Word Cloud', fontsize=50, fontweight='bold')
plt.style.use('seaborn-whitegrid')
plt.imshow(wc, interpolation='bilinear')
plt.axis("off")
plt.show()

TypeError: unsupported operand type(s) for +: 'set' and 'list'

In [79]:
# Generate the Ethereum word cloud

# Named Entity Recognition

In this section, you will build a named entity recognition model for both coins and visualize the tags using SpaCy.

In [80]:
import spacy
from spacy import displacy

In [81]:
# Optional - download a language model for SpaCy
# !python -m spacy download en_core_web_sm

In [82]:
# Load the spaCy model
nlp = spacy.load('en_core_web_sm')

## Bitcoin NER

In [83]:
btc_corpus

"At least that's the idea. While it's not as intensive as mining Bitcoin, running a full node still requires a lot of computing power. In fact, HTC recommends only using the feature while the phone is connected to WiFi and plugged into its power adapter. The c… [+2014 chars]Since its inception in 2009, Bitcoin has made and ruined fortunes, helped sell fentanyl and books about cryptocurrency, withstood literally millions of jokes and just as many predictions of imminent collapse, andthrough a process opaque to most people, myself… [+10529 chars]New allegations against a bitcoin developer spurred a deposition from Jacob Appelbaum\r\nPhoto by Ingo Wagner / Getty Images\r\nIn the summer of 2016, a flood of abuse allegations came out against celebrity Tor developer Jacob Appelbaum, a shocking blow to one of… [+6852 chars]Say it with me: Bitcoin is not anonymous.\xa0\r\nThree hundred and thirty-seven alleged members of a massive dark web child pornography website were reminded of that fact l

In [84]:
# Concatenate all of the bitcoin text together

In [85]:
# Run the NER processor on all of the text
btc_ner = nlp(btc_corpus)
btc_ner.user_data["title"] = "Bitcoin NER"
# Add a title to the document

In [86]:
displacy.render(btc_ner, style='ent')

In [87]:
# Render the visualization

In [88]:
btc_ents = set([ent.text for ent in btc_ner.ents ])
btc_ents #not perfect but is pulling what it considers entities

{'+10529 chars]New allegations',
 '+2872',
 '+2975 chars]Our',
 '+2981',
 '2009',
 'Aristotle',
 'Baudriallard',
 'Bentham',
 'Bitco',
 'Bitcoin',
 'Bitcoin BTC',
 'Bitcoin Pri',
 'Complete Blockchain and Ethereum Programmer',
 'France',
 'French',
 'HTC',
 'Ingo Wagner / Getty Images',
 'Jacob Appelbaum',
 'Le Ministère de lÉducation Nationale',
 'Nietzsche',
 'Planck',
 'Rousseau',
 'Satoshi Nakaboto',
 'Satoshi Nakamoto',
 'Schopenhauer',
 'South Korean',
 'The Next Web',
 'Three hundred and thirty-seven',
 'Time',
 'Today',
 'Tor',
 'U.S.',
 'Video',
 'Welcome',
 'Yolo',
 'de Beauvoir',
 'first',
 'just $24',
 'last year',
 'millions',
 'the past 24 hours',
 'the summer of 2016'}

In [89]:
# List all Entities

---

## Ethereum NER

In [90]:
eth_corpus

NameError: name 'eth_corpus' is not defined

In [91]:
# Concatenate all of the bitcoin text together

In [92]:
# Run the NER processor on all of the text
eth_ner = nlp(eth_corpus)
eth_ner.user_data["title"] = "Ethereum NER"
# Add a title to the document

NameError: name 'eth_corpus' is not defined

In [93]:
displacy.render(eth_ner, style='ent')

NameError: name 'eth_ner' is not defined

In [94]:
# Render the visualization

In [95]:
eth_ents = ([ent.text for ent in eth_ner.ents ])
set(eth_ents)

NameError: name 'eth_ner' is not defined

In [None]:
# List all Entities