In [57]:
import os
import pandas as pd
%matplotlib inline

In [58]:
import nltk
#nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer()

# News Headlines Sentiment

Use the news api to pull the latest news articles for bitcoin and ethereum and create a DataFrame of sentiment scores for each coin. 

Use descriptive statistics to answer the following questions:
1. Which coin had the highest mean positive score?
2. Which coin had the highest negative score?
3. Which coin had the highest positive score?

In [59]:
#from newsapi import NewsApiClient
from newsapi.newsapi_client import NewsApiClient

In [60]:
# Read your api key environment variable
api_key = os.getenv('NEWS_API_KEY')

In [61]:
# Create a newsapi client
newsapi = NewsApiClient(api_key=api_key)

In [62]:
# Fetch the Bitcoin news articles
bitcoin_news = newsapi.get_everything(q="Bitcoin", language="en", sort_by="relevancy")

In [63]:
# Fetch the Ethereum news articles
ethereum_news = newsapi.get_everything(q="Ethereum", language="en", sort_by="relevancy")

In [64]:
bitcoin_sentiments = []

for article in bitcoin_news["articles"]:
    try:
        sentiment = analyzer.polarity_scores(article["content"])
      
        bitcoin_sentiments.append({
            "Text": article["content"],
            "compound": sentiment["compound"],
            "positive": sentiment["pos"],
            "negative": sentiment["neg"],
            "neutral": sentiment["neu"]
            
        })
        
    except AttributeError:
        pass
    
# Create DataFrame
bitcoin_df = pd.DataFrame(bitcoin_sentiments)

# Reorder DataFrame columns
cols = ["Text", "compound", "positive", "negative", "neutral"]
bitcoin_df = bitcoin_df[cols]

bitcoin_df.head()

Unnamed: 0,Text,compound,positive,negative,neutral
0,At least that's the idea. While it's not as in...,0.2263,0.04,0.0,0.96
1,"Since its inception in 2009, Bitcoin has made ...",-0.6486,0.044,0.139,0.817
2,New allegations against a bitcoin developer sp...,-0.7845,0.0,0.154,0.846
3,Say it with me: Bitcoin is not anonymous. \r\n...,-0.4767,0.0,0.069,0.931
4,Federal authorities in the U.S. have unsealed ...,0.0258,0.061,0.08,0.859


In [65]:
# Create the Bitcoin sentiment scores DataFrame

In [66]:
ethereum_sentiments = []

for article in ethereum_news["articles"]:
    try:
        sentiment = analyzer.polarity_scores(article["content"])
      
        ethereum_sentiments.append({
            "Text": article["content"],
            "compound": sentiment["compound"],
            "positive": sentiment["pos"],
            "negative": sentiment["neg"],
            "neutral": sentiment["neu"]
            
        })
        
    except AttributeError:
        pass
    
# Create DataFrame
ethereum_df = pd.DataFrame(ethereum_sentiments)

# Reorder DataFrame columns
cols = [ "Text", "compound", "positive", "negative", "neutral"]
ethereum_df = ethereum_df[cols]

ethereum_df.head()

Unnamed: 0,Text,compound,positive,negative,neutral
0,TL;DR: The in-depth Complete Blockchain and Et...,0.4404,0.059,0.0,0.941
1,IKEA Iceland has used Ethereum ETHtechnology t...,0.3818,0.067,0.0,0.933
2,Disrupt Berlin is right around the corner. And...,0.6249,0.1,0.0,0.9
3,At least that's the idea. While it's not as in...,0.2263,0.04,0.0,0.96
4,"he plaBy now, you should almost certainly know...",0.4601,0.085,0.0,0.915


In [67]:
# Create the ethereum sentiment scores DataFrame

In [68]:
bitcoin_df.describe()

Unnamed: 0,compound,positive,negative,neutral
count,20.0,20.0,20.0,20.0
mean,0.27828,0.05935,0.0242,0.91645
std,0.435938,0.031147,0.048259,0.041475
min,-0.7845,0.0,0.0,0.817
25%,0.176175,0.05525,0.0,0.89125
50%,0.507,0.0675,0.0,0.931
75%,0.507,0.069,0.0105,0.93225
max,0.6879,0.117,0.154,1.0


In [69]:
# Describe the Bitcoin Sentiment

In [70]:
ethereum_df.describe()

Unnamed: 0,compound,positive,negative,neutral
count,19.0,19.0,19.0,19.0
mean,0.110674,0.055368,0.039053,0.905579
std,0.432916,0.044727,0.054351,0.069962
min,-0.8232,0.0,0.0,0.735
25%,-0.15445,0.016,0.0,0.893
50%,0.2263,0.059,0.0,0.915
75%,0.45025,0.0785,0.065,0.9375
max,0.6318,0.173,0.21,1.0


In [71]:
# Describe the Ethereum Sentiment

### Questions:

Q: Which coin had the highest mean positive score?

A: Bitcoin

Q: Which coin had the highest compound score?

A: Bitcoin 

Q. Which coin had the highest positive score?

A: Bitcoin

---

# Tokenizer

In this section, you will use NLTK and Python to tokenize the text for each coin. Be sure to:
1. Lowercase each word
2. Remove Punctuation
3. Remove Stopwords

In [37]:
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from string import punctuation
import re

In [38]:
# Expand the default stop words list if necessary
list_stopwords = ['',',','`','bitcoin', 'cryptocurrency'] 
# do not necessarily need this for a BITCOIN word cloud.. others showed up in the word cloud...

In [39]:
def tokenizer(text):
    
    #create list
    text = word_tokenize(text)
    
    #covert to lowercase
    text = [word.lower() for word in text]
    
    #remove punctuation
    regex = re.compile("[^a-zA-Z ]")
    text = [regex.sub('', word) for word in text]
    
    # Remove the stop words
    sw = set(stopwords.words('english')) + list_stopwords
    
    # Lemmatize Words into root words
    lem = WordNetLemmatizer() #root words
    text = [lem.lemmatize(word) for word in text]
    text = [word for word in text if word not in sw]
    
    return text

In [40]:
# Complete the tokenizer function

    # Create a list of the words

    # Convert the words to lowercase
    
    # Remove the punctuation
    
    # Remove the stop words
    
    # Lemmatize Words into root words
    

In [41]:
bitcoin_df.Text

0     At least that's the idea. While it's not as in...
1     Since its inception in 2009, Bitcoin has made ...
2     New allegations against a bitcoin developer sp...
3     Say it with me: Bitcoin is not anonymous. \r\n...
4     Federal authorities in the U.S. have unsealed ...
5     High school students in France may be among th...
6     TL;DR: The in-depth Complete Blockchain and Et...
7     Our robot colleague Satoshi Nakaboto writes ab...
8     Our robot colleague Satoshi Nakaboto writes ab...
9     Our robot colleague Satoshi Nakaboto writes ab...
10    Our robot colleague Satoshi Nakaboto writes ab...
11    Our robot colleague Satoshi Nakaboto writes ab...
12    Our robot colleague Satoshi Nakaboto writes ab...
13    Our robot colleague Satoshi Nakaboto writes ab...
14    Our robot colleague Satoshi Nakaboto writes ab...
15    Our robot colleague Satoshi Nakaboto writes ab...
16    Our robot colleague Satoshi Nakaboto writes ab...
17    Our robot colleague Satoshi Nakaboto write

In [42]:
bitcoin_token = []
[bitcoin_token.append(tokenizer(text)) for text in bitcoin_df.Text]   
bitcoin_df['Token'] = bitcoin_token
bitcoin_df.head()

TypeError: unsupported operand type(s) for +: 'set' and 'list'

In [43]:
# word tokenize all sentences
word_tokenized = []

for story in sentence_tokenized:
    # get all for each article, which is already sentence tokenized
    words = []
    for sent in story:
        words = words + word_tokenize(sent)
    # append all words for each article to the word_tokenized list
    word_tokenized.append(words)
    

NameError: name 'sentence_tokenized' is not defined

In [44]:
# Create a new tokens column for bitcoin

In [45]:
ethereum_token = []
[ethereum_token.append(tokenizer(text)) for text in ethereum_df.Text]   
ethereum_df['Token'] = ethereum_token
ethereum_df.head()

TypeError: unsupported operand type(s) for +: 'set' and 'list'

In [None]:
# Create a new tokens column for ethereum

---

# NGrams and Frequency Analysis

In this section you will look at the ngrams and word frequency for each coin. 

1. Use NLTK to produce the n-grams for N = 2. 
2. List the top 10 words for each coin. 

In [46]:
from collections import Counter
from nltk.corpus import reuters, stopwords
from nltk.util import ngrams
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re

In [49]:
btc_bigram_counts = [Counter(ngrams(tokens, n=2)) for tokens in bitcoin_df.Token]
dict(btc_bigram_counts[0].most_common(10))


AttributeError: 'DataFrame' object has no attribute 'Token'

In [50]:
# Generate the Bitcoin N-grams where N=2

In [51]:
eth_bigram_counts = [Counter(ngrams(tokens, n=2)) for tokens in eth_df.Token]
dict(eth_bigram_counts[0].most_common(20))

NameError: name 'eth_df' is not defined

In [52]:
# Generate the Ethereum N-grams where N=2

In [53]:
# Use the token_count function to generate the top 10 words from each coin
def token_count(tokens, N=10):
    """Returns the top N tokens from the frequency count"""
    return Counter(tokens).most_common(N)

In [55]:
for tokens in bitcoin_df.Tokens:
    bigstr += tokens
btc_token_count = token_count(bigstr)
btc_token_count

AttributeError: 'DataFrame' object has no attribute 'Tokens'

In [56]:
# Get the top 10 words for Bitcoin

In [73]:
bigstr = []
for tokens in  ethereum_df.Token:
    bigstr += tokens
eth_token_count = token_count(bigstr)
eth_token_count

AttributeError: 'DataFrame' object has no attribute 'Token'

In [74]:
# Get the top 10 words for Ethereum

# Word Clouds

In this section, you will generate word clouds for each coin to summarize the news for each coin

In [75]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt
plt.style.use('seaborn-whitegrid')
import matplotlib as mpl
mpl.rcParams['figure.figsize'] = [20.0, 10.0]

In [77]:
btc_corpus = ''
for token in bitcoin_df.Text:
    btc_corpus += token
str1 = " ".join(str(x) for x in tokenizer(btc_corpus))
wc = WordCloud(collocations=False, background_color='white').generate(str1) #not getting rid of common words called to this function tokenizer...
plt.title('Bitcoin Word Cloud', fontsize=50, fontweight='bold')
plt.style.use('seaborn-whitegrid')
plt.imshow(wc, interpolation='bilinear')
plt.axis("off")
plt.show()

TypeError: unsupported operand type(s) for +: 'set' and 'list'

In [None]:
# Generate the Bitcoin word cloud

In [78]:
btc_corpus = ''
for token in bitcoin_df.Text:
    btc_corpus += token
str1 = " ".join(str(x) for x in tokenizer(btc_corpus))
wc = WordCloud(collocations=False, background_color='white').generate(str1) #not getting rid of common words called to this function tokenizer...
plt.title('Bitcoin Word Cloud', fontsize=50, fontweight='bold')
plt.style.use('seaborn-whitegrid')
plt.imshow(wc, interpolation='bilinear')
plt.axis("off")
plt.show()

TypeError: unsupported operand type(s) for +: 'set' and 'list'

In [79]:
# Generate the Ethereum word cloud

# Named Entity Recognition

In this section, you will build a named entity recognition model for both coins and visualize the tags using SpaCy.

In [80]:
import spacy
from spacy import displacy

In [81]:
# Optional - download a language model for SpaCy
# !python -m spacy download en_core_web_sm

In [82]:
# Load the spaCy model
nlp = spacy.load('en_core_web_sm')

## Bitcoin NER

In [83]:
btc_corpus

"At least that's the idea. While it's not as intensive as mining Bitcoin, running a full node still requires a lot of computing power. In fact, HTC recommends only using the feature while the phone is connected to WiFi and plugged into its power adapter. The c… [+2014 chars]Since its inception in 2009, Bitcoin has made and ruined fortunes, helped sell fentanyl and books about cryptocurrency, withstood literally millions of jokes and just as many predictions of imminent collapse, andthrough a process opaque to most people, myself… [+10529 chars]New allegations against a bitcoin developer spurred a deposition from Jacob Appelbaum\r\nPhoto by Ingo Wagner / Getty Images\r\nIn the summer of 2016, a flood of abuse allegations came out against celebrity Tor developer Jacob Appelbaum, a shocking blow to one of… [+6852 chars]Say it with me: Bitcoin is not anonymous.\xa0\r\nThree hundred and thirty-seven alleged members of a massive dark web child pornography website were reminded of that fact l

In [84]:
# Concatenate all of the bitcoin text together

In [85]:
# Run the NER processor on all of the text
btc_ner = nlp(btc_corpus)
btc_ner.user_data["title"] = "Bitcoin NER"
# Add a title to the document

In [86]:
displacy.render(btc_ner, style='ent')

In [87]:
# Render the visualization

In [88]:
btc_ents = set([ent.text for ent in btc_ner.ents ])
btc_ents #not perfect but is pulling what it considers entities

{'+10529 chars]New allegations',
 '+2872',
 '+2975 chars]Our',
 '+2981',
 '2009',
 'Aristotle',
 'Baudriallard',
 'Bentham',
 'Bitco',
 'Bitcoin',
 'Bitcoin BTC',
 'Bitcoin Pri',
 'Complete Blockchain and Ethereum Programmer',
 'France',
 'French',
 'HTC',
 'Ingo Wagner / Getty Images',
 'Jacob Appelbaum',
 'Le Ministère de lÉducation Nationale',
 'Nietzsche',
 'Planck',
 'Rousseau',
 'Satoshi Nakaboto',
 'Satoshi Nakamoto',
 'Schopenhauer',
 'South Korean',
 'The Next Web',
 'Three hundred and thirty-seven',
 'Time',
 'Today',
 'Tor',
 'U.S.',
 'Video',
 'Welcome',
 'Yolo',
 'de Beauvoir',
 'first',
 'just $24',
 'last year',
 'millions',
 'the past 24 hours',
 'the summer of 2016'}

In [89]:
# List all Entities

---

## Ethereum NER

In [90]:
eth_corpus

NameError: name 'eth_corpus' is not defined

In [91]:
# Concatenate all of the bitcoin text together

In [92]:
# Run the NER processor on all of the text
eth_ner = nlp(eth_corpus)
eth_ner.user_data["title"] = "Ethereum NER"
# Add a title to the document

NameError: name 'eth_corpus' is not defined

In [93]:
displacy.render(eth_ner, style='ent')

NameError: name 'eth_ner' is not defined

In [94]:
# Render the visualization

In [95]:
eth_ents = ([ent.text for ent in eth_ner.ents ])
set(eth_ents)

NameError: name 'eth_ner' is not defined

In [None]:
# List all Entities