In [1]:
import requests      
from datetime import datetime
from bs4 import BeautifulSoup  
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import wordnet
import re, csv, string 
import gensim
import pandas as pd
from gensim import corpora, models
import statsmodels.api as sm
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
import numpy as np
import dateutil.parser
from nltk.sentiment.vader import SentimentIntensityAnalyzer

  from pandas.core import datetools


# Defined functions to scrape Bitcoin news from news.bitcoin.com and bloomberg.com and to fetch headlines and date to be used later in sentiment analysis

In [2]:
#Separate headlines out from 'all_data_raw' variable so that it can be written to separate columns in csv

def get_headlines(data, index):
    
    headlines = []
    
    for row in data:
        headlines.append(row[index])
        
    return headlines

#Separate dates out from 'all_data' variable so that it can be written to separate columns in csv

def get_date(data, index):
    
    date = []
    
    for row in data:
        date.append(row[index])
        
    return date

#Handle cleaning of one specific date format

def tokenize_date(t):
    
    pattern=r'[a-zA-Z{3}]+[.\s]+[\d{1,2}\,\s]+[\d{4}]+'   

    date_pattern = nltk.regexp_tokenize(t, pattern)
    
    return date_pattern

#Get Bitcoin news from news.bitcoin.com

def get_bitcoinNews():
    
    headlines=[]  #List to store headlines
    dates = []    #List to store date
    raw_headlines = []
    page_number = 1
    page_url="https://news.bitcoin.com/page/"+str(page_number)+"/?s=bitcoin"
    
    while page_url!="https://news.bitcoin.com/page/140/?s=bitcoin":
        
        if page_number % 20 == 0:
            print('Scraped %d of 140 pages' % page_number)
            
        page_url="https://news.bitcoin.com/page/"+str(page_number)+"/?s=bitcoin"        
        page = requests.get(page_url) 
        page_number += 1
        
        if page.status_code!=200:  
            page_number = None
        else:                   
            soup = BeautifulSoup(page.content, 'html.parser')                        
            
            main_content = soup.find('div', class_ = 'td-ss-main-content')            
            h3s = main_content.find_all('h3', class_ = "entry-title td-module-title")
            span_dates = main_content.find_all("span", class_ = "td-post-date")
            
            for idx, h3 in enumerate(h3s):
                header = h3.select('a')
                
                if header != []:
                    headline = header[0].get_text().lower()
                    raw_headlines.append(headline)
            
            for idx, span in enumerate(span_dates):
                dates_list = span.select('time')
                
                if dates_list != []:
                    date = dates_list[0].get_text()
                    dates.append(date)
    
    raw_data = zip(raw_headlines, dates)
    return raw_data

#Get Bitcoin news from bloomberg.com

def get_bloomberg():
    
    headlines=[]  #List to store headlines
    dates = []    #List to store date
    raw_headlines = [] 
    page_number = 1
    page_url="https://www.bloomberg.com/search?query=bitcoin&sort=time:desc&endTime=2017-11-29T17:35:17.135Z&page="+str(page_number)
    
    while page_url!="https://www.bloomberg.com/search?query=bitcoin&sort=time:desc&endTime=2017-11-29T17:35:17.135Z&page=80":     

        if page_number % 20 == 0:
            print('Scraped %d of 80 pages' % page_number)
            
        page_url="https://www.bloomberg.com/search?query=bitcoin&sort=time:desc&endTime=2017-11-29T17:35:17.135Z&page="+str(page_number)
        page = requests.get(page_url) 
        page_number += 1
        
        if page.status_code!=200:  
            page_number = 80
        else:                   
            soup = BeautifulSoup(page.content, 'html.parser')                        
            
            for header in soup.find_all('h1', class_ ='search-result-story__headline'):
                
                headline = header.get_text().lower()
                raw_headlines.append(headline)
                
            for date in soup.find_all('time', class_ = 'published-at'):
                
                date_published = date.get_text()
                date_published = date_published.lstrip()
                dates.append(date_published)
                 
    raw_data = zip(raw_headlines, dates)
    return raw_data

# Calling the functions defined above to scrape the data and then store it in the list "all_data_raw"

In [3]:
if __name__ == "__main__":
    
    print("Scraping bitcoin news from news.bitcoin.com")
    print("*******************************************\n")
    bitcoinNews_raw = get_bitcoinNews()
    
    print("\nScraping bitcoin news from bloomberg.com")
    print("*******************************************\n")
    bloomberg_raw = get_bloomberg()
    
    all_data_raw = []
    
    #Appending the headlines and date to the list
    
    all_data_raw.extend(list(bitcoinNews_raw))
    all_data_raw.extend(list(bloomberg_raw))
    
    print("\nDone scraping all data and stored in list 'all_data_raw'")

Scraping bitcoin news from news.bitcoin.com
*******************************************

Scraped 20 of 140 pages
Scraped 40 of 140 pages
Scraped 60 of 140 pages
Scraped 80 of 140 pages
Scraped 100 of 140 pages
Scraped 120 of 140 pages
Scraped 140 of 140 pages

Scraping bitcoin news from bloomberg.com
*******************************************

Scraped 20 of 80 pages
Scraped 40 of 80 pages
Scraped 60 of 80 pages
Scraped 80 of 80 pages

Done scraping all data and stored in list 'all_data_raw'


# Classifying the headlines as positive, negative and neutral and printing the results

In [4]:
#Filter out headlines without these keywords

keywords = ['bitcoin', 'cryptocurrency','cryptocurrencies', 'crypto', 'blockchain', 'blockchains']

#Get headlines and dates as individual lists

headlines = get_headlines(list(filter(lambda x: any(word in x[0] for word in keywords), all_data_raw)),0)
dates = get_date(list(filter(lambda x: any(word in x[0] for word in keywords), all_data_raw)),1)
sid = SentimentIntensityAnalyzer()
compound = []

for head in headlines:
    
    head_lower = head.lower()
    ss = sid.polarity_scores(head_lower)
    compound.append(ss['compound'])

#Counting the number of headlines in each sentiment class

neutral = []
somewhat_negative = []
somewhat_positive = []
very_negative = []
very_positive = []

for index, score in enumerate(compound):
    
    if score > -0.20 and score < 0.20:
        neutral.append(score)
        
    elif score > -0.60 and score < -0.20:
        somewhat_negative.append(score)
        
    elif score > 0.20 and score < 0.60:   
        somewhat_positive.append(score)
        
    elif score <= -0.60:
        very_negative.append(score)
        
    else:
        very_positive.append(score)

print('Neutral headlines: ', len(neutral))
print('Somewhat negative headlines: ', len(somewhat_negative))
print("Very negative headlines: ", len(very_negative))
print('Somewhat positive headlines: ', len(somewhat_positive))
print("Very positive headlines: ", len(very_positive))
print('Total number of headlines: ', len(compound))

data_with_sentiment = list(zip(dates, headlines, compound))

Neutral headlines:  1166
Somewhat negative headlines:  252
Very negative headlines:  50
Somewhat positive headlines:  328
Very positive headlines:  52
Total number of headlines:  1848


# Tokenizing, lemmatizing and removing frequent keywords to improve clustering performance

In [5]:
#Tokenize and lemmatize headlines

def get_wordnet_pos(pos_tag):
    
    if pos_tag.startswith('J'):
        return wordnet.ADJ
    
    elif pos_tag.startswith('V'):
        return wordnet.VERB
    
    elif pos_tag.startswith('N'):
        return wordnet.NOUN
    
    elif pos_tag.startswith('R'):
        return wordnet.ADV
    
    else:
        return wordnet.NOUN
    
stop_words = stopwords.words('english')

def lemmatize(document):
    
    pattern=r'[a-zA-Z]+[a-zA-Z\-]+[a-zA-Z]'      
    tokens=nltk.regexp_tokenize(document, pattern)
    tagged_tokens=nltk.pos_tag(tokens)
    wordnet_lemmatizer=WordNetLemmatizer()
    
    le_words=[wordnet_lemmatizer.lemmatize(word, get_wordnet_pos(tag)) \
              for (word, tag) in tagged_tokens \
              if word not in stop_words and \
              word not in string.punctuation]
    
    le_words = list(set(le_words))
    return le_words

lem_headlines = []

for headline in headlines:
    
    lem_headline = lemmatize(headline)
    lem_headlines.append(lem_headline)

lem_data_with_sentiment = list(zip(dates, headlines, lem_headlines, compound))

#Function to remove frequent keywords from lemmatized headline tokens to improve clustering performance

def remove_keywords(data):
    for row in data:
        lem_list = row[2]
        for word in lem_list:
            if word in keywords:
                lem_list.remove(word)
        
    return data

#Preparing data for LDA clustering

data_for_clustering = remove_keywords(lem_data_with_sentiment)

# Building LDA clustering model for 5 clusters and storing the data in "Cleaned_data_final_5clusters_btc.csv"

In [6]:
#Building LDA Model for 5 clusters

headline = get_headlines(data_for_clustering,2)
dictionary = corpora.Dictionary(headline)
corpus = [dictionary.doc2bow(row) for row in headline]
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=5, id2word = dictionary, passes=20)
print(ldamodel.print_topics(num_topics=5, num_words=5))

#Classifying clusters and mapping them to headlines

cluster = ldamodel[corpus]
clusters = []
list_ = []

for t in cluster:
    list_ = (list(zip(*t))[1])
    clusters.append(list_.index(max(list_)))

date, headline, tokenized_headline, sentiment = zip(*data_for_clustering)
clean_data_5clusters = list(zip(date, headline, tokenized_headline, sentiment, clusters))

#Displaying the headlines within each cluster

filter(lambda x: x[1] == 0, clean_data_5clusters)
filter(lambda x: x[1] == 1, clean_data_5clusters)
filter(lambda x: x[1] == 2, clean_data_5clusters)
filter(lambda x: x[1] == 3, clean_data_5clusters)
filter(lambda x: x[1] == 4, clean_data_5clusters)

#Writing cleaned data to csv for 5 clusters

data_frame_5clusters = pd.DataFrame.from_records(clean_data_5clusters, columns = ["Date", "Headline", "Tokenized Headline", "Sentiment", "Cluster"])
data_frame_5clusters.to_csv("Cleaned_data_final_5clusters_btc.csv", encoding='utf-8')
print("\nAll data cleaned, clustered and written to csv file 'Cleaned_data_final_5clusters_btc.csv'")

[(0, '0.037*"market" + 0.018*"price" + 0.017*"update" + 0.010*"new" + 0.009*"cash"'), (1, '0.015*"say" + 0.011*"cash" + 0.010*"bank" + 0.010*"big" + 0.008*"south"'), (2, '0.034*"cash" + 0.015*"new" + 0.013*"exchange" + 0.009*"gold" + 0.008*"network"'), (3, '0.015*"exchange" + 0.014*"launch" + 0.012*"fund" + 0.011*"future" + 0.008*"cash"'), (4, '0.016*"say" + 0.010*"fork" + 0.010*"bank" + 0.009*"may" + 0.009*"bubble"')]

All data cleaned, clustered and written to csv file 'Cleaned_data_final_5clusters_btc.csv'


# Building LDA clustering model for 4 clusters and storing the data in "Cleaned_data_final_4clusters_btc.csv"

In [7]:
#Building LDA Model for 4 clusters
 
headline_list = get_headlines(data_for_clustering,2)
dictionary = corpora.Dictionary(headline_list)
corpus = [dictionary.doc2bow(row) for row in headline_list]
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=4, id2word = dictionary, passes=20)
print(ldamodel.print_topics(num_topics=4, num_words=5))

#Classifying clusters and mapping them to headlines

cluster = ldamodel[corpus]
clusters = []
list_ = []

for t in cluster:
    list_ = (list(zip(*t))[1])
    clusters.append(list_.index(max(list_)))

date, headline, tokenized_headline, sentiment = zip(*data_for_clustering)
clean_data_4clusters = list(zip(date, headline, tokenized_headline, sentiment, clusters))

#Displaying the headlines within each cluster

filter(lambda x: x[1] == 0, clean_data_4clusters)
filter(lambda x: x[1] == 1, clean_data_4clusters)
filter(lambda x: x[1] == 2, clean_data_4clusters)
filter(lambda x: x[1] == 3, clean_data_4clusters)

#Writing cleaned data to csv for 4 clusters

data_frame_4clusters = pd.DataFrame.from_records(clean_data_4clusters, columns = ["Date", "Headline", "Tokenized Headline", "Sentiment", "Cluster"])
data_frame_4clusters.to_csv("Cleaned_data_final_4clusters_btc.csv", encoding='utf-8')
print("\nAll data cleaned, clustered and written to csv file 'Cleaned_data_final_4clusters_btc.csv'")

[(0, '0.026*"market" + 0.014*"update" + 0.013*"price" + 0.010*"future" + 0.007*"million"'), (1, '0.027*"cash" + 0.011*"new" + 0.009*"fork" + 0.009*"gold" + 0.008*"bank"'), (2, '0.021*"cash" + 0.013*"exchange" + 0.010*"say" + 0.006*"fund" + 0.006*"bank"'), (3, '0.014*"exchange" + 0.011*"launch" + 0.010*"new" + 0.007*"first" + 0.006*"future"')]

All data cleaned, clustered and written to csv file 'Cleaned_data_final_4clusters_btc.csv'


# Generating pivot tables based on dates

In [8]:
#Generating pivot tables by date

pivot_table_5 = pd.pivot_table(data_frame_5clusters, values = 'Sentiment', index = 'Date', columns = 'Cluster', aggfunc = np.sum, fill_value = 0)
pivot_table_4 = pd.pivot_table(data_frame_4clusters, values = 'Sentiment', index = 'Date', columns = 'Cluster', aggfunc = np.sum, fill_value = 0)

pivot_table_5 = pivot_table_5.rename_axis(None, axis =1).reset_index()
pivot_table_4 = pivot_table_4.rename_axis(None, axis =1).reset_index()

# Merging pivot tables with bitcoin pricing data stored in files "Bitcoin1Day.csv" and "Bitcoin3Day.csv". These files have data from Kaggle

In [9]:
#Merge pivot tables with bitcoin pricing data

with open("Bitcoin1Day.csv", "r") as f:
    
    reader=csv.reader(f, delimiter=',') 
    bitcoin_1day = [row for row in reader]
    del bitcoin_1day[0]

date_list = []

for date in get_date(bitcoin_1day,0):
    date = datetime.strptime(date, '%m/%d/%Y')
    date = datetime.strftime(date,'%b %d, %Y')
    date_list.append(date)
    
date, dailyreturn = zip(*bitcoin_1day)
bitcoin_1day = zip(date_list,dailyreturn)
    
bitcoin_1day = pd.DataFrame.from_records(bitcoin_1day, columns = ["Date","DailyReturn"])
bitcoin_1day

bitcoin_1day_5clusters = bitcoin_1day.merge(pivot_table_5, on = 'Date', how = 'left')
bitcoin_1day_5clusters = bitcoin_1day_5clusters.dropna()

bitcoin_1day_4clusters = bitcoin_1day.merge(pivot_table_4, on = 'Date', how = 'left')
bitcoin_1day_4clusters = bitcoin_1day_4clusters.dropna()

with open("Bitcoin3Day.csv", "r") as f:
    
    reader=csv.reader(f, delimiter=',') 
    bitcoin_3day = [row for row in reader]
    del bitcoin_3day[0]

date_list = []

for date in get_date(bitcoin_3day,0):
    date = datetime.strptime(date, '%m/%d/%Y')
    date = datetime.strftime(date,'%b %d, %Y')
    date_list.append(date)
    
date, dailyreturn = zip(*bitcoin_3day)
bitcoin_3day = zip(date_list,dailyreturn)
    
bitcoin_3day = pd.DataFrame.from_records(bitcoin_3day, columns = ["Date","DailyReturn"])

bitcoin_3day_5clusters = bitcoin_3day.merge(pivot_table_5, on = 'Date', how = 'left')
bitcoin_3day_5clusters = bitcoin_3day_5clusters.dropna()

bitcoin_3day_4clusters = bitcoin_3day.merge(pivot_table_4, on = 'Date', how = 'left')
bitcoin_3day_4clusters = bitcoin_3day_4clusters.dropna()

# Performing Linear Regression 

In [10]:
#1 day 5 clusters

x_train = bitcoin_1day_5clusters.iloc[:,2:]
x_train = sm.add_constant(x_train)
y_train = bitcoin_1day_5clusters['DailyReturn']

model_1_5 = sm.OLS(y_train.astype(float), x_train.astype(float))
results_1_5 = model_1_5.fit()
print(results_1_5.summary())
print('\n******************************************************************************************************\n')

#1 day 4 clusters

x_train = bitcoin_1day_4clusters.iloc[:,2:]
x_train = sm.add_constant(x_train)
y_train = bitcoin_1day_4clusters['DailyReturn']

model_1_4 = sm.OLS(y_train.astype(float), x_train.astype(float))
results_1_4 = model_1_4.fit()
print(results_1_4.summary())
print('\n******************************************************************************************************\n')

#3 day 5 clusters

x_train = bitcoin_3day_5clusters.iloc[:,2:]
x_train = sm.add_constant(x_train)
y_train = bitcoin_3day_5clusters['DailyReturn']

model_3_5 = sm.OLS(y_train.astype(float), x_train.astype(float))
results_3_5 = model_3_5.fit()
print(results_3_5.summary())
print('\n******************************************************************************************************\n')
pred = results_3_5.predict(x_train.astype(float))

#3 day 4 clusters

x_train = bitcoin_3day_4clusters.iloc[:,2:]
x_train = sm.add_constant(x_train)
y_train = bitcoin_3day_4clusters['DailyReturn']

model_3_4 = sm.OLS(y_train.astype(float), x_train.astype(float))
results_3_4 = model_3_4.fit()
print(results_3_4.summary())
print('\n******************************************************************************************************\n')

bitcoin_1day_5clusters_binary = []

for value in bitcoin_1day_5clusters['DailyReturn']:
    if float(value) > 0:
        value_binary = 1
    else:
        value_binary = 0
        
    bitcoin_1day_5clusters_binary.append(value_binary)
    
bitcoin_1day_4clusters_binary = []

for value in bitcoin_1day_4clusters['DailyReturn']:
    if float(value) > 0:
        value_binary = 1
    else:
        value_binary = 0
        
    bitcoin_1day_4clusters_binary.append(value_binary)
    
bitcoin_3day_5clusters_binary = []

for value in bitcoin_3day_5clusters['DailyReturn']:
    if float(value) > 0:
        value_binary = 1
    else:
        value_binary = 0
        
    bitcoin_3day_5clusters_binary.append(value_binary)
    
bitcoin_3day_4clusters_binary = []

for value in bitcoin_3day_4clusters['DailyReturn']:
    if float(value) > 0:
        value_binary = 1
    else:
        value_binary = 0
        
    bitcoin_3day_4clusters_binary.append(value_binary)

                            OLS Regression Results                            
Dep. Variable:            DailyReturn   R-squared:                       0.041
Model:                            OLS   Adj. R-squared:                 -0.006
Method:                 Least Squares   F-statistic:                    0.8697
Date:                Fri, 27 Apr 2018   Prob (F-statistic):              0.504
Time:                        00:00:26   Log-Likelihood:                -318.37
No. Observations:                 107   AIC:                             648.7
Df Residuals:                     101   BIC:                             664.8
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          1.1218      0.503      2.232      0.0

# Performing Logistic Regression

In [11]:
LogReg = LogisticRegression()

#1 day 5 clusters

x_train = bitcoin_1day_5clusters.iloc[:,2:]
x_train = sm.add_constant(x_train)
y_train = bitcoin_1day_5clusters_binary

model_1_5_log = LogReg.fit(x_train.astype(float),y_train)
y_pred = model_1_5_log.predict(x_train.astype(float))
print(classification_report(y_pred, y_train))

#1 day 4 clusters

x_train = bitcoin_1day_4clusters.iloc[:,2:]
x_train = sm.add_constant(x_train)
y_train = bitcoin_1day_4clusters_binary

model_1_4_log = LogReg.fit(x_train.astype(float),y_train)
y_pred = model_1_4_log.predict(x_train.astype(float))
print(classification_report(y_pred, y_train))

#3 day 5 clusters

x_train = bitcoin_3day_5clusters.iloc[:,2:]
x_train = sm.add_constant(x_train)
y_train = bitcoin_3day_5clusters_binary

model_3_5_log = LogReg.fit(x_train.astype(float),y_train)
y_pred = model_3_5_log.predict(x_train.astype(float))
print(classification_report(y_pred, y_train))

#3 day 4 clusters

x_train = bitcoin_3day_4clusters.iloc[:,2:]
x_train = sm.add_constant(x_train)
y_train = bitcoin_3day_4clusters_binary

model_3_4_log = LogReg.fit(x_train.astype(float),y_train)
y_pred = model_3_4_log.predict(x_train.astype(float))
print(classification_report(y_pred, y_train))

             precision    recall  f1-score   support

          0       0.21      0.59      0.31        17
          1       0.88      0.58      0.70        90

avg / total       0.77      0.58      0.64       107

             precision    recall  f1-score   support

          0       0.31      0.58      0.41        26
          1       0.81      0.59      0.69        81

avg / total       0.69      0.59      0.62       107

             precision    recall  f1-score   support

          0       0.08      1.00      0.14         1
          1       1.00      0.65      0.79        34

avg / total       0.97      0.66      0.77        35

             precision    recall  f1-score   support

          0       0.00      0.00      0.00         0
          1       1.00      0.63      0.77        35

avg / total       1.00      0.63      0.77        35



  'recall', 'true', average, warn_for)


# Outputing the actual vs predicted returns in "Actual_Predicted_1day_5clust_btc.csv", "Actual_Predicted_1day_4clust_btc.csv", "Actual_Predicted_3day_5clust_btc.csv" and "Actual_Predicted_3day_4clust_btc.csv"

In [12]:
#Actual vs Predicted value comparison
#1 day 5 clusters

actual = bitcoin_1day_5clusters['DailyReturn'].tolist()
predicted = pred.tolist()
actual_predicted = list(zip(actual, predicted))

actual_predicted = pd.DataFrame.from_records(actual_predicted, columns = ["Actual Return", "Predicted Return"])
actual_predicted.to_csv("Actual_Predicted_1day_5clust_btc.csv", encoding='utf-8')

#1 day 4 clusters

actual = bitcoin_1day_4clusters['DailyReturn'].tolist()
predicted = pred.tolist()
actual_predicted = list(zip(actual, predicted))

actual_predicted = pd.DataFrame.from_records(actual_predicted, columns = ["Actual Return", "Predicted Return"])
actual_predicted.to_csv("Actual_Predicted_1day_4clust_btc.csv", encoding='utf-8')

#3 day 5 clusters

actual = bitcoin_3day_5clusters['DailyReturn'].tolist()
predicted = pred.tolist()
actual_predicted = list(zip(actual, predicted))

actual_predicted = pd.DataFrame.from_records(actual_predicted, columns = ["Actual Return", "Predicted Return"])
actual_predicted.to_csv("Actual_Predicted_3day_5clust_btc.csv", encoding='utf-8')

#3 day 4 clusters

actual = bitcoin_3day_4clusters['DailyReturn'].tolist()
predicted = pred.tolist()
actual_predicted = list(zip(actual, predicted))

actual_predicted = pd.DataFrame.from_records(actual_predicted, columns = ["Actual Return", "Predicted Return"])
actual_predicted.to_csv("Actual_Predicted_3day_4clust_btc.csv", encoding='utf-8')