In [1]:
from selenium import webdriver
import time
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as expected_conditions
import requests      
from datetime import datetime
from bs4 import BeautifulSoup  
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import wordnet
import re, csv, string 
import gensim
import pandas as pd
from gensim import corpora, models
import statsmodels.api as sm
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
import numpy as np
import dateutil.parser
from nltk.sentiment.vader import SentimentIntensityAnalyzer

  from pandas.core import datetools


# Defined functions to scrape Ethereum news from coindesk.com and to fetch headlines and date to be used later in sentiment analysis

In [2]:
#Separate headlines out from 'all_data_raw' variable so that it can be written to separate columns in csv

def get_headlines(data, index):
    
    headlines = []
    
    for row in data:
        headlines.append(row[index])
        
    return headlines

#Separate dates out from 'all_data' variable so that it can be written to separate columns in csv

def get_date(data, index):
    
    date = []
    
    for row in data:
        date.append(row[index])
        
    return date

#Handle cleaning of one specific date format

def tokenize_date(t):
    
    pattern=r'[a-zA-Z{3}]+[.\s]+[\d{1,2}\,\s]+[\d{4}]+'
    
    date_pattern = nltk.regexp_tokenize(t, pattern)
    
    return date_pattern

#Get Ethereum news from coindesk.com

def get_ethereumNews():
    
    new=[]
    url = "https://www.coindesk.com/category/technology-news/ethereum-technology-news/"
    executable_path = 'C:/Users/Rushabh Vakharia/geckodriver-v0.19.1-win64/geckodriver'
    service_log_path = 'C:/Users/Rushabh Vakharia/geckodriver-v0.19.1-win64/driver'
    
    driver = webdriver.Firefox(executable_path=executable_path)

    driver.get(url)
    html = driver.page_source.encode('utf-8')
    more_path='div#byscripts_ajax_posts_loader_trigger'
    longItemStr=[]
    
    for i in range(50):
        
        WebDriverWait(driver,10).until(expected_conditions.visibility_of_element_located((By.CSS_SELECTOR, more_path)))
        driver.execute_script("arguments[0].click();", WebDriverWait(driver,10).until(expected_conditions.visibility_of_element_located((By.CSS_SELECTOR, more_path))))
    
        headline=driver.find_elements_by_class_name("fade")  
        dates=driver.find_elements_by_css_selector("time")
        print("Getting page number "+str(i))

        for head in headline:
            if head.text!='' and head.text not in longItemStr:  
                longItemStr.append(head.text)
                    
        for date in dates:
            #if date not in new:
            date=str(dateutil.parser.parse(date.text).date())
            date=datetime.strptime(date,"%Y-%m-%d").strftime("%m/%d/%Y")
            dt_obj = datetime.strptime(date,'%m/%d/%Y')
            date= datetime.strftime(dt_obj,'%b %d, %Y')
            #dt_obj = datetime.strptime(date,'%m/%d/%Y')
            new.append(date)
                 
    raw_data = zip(longItemStr, new)
    driver.quit()
    return raw_data

# Calling the functions defined above to scrape the data and then store it in the list "all_data_raw"

In [3]:
if __name__ == "__main__":
    
    print("Scraping ethereum news from coindesk.com")
    print("*******************************************\n")
    ethereum_raw = get_ethereumNews()
    
    all_data_raw = []
    all_data_raw.extend(list(ethereum_raw))
    
    print("\nDone scraping all data and stored in list 'all_data_raw'")

Scraping ethereum news from coindesk.com
*******************************************

Getting page number 0
Getting page number 1
Getting page number 2
Getting page number 3
Getting page number 4
Getting page number 5
Getting page number 6
Getting page number 7
Getting page number 8
Getting page number 9
Getting page number 10
Getting page number 11
Getting page number 12
Getting page number 13
Getting page number 14
Getting page number 15
Getting page number 16
Getting page number 17
Getting page number 18
Getting page number 19
Getting page number 20
Getting page number 21
Getting page number 22
Getting page number 23
Getting page number 24
Getting page number 25
Getting page number 26
Getting page number 27
Getting page number 28
Getting page number 29
Getting page number 30
Getting page number 31
Getting page number 32
Getting page number 33
Getting page number 34
Getting page number 35
Getting page number 36
Getting page number 37
Getting page number 38
Getting page number 39
Gett

# Classifying the headlines as positive, negative and neutral and printing the results

In [6]:
#Filter out headlines without these keywords

keywords = ['ethereum', 'cryptocurrency','cryptocurrencies', 'crypto', 'blockchain', 'blockchains']

#Get headlines and dates as individual lists

headlines = get_headlines(all_data_raw,0)
dates = get_date(all_data_raw,1)
sid = SentimentIntensityAnalyzer()       
compound = []

for head in headlines:
    head_lower = head.lower()
    ss = sid.polarity_scores(head_lower)
    compound.append(ss['compound'])
    
#Counting the number of headlines in each sentiment class

neutral = []
somewhat_negative = []
somewhat_positive = []
very_negative = []
very_positive = []

for index, score in enumerate(compound):
    
    if score > -0.20 and score < 0.20:
        neutral.append(score)
        
    elif score > -0.60 and score < -0.20:
        somewhat_negative.append(score)
        
    elif score > 0.20 and score < 0.60:
        somewhat_positive.append(score)
        
    elif score <= -0.60:
        very_negative.append(score)
        
    else:
        very_positive.append(score)

print('Neutral headlines: ', len(neutral))
print('Somewhat negative headlines: ', len(somewhat_negative))
print("Very negative headlines: ", len(very_negative))
print('Somewhat positive headlines: ', len(somewhat_positive))
print("Very positive headlines: ", len(very_positive))
print('Total number of headlines: ', len(compound))

data_with_sentiment = list(zip(dates, headlines, compound))

Neutral headlines:  235
Somewhat negative headlines:  46
Very negative headlines:  7
Somewhat positive headlines:  71
Very positive headlines:  10
Total number of headlines:  369


# Tokenizing, lemmatizing and removing frequent keywords to improve clustering performance

In [7]:
#Tokenize and lemmatize headlines

def get_wordnet_pos(pos_tag):
    
    if pos_tag.startswith('J'):
        return wordnet.ADJ
    
    elif pos_tag.startswith('V'):
        return wordnet.VERB
    
    elif pos_tag.startswith('N'):
        return wordnet.NOUN
    
    elif pos_tag.startswith('R'):
        return wordnet.ADV
    
    else:
        return wordnet.NOUN
    
stop_words = stopwords.words('english')

def lemmatize(document):
    
    pattern=r'[a-zA-Z]+[a-zA-Z\-]+[a-zA-Z]'      
    tokens=nltk.regexp_tokenize(document, pattern)
    tagged_tokens=nltk.pos_tag(tokens)
    wordnet_lemmatizer=WordNetLemmatizer()
    
    le_words=[wordnet_lemmatizer.lemmatize(word, get_wordnet_pos(tag)) \
              for (word, tag) in tagged_tokens \
              if word not in stop_words and \
              word not in string.punctuation]
    
    le_words = list(set(le_words))
    return le_words

lem_headlines = []

for headline in headlines:
    
    lem_headline = lemmatize(headline)
    lem_headlines.append(lem_headline)

lem_data_with_sentiment = list(zip(dates, headlines, lem_headlines, compound))

#Function to remove frequent keywords from lemmatized headline tokens to improve clustering performance

def remove_keywords(data):
    for row in data:
        lem_list = row[2]
        for word in lem_list:
            if word in keywords:
                lem_list.remove(word)
        
    return data

#Preparing data for LDA clustering

data_for_clustering = remove_keywords(lem_data_with_sentiment)

# Building LDA clustering model for 5 clusters and storing the data in "Cleaned_data_final_5clusters_eth.csv"

In [12]:
#Building LDA Model for 5 clusters

headline = get_headlines(data_for_clustering,2)
dictionary = corpora.Dictionary(headline)
corpus = [dictionary.doc2bow(row) for row in headline]
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=5, id2word = dictionary, passes=20)
print(ldamodel.print_topics(num_topics=5, num_words=5))

#Classifying clusters and mapping them to headlines

cluster = ldamodel[corpus]
clusters = []
list_ = []

for t in cluster:
    list_ = (list(zip(*t))[1])
    clusters.append(list_.index(max(list_)))

date, headline, tokenized_headline, sentiment = zip(*data_for_clustering)
clean_data_5clusters = list(zip(date, headline, tokenized_headline, sentiment, clusters))

#Displaying the headlines within each cluster

filter(lambda x: x[1] == 0, clean_data_5clusters)
filter(lambda x: x[1] == 1, clean_data_5clusters)
filter(lambda x: x[1] == 2, clean_data_5clusters)
filter(lambda x: x[1] == 3, clean_data_5clusters)
filter(lambda x: x[1] == 4, clean_data_5clusters)

#Writing cleaned data to csv for 5 clusters

data_frame_5clusters = pd.DataFrame.from_records(clean_data_5clusters, columns = ["Date", "Headline", "Tokenized Headline", "Sentiment", "Cluster"])
data_frame_5clusters.to_csv("Cleaned_data_final_5clusters_eth.csv", encoding='utf-8')
print("\nAll data cleaned, clustered and written to csv file 'Cleaned_data_final_5clusters_eth.csv'")

[(0, '0.086*"Ethereum" + 0.039*"Blockchain" + 0.015*"Fork" + 0.014*"New" + 0.009*"Developers"'), (1, '0.038*"Ethereum" + 0.030*"Ether" + 0.015*"Blockchain" + 0.011*"Parity" + 0.011*"Bitcoin"'), (2, '0.051*"Ethereum" + 0.013*"The" + 0.009*"Cryptocurrency" + 0.009*"Now" + 0.008*"Blockchain"'), (3, '0.051*"Ethereum" + 0.019*"New" + 0.016*"Blockchain" + 0.009*"Fork" + 0.008*"The"'), (4, '0.061*"Ethereum" + 0.025*"The" + 0.020*"Vitalik" + 0.019*"DAO" + 0.015*"Buterin"')]

All data cleaned, clustered and written to csv file 'Cleaned_data_final_5clusters_eth.csv'


# Building LDA clustering model for 4 clusters and storing the data in "Cleaned_data_final_4clusters_eth.csv"

In [13]:
#Building LDA Model for 4 clusters

headline = get_headlines(data_for_clustering,2)
dictionary = corpora.Dictionary(headline)
corpus = [dictionary.doc2bow(row) for row in headline]
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=4, id2word = dictionary, passes=20)
print(ldamodel.print_topics(num_topics=4, num_words=5))

#Classifying clusters and mapping them to headlines

cluster = ldamodel[corpus]
clusters = []
list_ = []

for t in cluster:
    list_ = (list(zip(*t))[1])
    clusters.append(list_.index(max(list_)))

date, headline, tokenized_headline, sentiment = zip(*data_for_clustering)
clean_data_5clusters = list(zip(date, headline, tokenized_headline, sentiment, clusters))

#Displaying the headlines within each cluster

filter(lambda x: x[1] == 0, clean_data_5clusters)
filter(lambda x: x[1] == 1, clean_data_5clusters)
filter(lambda x: x[1] == 2, clean_data_5clusters)
filter(lambda x: x[1] == 3, clean_data_5clusters)

#Writing cleaned data to csv for 4 clusters

data_frame_4clusters = pd.DataFrame.from_records(clean_data_5clusters, columns = ["Date", "Headline", "Tokenized Headline", "Sentiment", "Cluster"])
data_frame_4clusters.to_csv("Cleaned_data_final_4clusters_eth.csv", encoding='utf-8')
print("\nAll data cleaned, clustered and written to csv file 'Cleaned_data_final_4clusters_eth.csv'")

[(0, '0.055*"Ethereum" + 0.021*"Blockchain" + 0.016*"Ether" + 0.011*"New" + 0.011*"The"'), (1, '0.034*"Ethereum" + 0.017*"Blockchain" + 0.013*"Over" + 0.012*"Million" + 0.010*"DAO"'), (2, '0.074*"Ethereum" + 0.026*"Blockchain" + 0.012*"DAO" + 0.012*"The" + 0.011*"Fork"'), (3, '0.070*"Ethereum" + 0.013*"Blockchain" + 0.013*"Launch" + 0.010*"New" + 0.009*"Are"')]

All data cleaned, clustered and written to csv file 'Cleaned_data_final_4clusters_eth.csv'


# Generating pivot tables based on dates

In [14]:
#Generating pivot tables by date

pivot_table_5 = pd.pivot_table(data_frame_5clusters, values = 'Sentiment', index = 'Date', columns = 'Cluster', aggfunc = np.sum, fill_value = 0)
pivot_table_4 = pd.pivot_table(data_frame_4clusters, values = 'Sentiment', index = 'Date', columns = 'Cluster', aggfunc = np.sum, fill_value = 0)

pivot_table_5 = pivot_table_5.rename_axis(None, axis =1).reset_index()
pivot_table_4 = pivot_table_4.rename_axis(None, axis =1).reset_index()

# Merging pivot tables with ethereum pricing data stored in files "Ethereum1Day.csv" and "Ethereum3Day.csv". These files have data from Kaggle

In [15]:
#Merge pivot tables with bitcoin pricing data

with open("Ethereum1Day.csv", "r") as f:
    
    reader=csv.reader(f, delimiter=',') 
    ethereum_1day = [row for row in reader]
    del ethereum_1day[0]

date_list = []

for date in get_date(ethereum_1day,0):
    date = datetime.strptime(date, '%m/%d/%Y')
    date = datetime.strftime(date,'%b %d, %Y')
    date_list.append(date)
    
date, dailyreturn = zip(*ethereum_1day)
ethereum_1day = zip(date_list,dailyreturn)
    
ethereum_1day = pd.DataFrame.from_records(ethereum_1day, columns = ["Date","DailyReturn"])
ethereum_1day

ethereum_1day_5clusters = ethereum_1day.merge(pivot_table_5, on = 'Date', how = 'left')
ethereum_1day_5clusters = ethereum_1day_5clusters.dropna()

ethereum_1day_4clusters = ethereum_1day.merge(pivot_table_4, on = 'Date', how = 'left')
ethereum_1day_4clusters = ethereum_1day_4clusters.dropna()

with open("Ethereum3Day.csv", "r") as f:
    
    reader=csv.reader(f, delimiter=',') 
    ethereum_3day = [row for row in reader]
    del ethereum_3day[0]

date_list = []

for date in get_date(ethereum_3day,0):
    date = datetime.strptime(date, '%m/%d/%Y')
    date = datetime.strftime(date,'%b %d, %Y')
    date_list.append(date)
    
date, dailyreturn = zip(*ethereum_3day)
ethereum_3day = zip(date_list,dailyreturn)
    
ethereum_3day = pd.DataFrame.from_records(ethereum_3day, columns = ["Date","DailyReturn"])

ethereum_3day_5clusters = ethereum_3day.merge(pivot_table_5, on = 'Date', how = 'left')
ethereum_3day_5clusters = ethereum_3day_5clusters.dropna()

ethereum_3day_4clusters = ethereum_3day.merge(pivot_table_4, on = 'Date', how = 'left')
ethereum_3day_4clusters = ethereum_3day_4clusters.dropna()

# Performing Linear Regression

In [16]:
#1 day 5 clusters

x_train = ethereum_1day_5clusters.iloc[:,2:]
x_train = sm.add_constant(x_train)
y_train = ethereum_1day_5clusters['DailyReturn']

model_1_5 = sm.OLS(y_train.astype(float), x_train.astype(float))
results_1_5 = model_1_5.fit()
print(results_1_5.summary())
print('\n******************************************************************************************************\n')

#1 day 4 clusters

x_train = ethereum_1day_4clusters.iloc[:,2:]
x_train = sm.add_constant(x_train)
y_train = ethereum_1day_4clusters['DailyReturn']

model_1_4 = sm.OLS(y_train.astype(float), x_train.astype(float))
results_1_4 = model_1_4.fit()
print(results_1_4.summary())
print('\n******************************************************************************************************\n')

#3 day 5 clusters

x_train = ethereum_3day_5clusters.iloc[:,2:]
x_train = sm.add_constant(x_train)
y_train = ethereum_3day_5clusters['DailyReturn']

model_3_5 = sm.OLS(y_train.astype(float), x_train.astype(float))
results_3_5 = model_3_5.fit()
print(results_3_5.summary())
print('\n******************************************************************************************************\n')
pred = results_3_5.predict(x_train.astype(float))

#3 day 4 clusters

x_train = ethereum_3day_4clusters.iloc[:,2:]
x_train = sm.add_constant(x_train)
y_train = ethereum_3day_5clusters['DailyReturn']

model_3_4 = sm.OLS(y_train.astype(float), x_train.astype(float))
results_3_4 = model_3_4.fit()
print(results_3_4.summary())
print('\n******************************************************************************************************\n')

ethereum_1day_5clusters_binary = []

for value in ethereum_1day_5clusters['DailyReturn']:
    if float(value) > 0:
        value_binary = 1
    else:
        value_binary = 0
        
    ethereum_1day_5clusters_binary.append(value_binary)
    
ethereum_1day_4clusters_binary = []

for value in ethereum_1day_4clusters['DailyReturn']:
    if float(value) > 0:
        value_binary = 1
    else:
        value_binary = 0
        
    ethereum_1day_4clusters_binary.append(value_binary)
    
ethereum_3day_5clusters_binary = []

for value in ethereum_3day_5clusters['DailyReturn']:
    if float(value) > 0:
        value_binary = 1
    else:
        value_binary = 0
        
    ethereum_3day_5clusters_binary.append(value_binary)
    
ethereum_3day_4clusters_binary = []

for value in ethereum_3day_4clusters['DailyReturn']:
    if float(value) > 0:
        value_binary = 1
    else:
        value_binary = 0
        
    ethereum_3day_4clusters_binary.append(value_binary)

                            OLS Regression Results                            
Dep. Variable:            DailyReturn   R-squared:                       0.155
Model:                            OLS   Adj. R-squared:                 -0.043
Method:                 Least Squares   F-statistic:                    0.7825
Date:                Fri, 27 Apr 2018   Prob (F-statistic):              0.552
Time:                        00:09:56   Log-Likelihood:                -70.319
No. Observations:                  22   AIC:                             150.6
Df Residuals:                      17   BIC:                             156.1
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          3.3506      1.784      1.878      0.0

  return np.sqrt(eigvals[0]/eigvals[-1])
  return self.params / self.bse
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)


# Performing Logistic Regression

In [17]:
LogReg = LogisticRegression()

#1 day 5 clusters

x_train = ethereum_1day_5clusters.iloc[:,2:]
x_train = sm.add_constant(x_train)
y_train = ethereum_1day_5clusters_binary

model_1_5_log = LogReg.fit(x_train.astype(float),y_train)
y_pred = model_1_5_log.predict(x_train.astype(float))
print(classification_report(y_pred, y_train))

#1 day 4 clusters

x_train = ethereum_1day_4clusters.iloc[:,2:]
x_train = sm.add_constant(x_train)
y_train = ethereum_1day_4clusters_binary

model_1_4_log = LogReg.fit(x_train.astype(float),y_train)
y_pred = model_1_4_log.predict(x_train.astype(float))
print(classification_report(y_pred, y_train))

#3 day 5 clusters

x_train = ethereum_3day_5clusters.iloc[:,2:]
x_train = sm.add_constant(x_train)
y_train = ethereum_3day_5clusters_binary

model_3_5_log = LogReg.fit(x_train.astype(float),y_train)
y_pred = model_3_5_log.predict(x_train.astype(float))
print(classification_report(y_pred, y_train))

#3 day 4 clusters

x_train = ethereum_3day_4clusters.iloc[:,2:]
x_train = sm.add_constant(x_train)
y_train = ethereum_3day_4clusters_binary

model_3_4_log = LogReg.fit(x_train.astype(float),y_train)
y_pred = model_3_4_log.predict(x_train.astype(float))
print(classification_report(y_pred, y_train))

             precision    recall  f1-score   support

          0       0.00      0.00      0.00         0
          1       1.00      0.59      0.74        22

avg / total       1.00      0.59      0.74        22

             precision    recall  f1-score   support

          0       0.11      1.00      0.20         1
          1       1.00      0.62      0.76        21

avg / total       0.96      0.64      0.74        22

             precision    recall  f1-score   support

          0       0.00      0.00      0.00         0
          1       1.00      0.73      0.84        22

avg / total       1.00      0.73      0.84        22

             precision    recall  f1-score   support

          0       0.00      0.00      0.00         0
          1       1.00      0.73      0.84        22

avg / total       1.00      0.73      0.84        22



  'recall', 'true', average, warn_for)


# Outputing the actual vs predicted returns in "Actual_Predicted_1day_5clust_eth.csv", "Actual_Predicted_1day_4clust_eth.csv", "Actual_Predicted_3day_5clust_eth.csv" and "Actual_Predicted_3day_4clust_eth.csv"

In [18]:
#Actual vs Predicted value comparison
#1 day 5 clusters

actual = ethereum_1day_5clusters['DailyReturn'].tolist()
predicted = pred.tolist()
actual_predicted = list(zip(actual, predicted))

actual_predicted = pd.DataFrame.from_records(actual_predicted, columns = ["Actual Return", "Predicted Return"])
actual_predicted.to_csv("Actual_Predicted_1day_5clust_eth.csv", encoding='utf-8')

#1 day 4 clusters

actual = ethereum_1day_4clusters['DailyReturn'].tolist()
predicted = pred.tolist()
actual_predicted = list(zip(actual, predicted))

actual_predicted = pd.DataFrame.from_records(actual_predicted, columns = ["Actual Return", "Predicted Return"])
actual_predicted.to_csv("Actual_Predicted_1day_4clust_eth.csv", encoding='utf-8')

#3 day 5 clusters

actual = ethereum_3day_5clusters['DailyReturn'].tolist()
predicted = pred.tolist()
actual_predicted = list(zip(actual, predicted))

actual_predicted = pd.DataFrame.from_records(actual_predicted, columns = ["Actual Return", "Predicted Return"])
actual_predicted.to_csv("Actual_Predicted_3day_5clust_eth.csv", encoding='utf-8')

#3 day 4 clusters

actual = ethereum_3day_4clusters['DailyReturn'].tolist()
predicted = pred.tolist()
actual_predicted = list(zip(actual, predicted))

actual_predicted = pd.DataFrame.from_records(actual_predicted, columns = ["Actual Return", "Predicted Return"])
actual_predicted.to_csv("Actual_Predicted_3day_4clust_eth.csv", encoding='utf-8')