In [1]:
import requests      
from datetime import datetime
from bs4 import BeautifulSoup  
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import wordnet
import re, csv, string 
import gensim
import pandas as pd
from gensim import corpora, models
import statsmodels.api as sm
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
import numpy as np
import dateutil.parser
from nltk.sentiment.vader import SentimentIntensityAnalyzer

  from pandas.core import datetools


# Defined functions to scrape Litecoin news from litecoinnews.io and to fetch headlines and date to be used later in sentiment analysis

In [2]:
#Separate headlines out from 'all_data_raw' variable so that it can be written to separate columns in csv

def get_headlines(data, index):
    
    headlines = []
    
    for row in data:
        headlines.append(row[index])
        
    return headlines

#Separate dates out from 'all_data' variable so that it can be written to separate columns in csv

def get_date(data, index):
    
    date = []
    
    for row in data:
        date.append(row[index])
        
    return date

#Handle cleaning of one specific date format

def tokenize_date(t):
    
    pattern=r'[a-zA-Z{3}]+[.\s]+[\d{1,2}\,\s]+[\d{4}]+' 
    
    date_pattern = nltk.regexp_tokenize(t, pattern)
    
    return date_pattern

#Get Litecoin news from litecoinnews.io

def get_litecoinNews():
    
    headlines=[]  #List to store headlines
    dates = []    #List to store date
    raw_headlines = []
    page_number = 1
    page_url="https://litecoinnews.io/"
    raw_data=[]
    final=[]

    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'}
    
    while page_url!=None:
        
        page = requests.get(page_url,headers=headers) 
        
        if page.status_code!=200:    
            page_url=None
        else:                       
            page = requests.get(page_url,headers=headers) 
            soup = BeautifulSoup(page.content, 'html.parser')
            divs = soup.findAll("div", {"class": "main-post"})

            for idx, div in enumerate(divs):
                
                headline=None  
                date=None
                
                headline=div.select("h5.post-title")[0].get_text()  
                
                date=div.select("time")[0].get_text()
                date=str(dateutil.parser.parse(date).date())
                date=datetime.strptime(date,"%Y-%m-%d").strftime("%m/%d/%Y")
                dt_obj = datetime.strptime(date,'%m/%d/%Y')
                date=date=datetime.strftime(dt_obj,'%b %d, %Y')
                
                raw_data.append((headline.replace('\n',""),date))
                
        tag=soup.find("ul",class_="pagination")
        new=tag.find("li", {"class": "active"})
        divTag=((new).find_next_sibling("li"))
        print(page_url)
        
        if(divTag!=None):
            page_url=divTag.select("a")[0].get('href')
        else:
            page_url=None            

    return raw_data

# Calling the functions defined above to scrape the data and then store it in the list "all_data_raw"

In [3]:
if __name__ == "__main__":
    
    print("Scraping litecoin news from litecoinnews.io")
    print("*******************************************\n")
    litecoin_raw = get_litecoinNews()
    
    all_data_raw = []
    all_data_raw.extend(list(litecoin_raw))
    
    print("\nDone scraping all data and stored in list 'all_data_raw'")

Scraping litecoin news from litecoinnews.io
*******************************************

https://litecoinnews.io/
https://litecoinnews.io/?page=2
https://litecoinnews.io/?page=3
https://litecoinnews.io/?page=4
https://litecoinnews.io/?page=5
https://litecoinnews.io/?page=6
https://litecoinnews.io/?page=7
https://litecoinnews.io/?page=8
https://litecoinnews.io/?page=9
https://litecoinnews.io/?page=10
https://litecoinnews.io/?page=11
https://litecoinnews.io/?page=12
https://litecoinnews.io/?page=13
https://litecoinnews.io/?page=14
https://litecoinnews.io/?page=15
https://litecoinnews.io/?page=16
https://litecoinnews.io/?page=17
https://litecoinnews.io/?page=18
https://litecoinnews.io/?page=19
https://litecoinnews.io/?page=20
https://litecoinnews.io/?page=21
https://litecoinnews.io/?page=22
https://litecoinnews.io/?page=23
https://litecoinnews.io/?page=24
https://litecoinnews.io/?page=25
https://litecoinnews.io/?page=26
https://litecoinnews.io/?page=27
https://litecoinnews.io/?page=28
htt

# Classifying the headlines as positive, negative and neutral and printing the results

In [4]:
#Filter out headlines without these keywords

keywords = ['litecoin', 'cryptocurrency','cryptocurrencies', 'crypto', 'blockchain', 'blockchains']

#Get headlines and dates as individual lists

headlines = get_headlines(list(filter(lambda x: any(word in x[0] for word in keywords), all_data_raw)),0)
dates = get_date(list(filter(lambda x: any(word in x[0] for word in keywords), all_data_raw)),1)
sid = SentimentIntensityAnalyzer()       
compound = []

for head in headlines:
    
    head_lower = head.lower()
    ss = sid.polarity_scores(head_lower)
    compound.append(ss['compound'])
    
#Counting the number of headlines in each sentiment class

neutral = []
somewhat_negative = []
somewhat_positive = []
very_negative = []
very_positive = []

for index, score in enumerate(compound):
    
    if score > -0.20 and score < 0.20:
        neutral.append(score)
        
    elif score > -0.60 and score < -0.20:
        somewhat_negative.append(score)
        
    elif score > 0.20 and score < 0.60:
        somewhat_positive.append(score)
        
    elif score <= -0.60:
        very_negative.append(score)
        
    else:
        very_positive.append(score)

print('Neutral headlines: ', len(neutral))
print('Somewhat negative headlines: ', len(somewhat_negative))
print("Very negative headlines: ", len(very_negative))
print('Somewhat positive headlines: ', len(somewhat_positive))
print("Very positive headlines: ", len(very_positive))
print('Total number of headlines: ', len(compound))

data_with_sentiment = list(zip(dates, headlines, compound))

Neutral headlines:  35
Somewhat negative headlines:  9
Very negative headlines:  0
Somewhat positive headlines:  11
Very positive headlines:  2
Total number of headlines:  57


# Tokenizing, lemmatizing and removing frequent keywords to improve clustering performance

In [5]:
#Tokenize and lemmatize headlines

def get_wordnet_pos(pos_tag):
    
    if pos_tag.startswith('J'):
        return wordnet.ADJ
    
    elif pos_tag.startswith('V'):
        return wordnet.VERB
    
    elif pos_tag.startswith('N'):
        return wordnet.NOUN
    
    elif pos_tag.startswith('R'):
        return wordnet.ADV
    
    else:
        return wordnet.NOUN
    
stop_words = stopwords.words('english')

def lemmatize(document):
    
    pattern=r'[a-zA-Z]+[a-zA-Z\-]+[a-zA-Z]'      
    tokens=nltk.regexp_tokenize(document, pattern)
    tagged_tokens=nltk.pos_tag(tokens)
    wordnet_lemmatizer=WordNetLemmatizer()
    
    le_words=[wordnet_lemmatizer.lemmatize(word, get_wordnet_pos(tag)) \
              for (word, tag) in tagged_tokens \
              if word not in stop_words and \
              word not in string.punctuation]
    
    le_words = list(set(le_words))
    return le_words

lem_headlines = []

for headline in headlines:
    
    lem_headline = lemmatize(headline)
    lem_headlines.append(lem_headline)

lem_data_with_sentiment = list(zip(dates, headlines, lem_headlines, compound))

#Function to remove frequent keywords from lemmatized headline tokens to improve clustering performance

def remove_keywords(data):
    for row in data:
        lem_list = row[2]
        for word in lem_list:
            if word in keywords:
                lem_list.remove(word)
        
    return data

#Preparing data for LDA clustering

data_for_clustering = remove_keywords(lem_data_with_sentiment)

# Building LDA clustering model for 5 clusters and storing the data in "Cleaned_data_final_5clusters_ltc.csv"

In [6]:
#Building LDA Model for 5 clusters

headline = get_headlines(data_for_clustering,2)
dictionary = corpora.Dictionary(headline)
corpus = [dictionary.doc2bow(row) for row in headline]
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=5, id2word = dictionary, passes=20)
print(ldamodel.print_topics(num_topics=5, num_words=5))

#Classifying clusters and mapping them to headlines

cluster = ldamodel[corpus]
clusters = []
list_ = []

for t in cluster:
    list_ = (list(zip(*t))[1])
    clusters.append(list_.index(max(list_)))

date, headline, tokenized_headline, sentiment = zip(*data_for_clustering)
clean_data_5clusters = list(zip(date, headline, tokenized_headline, sentiment, clusters))

#Displaying the headlines within each cluster

filter(lambda x: x[1] == 0, clean_data_5clusters)
filter(lambda x: x[1] == 1, clean_data_5clusters)
filter(lambda x: x[1] == 2, clean_data_5clusters)
filter(lambda x: x[1] == 3, clean_data_5clusters)
filter(lambda x: x[1] == 4, clean_data_5clusters)

#Writing cleaned data to csv for 5 clusters

data_frame_5clusters = pd.DataFrame.from_records(clean_data_5clusters, columns = ["Date", "Headline", "Tokenized Headline", "Sentiment", "Cluster"])
data_frame_5clusters.to_csv("Cleaned_data_final_5clusters_ltc.csv", encoding='utf-8')
print("\nAll data cleaned, clustered and written to csv file 'Cleaned_data_final_5clusters_ltc.csv'")

[(0, '0.033*"market" + 0.025*"price" + 0.025*"Litecoin" + 0.017*"founder" + 0.017*"The"'), (1, '0.057*"Litecoin" + 0.050*"LTC" + 0.047*"USD" + 0.047*"price" + 0.030*"market"'), (2, '0.062*"Litecoin" + 0.032*"LTC" + 0.022*"The" + 0.022*"FOMC" + 0.022*"wait"'), (3, '0.024*"LTC" + 0.020*"Ripple" + 0.020*"exchange" + 0.020*"trading" + 0.020*"payment"'), (4, '0.040*"Litecoin" + 0.017*"add" + 0.017*"say" + 0.017*"sell" + 0.017*"week"')]

All data cleaned, clustered and written to csv file 'Cleaned_data_final_5clusters_ltc.csv'


# Building LDA clustering model for 4 clusters and storing the data in "Cleaned_data_final_4clusters_ltc.csv"

In [7]:
#Building LDA Model for 4 clusters
 
headline_list = get_headlines(data_for_clustering,2)
dictionary = corpora.Dictionary(headline_list)
corpus = [dictionary.doc2bow(row) for row in headline_list]
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=4, id2word = dictionary, passes=20)
print(ldamodel.print_topics(num_topics=4, num_words=5))

#Classifying clusters and mapping them to headlines

cluster = ldamodel[corpus]
clusters = []
list_ = []

for t in cluster:
    list_ = (list(zip(*t))[1])
    clusters.append(list_.index(max(list_)))

date, headline, tokenized_headline, sentiment = zip(*data_for_clustering)
clean_data_4clusters = list(zip(date, headline, tokenized_headline, sentiment, clusters))

#Displaying the headlines within each cluster

filter(lambda x: x[1] == 0, clean_data_4clusters)
filter(lambda x: x[1] == 1, clean_data_4clusters)
filter(lambda x: x[1] == 2, clean_data_4clusters)
filter(lambda x: x[1] == 3, clean_data_4clusters)

#Writing cleaned data to csv for 4 clusters

data_frame_4clusters = pd.DataFrame.from_records(clean_data_4clusters, columns = ["Date", "Headline", "Tokenized Headline", "Sentiment", "Cluster"])
data_frame_4clusters.to_csv("Cleaned_data_final_4clusters_ltc.csv", encoding='utf-8')
print("\nAll data cleaned, clustered and written to csv file 'Cleaned_data_final_4clusters_ltc.csv'")

[(0, '0.052*"Litecoin" + 0.024*"market" + 0.023*"LTC" + 0.020*"founder" + 0.016*"The"'), (1, '0.022*"Litecoin" + 0.015*"say" + 0.015*"surge" + 0.015*"payment" + 0.015*"bitcoin"'), (2, '0.028*"bank" + 0.028*"LTC" + 0.028*"Litecoin" + 0.020*"new" + 0.019*"solution"'), (3, '0.068*"price" + 0.056*"USD" + 0.051*"Litecoin" + 0.047*"LTC" + 0.027*"market"')]

All data cleaned, clustered and written to csv file 'Cleaned_data_final_4clusters_ltc.csv'


# Generating pivot tables based on dates

In [8]:
#Generating pivot tables by date

pivot_table_5 = pd.pivot_table(data_frame_5clusters, values = 'Sentiment', index = 'Date', columns = 'Cluster', aggfunc = np.sum, fill_value = 0)
pivot_table_4 = pd.pivot_table(data_frame_4clusters, values = 'Sentiment', index = 'Date', columns = 'Cluster', aggfunc = np.sum, fill_value = 0)

pivot_table_5 = pivot_table_5.rename_axis(None, axis =1).reset_index()
pivot_table_4 = pivot_table_4.rename_axis(None, axis =1).reset_index()

# Merging pivot tables with litecoin pricing data stored in files "Litecoin1Day.csv" and "Litecoin3Day.csv". These files have data from Kaggle

In [9]:
#Merge pivot tables with bitcoin pricing data

with open("Litecoin1Day.csv", "r") as f:
    
    reader=csv.reader(f, delimiter=',') 
    litecoin_1day = [row for row in reader]
    del litecoin_1day[0]

date_list = []

for date in get_date(litecoin_1day,0):
    date = datetime.strptime(date, '%m/%d/%Y')
    date = datetime.strftime(date,'%b %d, %Y')
    date_list.append(date)
    
date, dailyreturn = zip(*litecoin_1day)
litecoin_1day = zip(date_list,dailyreturn)
    
litecoin_1day = pd.DataFrame.from_records(litecoin_1day, columns = ["Date","DailyReturn"])
litecoin_1day

litecoin_1day_5clusters = litecoin_1day.merge(pivot_table_5, on = 'Date', how = 'left')
litecoin_1day_5clusters = litecoin_1day_5clusters.dropna()

litecoin_1day_4clusters = litecoin_1day.merge(pivot_table_4, on = 'Date', how = 'left')
litecoin_1day_4clusters = litecoin_1day_4clusters.dropna()

with open("Litecoin3Day.csv", "r") as f:
    
    reader=csv.reader(f, delimiter=',') 
    litecoin_3day = [row for row in reader]
    del litecoin_3day[0]

date_list = []

for date in get_date(litecoin_3day,0):
    date = datetime.strptime(date, '%m/%d/%Y')
    date = datetime.strftime(date,'%b %d, %Y')
    date_list.append(date)
    
date, dailyreturn = zip(*litecoin_3day)
litecoin_3day = zip(date_list,dailyreturn)
    
litecoin_3day = pd.DataFrame.from_records(litecoin_3day, columns = ["Date","DailyReturn"])

litecoin_3day_5clusters = litecoin_3day.merge(pivot_table_5, on = 'Date', how = 'left')
litecoin_3day_5clusters = litecoin_3day_5clusters.dropna()

litecoin_3day_4clusters = litecoin_3day.merge(pivot_table_4, on = 'Date', how = 'left')
litecoin_3day_4clusters = litecoin_3day_4clusters.dropna()

# Performing Linear Regression

In [10]:
#1 day 5 clusters

x_train = litecoin_1day_5clusters.iloc[:,2:]
x_train = sm.add_constant(x_train)
y_train = litecoin_1day_5clusters['DailyReturn']

model_1_5 = sm.OLS(y_train.astype(float), x_train.astype(float))
results_1_5 = model_1_5.fit()
print(results_1_5.summary())
print('\n******************************************************************************************************\n')

#1 day 4 clusters

x_train = litecoin_1day_4clusters.iloc[:,2:]
x_train = sm.add_constant(x_train)
y_train = litecoin_1day_4clusters['DailyReturn']

model_1_4 = sm.OLS(y_train.astype(float), x_train.astype(float))
results_1_4 = model_1_4.fit()
print(results_1_4.summary())
print('\n******************************************************************************************************\n')

#3 day 5 clusters

x_train = litecoin_3day_5clusters.iloc[:,2:]
x_train = sm.add_constant(x_train)
y_train = litecoin_3day_5clusters['DailyReturn']

model_3_5 = sm.OLS(y_train.astype(float), x_train.astype(float))
results_3_5 = model_3_5.fit()
print(results_3_5.summary())
print('\n******************************************************************************************************\n')
pred = results_3_5.predict(x_train.astype(float))

#3 day 4 clusters

x_train = litecoin_3day_4clusters.iloc[:,2:]
x_train = sm.add_constant(x_train)
y_train = litecoin_3day_4clusters['DailyReturn']

model_3_4 = sm.OLS(y_train.astype(float), x_train.astype(float))
results_3_4 = model_3_4.fit()
print(results_3_4.summary())
print('\n******************************************************************************************************\n')

litecoin_1day_5clusters_binary = []

for value in litecoin_1day_5clusters['DailyReturn']:
    if float(value) > 0:
        value_binary = 1
    else:
        value_binary = 0
        
    litecoin_1day_5clusters_binary.append(value_binary)
    
litecoin_1day_4clusters_binary = []

for value in litecoin_1day_4clusters['DailyReturn']:
    if float(value) > 0:
        value_binary = 1
    else:
        value_binary = 0
        
    litecoin_1day_4clusters_binary.append(value_binary)
    
litecoin_3day_5clusters_binary = []

for value in litecoin_3day_5clusters['DailyReturn']:
    if float(value) > 0:
        value_binary = 1
    else:
        value_binary = 0
        
    litecoin_3day_5clusters_binary.append(value_binary)
    
litecoin_3day_4clusters_binary = []

for value in litecoin_3day_4clusters['DailyReturn']:
    if float(value) > 0:
        value_binary = 1
    else:
        value_binary = 0
        
    litecoin_3day_4clusters_binary.append(value_binary)

                            OLS Regression Results                            
Dep. Variable:            DailyReturn   R-squared:                       0.151
Model:                            OLS   Adj. R-squared:                 -0.006
Method:                 Least Squares   F-statistic:                    0.9599
Date:                Thu, 26 Apr 2018   Prob (F-statistic):              0.459
Time:                        23:51:59   Log-Likelihood:                -130.02
No. Observations:                  33   AIC:                             272.0
Df Residuals:                      27   BIC:                             281.0
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          4.2502      2.642      1.609      0.1

# Performing Logistic Regression

In [11]:
LogReg = LogisticRegression()

#1 day 5 clusters

x_train = litecoin_1day_5clusters.iloc[:,2:]
x_train = sm.add_constant(x_train)
y_train = litecoin_1day_5clusters_binary

model_1_5_log = LogReg.fit(x_train.astype(float),y_train)
y_pred = model_1_5_log.predict(x_train.astype(float))
print(classification_report(y_pred, y_train))

#1 day 4 clusters

x_train = litecoin_1day_4clusters.iloc[:,2:]
x_train = sm.add_constant(x_train)
y_train = litecoin_1day_4clusters_binary

model_1_4_log = LogReg.fit(x_train.astype(float),y_train)
y_pred = model_1_4_log.predict(x_train.astype(float))
print(classification_report(y_pred, y_train))

#3 day 5 clusters

x_train = litecoin_3day_5clusters.iloc[:,2:]
x_train = sm.add_constant(x_train)
y_train = litecoin_3day_5clusters_binary

model_3_5_log = LogReg.fit(x_train.astype(float),y_train)
y_pred = model_3_5_log.predict(x_train.astype(float))
print(classification_report(y_pred, y_train))

#3 day 4 clusters

x_train = litecoin_3day_4clusters.iloc[:,2:]
x_train = sm.add_constant(x_train)
y_train = litecoin_3day_4clusters_binary

model_3_4_log = LogReg.fit(x_train.astype(float),y_train)
y_pred = model_3_4_log.predict(x_train.astype(float))
print(classification_report(y_pred, y_train))

             precision    recall  f1-score   support

          0       1.00      0.59      0.75        32
          1       0.07      1.00      0.13         1

avg / total       0.97      0.61      0.73        33

             precision    recall  f1-score   support

          0       1.00      0.59      0.75        32
          1       0.07      1.00      0.13         1

avg / total       0.97      0.61      0.73        33

             precision    recall  f1-score   support

          0       0.82      0.54      0.65        26
          1       0.25      0.57      0.35         7

avg / total       0.70      0.55      0.59        33

             precision    recall  f1-score   support

          0       0.94      0.53      0.68        30
          1       0.12      0.67      0.21         3

avg / total       0.87      0.55      0.64        33



# Outputing the actual vs predicted returns in "Actual_Predicted_1day_5clust_ltc.csv", "Actual_Predicted_1day_4clust_ltc.csv", "Actual_Predicted_3day_5clust_ltc.csv" and "Actual_Predicted_3day_4clust_ltc.csv"

In [12]:
#Actual vs Predicted value comparison
#1 day 5 clusters

actual = litecoin_1day_5clusters['DailyReturn'].tolist()
predicted = pred.tolist()
actual_predicted = list(zip(actual, predicted))

actual_predicted = pd.DataFrame.from_records(actual_predicted, columns = ["Actual Return", "Predicted Return"])
actual_predicted.to_csv("Actual_Predicted_1day_5clust_ltc.csv", encoding='utf-8')

#1 day 4 clusters

actual = litecoin_1day_4clusters['DailyReturn'].tolist()
predicted = pred.tolist()
actual_predicted = list(zip(actual, predicted))

actual_predicted = pd.DataFrame.from_records(actual_predicted, columns = ["Actual Return", "Predicted Return"])
actual_predicted.to_csv("Actual_Predicted_1day_4clust_ltc.csv", encoding='utf-8')

#3 day 5 clusters

actual = litecoin_3day_5clusters['DailyReturn'].tolist()
predicted = pred.tolist()
actual_predicted = list(zip(actual, predicted))

actual_predicted = pd.DataFrame.from_records(actual_predicted, columns = ["Actual Return", "Predicted Return"])
actual_predicted.to_csv("Actual_Predicted_3day_5clust_ltc.csv", encoding='utf-8')

#3 day 4 clusters

actual = litecoin_3day_4clusters['DailyReturn'].tolist()
predicted = pred.tolist()
actual_predicted = list(zip(actual, predicted))

actual_predicted = pd.DataFrame.from_records(actual_predicted, columns = ["Actual Return", "Predicted Return"])
actual_predicted.to_csv("Actual_Predicted_3day_4clust_ltc.csv", encoding='utf-8')